1 /// @file htslib/sam.h
2 /// High-level SAM/BAM/CRAM sequence file operations.
3 /*
4     Copyright (C) 2008, 2009, 2013-2021 Genome Research Ltd.
5     Copyright (C) 2010, 2012, 2013 Broad Institute.
6 
7     Author: Heng Li <lh3@sanger.ac.uk>
8 
9 Permission is hereby granted, free of charge, to any person obtaining a copy
10 of this software and associated documentation files (the "Software"), to deal
11 in the Software without restriction, including without limitation the rights
12 to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
13 copies of the Software, and to permit persons to whom the Software is
14 furnished to do so, subject to the following conditions:
15 
16 The above copyright notice and this permission notice shall be included in
17 all copies or substantial portions of the Software.
18 
19 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
20 IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
21 FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
22 THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
23 LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
24 FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
25 DEALINGS IN THE SOFTWARE.  */
26 
27 #ifndef HTSLIB_SAM_H
28 #define HTSLIB_SAM_H
29 
30 #include <errno.h>
31 #include <stdint.h>
32 #include <sys/types.h>
33 #include "hts.h"
34 #include "hts_endian.h"
35 
36 #ifdef __cplusplus
37 extern "C" {
38 #endif
39 
40 /// Highest SAM format version supported by this library
41 #define SAM_FORMAT_VERSION "1.6"
42 
43 /***************************
44  *** SAM/BAM/CRAM header ***
45  ***************************/
46 
47 /*! @typedef
48  * @abstract Header extension structure, grouping a collection
49  *  of hash tables that contain the parsed header data.
50  */
51 
52 typedef struct sam_hrecs_t sam_hrecs_t;
53 
54 /*! @typedef
55  @abstract Structure for the alignment header.
56  @field n_targets   number of reference sequences
57  @field l_text      length of the plain text in the header (may be zero if
58                     the header has been edited)
59  @field target_len  lengths of the reference sequences
60  @field target_name names of the reference sequences
61  @field text        plain text (may be NULL if the header has been edited)
62  @field sdict       header dictionary
63  @field hrecs       pointer to the extended header struct (internal use only)
64  @field ref_count   reference count
65 
66  @note The text and l_text fields are included for backwards compatibility.
67  These fields may be set to NULL and zero respectively as a side-effect
68  of calling some header API functions.  New code that needs to access the
69  header text should use the sam_hdr_str() and sam_hdr_length() functions
70  instead of these fields.
71  */
72 
73 typedef struct sam_hdr_t {
74     int32_t n_targets, ignore_sam_err;
75     size_t l_text;
76     uint32_t *target_len;
77     const int8_t *cigar_tab HTS_DEPRECATED("Use bam_cigar_table[] instead");
78     char **target_name;
79     char *text;
80     void *sdict;
81     sam_hrecs_t *hrecs;
82     uint32_t ref_count;
83 } sam_hdr_t;
84 
85 /*! @typedef
86  * @abstract Old name for compatibility with existing code.
87  */
88 typedef sam_hdr_t bam_hdr_t;
89 
90 /****************************
91  *** CIGAR related macros ***
92  ****************************/
93 
94 #define BAM_CMATCH      0
95 #define BAM_CINS        1
96 #define BAM_CDEL        2
97 #define BAM_CREF_SKIP   3
98 #define BAM_CSOFT_CLIP  4
99 #define BAM_CHARD_CLIP  5
100 #define BAM_CPAD        6
101 #define BAM_CEQUAL      7
102 #define BAM_CDIFF       8
103 #define BAM_CBACK       9
104 
105 #define BAM_CIGAR_STR   "MIDNSHP=XB"
106 #define BAM_CIGAR_SHIFT 4
107 #define BAM_CIGAR_MASK  0xf
108 #define BAM_CIGAR_TYPE  0x3C1A7
109 
110 /*! @abstract Table for converting a CIGAR operator character to BAM_CMATCH etc.
111 Result is operator code or -1. Be sure to cast the index if it is a plain char:
112     int op = bam_cigar_table[(unsigned char) ch];
113 */
114 extern const int8_t bam_cigar_table[256];
115 
116 #define bam_cigar_op(c) ((c)&BAM_CIGAR_MASK)
117 #define bam_cigar_oplen(c) ((c)>>BAM_CIGAR_SHIFT)
118 // Note that BAM_CIGAR_STR is padded to length 16 bytes below so that
119 // the array look-up will not fall off the end.  '?' is chosen as the
120 // padding character so it's easy to spot if one is emitted, and will
121 // result in a parsing failure (in sam_parse1(), at least) if read.
122 #define bam_cigar_opchr(c) (BAM_CIGAR_STR "??????" [bam_cigar_op(c)])
123 #define bam_cigar_gen(l, o) ((l)<<BAM_CIGAR_SHIFT|(o))
124 
125 /* bam_cigar_type returns a bit flag with:
126  *   bit 1 set if the cigar operation consumes the query
127  *   bit 2 set if the cigar operation consumes the reference
128  *
129  * For reference, the unobfuscated truth table for this function is:
130  * BAM_CIGAR_TYPE  QUERY  REFERENCE
131  * --------------------------------
132  * BAM_CMATCH      1      1
133  * BAM_CINS        1      0
134  * BAM_CDEL        0      1
135  * BAM_CREF_SKIP   0      1
136  * BAM_CSOFT_CLIP  1      0
137  * BAM_CHARD_CLIP  0      0
138  * BAM_CPAD        0      0
139  * BAM_CEQUAL      1      1
140  * BAM_CDIFF       1      1
141  * BAM_CBACK       0      0
142  * --------------------------------
143  */
144 #define bam_cigar_type(o) (BAM_CIGAR_TYPE>>((o)<<1)&3) // bit 1: consume query; bit 2: consume reference
145 
146 /*! @abstract the read is paired in sequencing, no matter whether it is mapped in a pair */
147 #define BAM_FPAIRED        1
148 /*! @abstract the read is mapped in a proper pair */
149 #define BAM_FPROPER_PAIR   2
150 /*! @abstract the read itself is unmapped; conflictive with BAM_FPROPER_PAIR */
151 #define BAM_FUNMAP         4
152 /*! @abstract the mate is unmapped */
153 #define BAM_FMUNMAP        8
154 /*! @abstract the read is mapped to the reverse strand */
155 #define BAM_FREVERSE      16
156 /*! @abstract the mate is mapped to the reverse strand */
157 #define BAM_FMREVERSE     32
158 /*! @abstract this is read1 */
159 #define BAM_FREAD1        64
160 /*! @abstract this is read2 */
161 #define BAM_FREAD2       128
162 /*! @abstract not primary alignment */
163 #define BAM_FSECONDARY   256
164 /*! @abstract QC failure */
165 #define BAM_FQCFAIL      512
166 /*! @abstract optical or PCR duplicate */
167 #define BAM_FDUP        1024
168 /*! @abstract supplementary alignment */
169 #define BAM_FSUPPLEMENTARY 2048
170 
171 /*************************
172  *** Alignment records ***
173  *************************/
174 
175 /*
176  * Assumptions made here.  While pos can be 64-bit, no sequence
177  * itself is that long, but due to ref skip CIGAR fields it
178  * may span more than that.  (CIGAR itself is 28-bit len + 4 bit
179  * type, but in theory we can combine multiples together.)
180  *
181  * Mate position and insert size also need to be 64-bit, but
182  * we won't accept more than 32-bit for tid.
183  *
184  * The bam_core_t structure is the *in memory* layout and not
185  * the same as the on-disk format.  64-bit changes here permit
186  * SAM to work with very long chromosomes and permit BAM and CRAM
187  * to seamlessly update in the future without further API/ABI
188  * revisions.
189  */
190 
191 /*! @typedef
192  @abstract Structure for core alignment information.
193  @field  pos     0-based leftmost coordinate
194  @field  tid     chromosome ID, defined by sam_hdr_t
195  @field  bin     bin calculated by bam_reg2bin()
196  @field  qual    mapping quality
197  @field  l_extranul length of extra NULs between qname & cigar (for alignment)
198  @field  flag    bitwise flag
199  @field  l_qname length of the query name
200  @field  n_cigar number of CIGAR operations
201  @field  l_qseq  length of the query sequence (read)
202  @field  mtid    chromosome ID of next read in template, defined by sam_hdr_t
203  @field  mpos    0-based leftmost coordinate of next read in template
204  @field  isize   observed template length ("insert size")
205  */
206 typedef struct bam1_core_t {
207     hts_pos_t pos;
208     int32_t tid;
209     uint16_t bin; // NB: invalid on 64-bit pos
210     uint8_t qual;
211     uint8_t l_extranul;
212     uint16_t flag;
213     uint16_t l_qname;
214     uint32_t n_cigar;
215     int32_t l_qseq;
216     int32_t mtid;
217     hts_pos_t mpos;
218     hts_pos_t isize;
219 } bam1_core_t;
220 
221 /*! @typedef
222  @abstract Structure for one alignment.
223  @field  core       core information about the alignment
224  @field  id
225  @field  data       all variable-length data, concatenated; structure: qname-cigar-seq-qual-aux
226  @field  l_data     current length of bam1_t::data
227  @field  m_data     maximum length of bam1_t::data
228  @field  mempolicy  memory handling policy, see bam_set_mempolicy()
229 
230  @discussion Notes:
231 
232  1. The data blob should be accessed using bam_get_qname, bam_get_cigar,
233     bam_get_seq, bam_get_qual and bam_get_aux macros.  These returns pointers
234     to the start of each type of data.
235  2. qname is terminated by one to four NULs, so that the following
236     cigar data is 32-bit aligned; core.l_qname includes these trailing NULs,
237     while core.l_extranul counts the excess NULs (so 0 <= l_extranul <= 3).
238  3. Cigar data is encoded 4 bytes per CIGAR operation.
239     See the bam_cigar_* macros for manipulation.
240  4. seq is nibble-encoded according to bam_nt16_table.
241     See the bam_seqi macro for retrieving individual bases.
242  5. Per base qualities are stored in the Phred scale with no +33 offset.
243     Ie as per the BAM specification and not the SAM ASCII printable method.
244  */
245 typedef struct bam1_t {
246     bam1_core_t core;
247     uint64_t id;
248     uint8_t *data;
249     int l_data;
250     uint32_t m_data;
251     uint32_t mempolicy:2, :30 /* Reserved */;
252 } bam1_t;
253 
254 /*! @function
255  @abstract  Get whether the query is on the reverse strand
256  @param  b  pointer to an alignment
257  @return    boolean true if query is on the reverse strand
258  */
259 #define bam_is_rev(b) (((b)->core.flag&BAM_FREVERSE) != 0)
260 /*! @function
261  @abstract  Get whether the query's mate is on the reverse strand
262  @param  b  pointer to an alignment
263  @return    boolean true if query's mate on the reverse strand
264  */
265 #define bam_is_mrev(b) (((b)->core.flag&BAM_FMREVERSE) != 0)
266 /*! @function
267  @abstract  Get the name of the query
268  @param  b  pointer to an alignment
269  @return    pointer to the name string, null terminated
270  */
271 #define bam_get_qname(b) ((char*)(b)->data)
272 /*! @function
273  @abstract  Get the CIGAR array
274  @param  b  pointer to an alignment
275  @return    pointer to the CIGAR array
276 
277  @discussion In the CIGAR array, each element is a 32-bit integer. The
278  lower 4 bits gives a CIGAR operation and the higher 28 bits keep the
279  length of a CIGAR.
280  */
281 #define bam_get_cigar(b) ((uint32_t*)((b)->data + (b)->core.l_qname))
282 /*! @function
283  @abstract  Get query sequence
284  @param  b  pointer to an alignment
285  @return    pointer to sequence
286 
287  @discussion Each base is encoded in 4 bits: 1 for A, 2 for C, 4 for G,
288  8 for T and 15 for N. Two bases are packed in one byte with the base
289  at the higher 4 bits having smaller coordinate on the read. It is
290  recommended to use bam_seqi() macro to get the base.
291  */
292 #define bam_get_seq(b)   ((b)->data + ((b)->core.n_cigar<<2) + (b)->core.l_qname)
293 /*! @function
294  @abstract  Get query quality
295  @param  b  pointer to an alignment
296  @return    pointer to quality string
297  */
298 #define bam_get_qual(b)  ((b)->data + ((b)->core.n_cigar<<2) + (b)->core.l_qname + (((b)->core.l_qseq + 1)>>1))
299 /*! @function
300  @abstract  Get auxiliary data
301  @param  b  pointer to an alignment
302  @return    pointer to the concatenated auxiliary data
303  */
304 #define bam_get_aux(b)   ((b)->data + ((b)->core.n_cigar<<2) + (b)->core.l_qname + (((b)->core.l_qseq + 1)>>1) + (b)->core.l_qseq)
305 /*! @function
306  @abstract  Get length of auxiliary data
307  @param  b  pointer to an alignment
308  @return    length of the concatenated auxiliary data
309  */
310 #define bam_get_l_aux(b) ((b)->l_data - ((b)->core.n_cigar<<2) - (b)->core.l_qname - (b)->core.l_qseq - (((b)->core.l_qseq + 1)>>1))
311 /*! @function
312  @abstract  Get a base on read
313  @param  s  Query sequence returned by bam_get_seq()
314  @param  i  The i-th position, 0-based
315  @return    4-bit integer representing the base.
316  */
317 #define bam_seqi(s, i) ((s)[(i)>>1] >> ((~(i)&1)<<2) & 0xf)
318 /*!
319  @abstract  Modifies a single base in the bam structure.
320  @param s   Query sequence returned by bam_get_seq()
321  @param i   The i-th position, 0-based
322  @param b   Base in nt16 nomenclature (see seq_nt16_table)
323 */
324 #define bam_set_seqi(s,i,b) ((s)[(i)>>1] = ((s)[(i)>>1] & (0xf0 >> ((~(i)&1)<<2))) | ((b)<<((~(i)&1)<<2)))
325 
326 /**************************
327  *** Exported functions ***
328  **************************/
329 
330 /***************
331  *** BAM I/O ***
332  ***************/
333 
334 /* Header */
335 
336 /// Generates a new unpopulated header structure.
337 /*!
338  *
339  * @return  A valid pointer to new header on success, NULL on failure
340  *
341  * The sam_hdr_t struct returned by a successful call should be freed
342  * via sam_hdr_destroy() when it is no longer needed.
343  */
344 HTSLIB_EXPORT
345 sam_hdr_t *sam_hdr_init(void);
346 
347 /// Read the header from a BAM compressed file.
348 /*!
349  * @param fp  File pointer
350  * @return    A valid pointer to new header on success, NULL on failure
351  *
352  * This function only works with BAM files.  It is usually better to use
353  * sam_hdr_read(), which works on SAM, BAM and CRAM files.
354  *
355  * The sam_hdr_t struct returned by a successful call should be freed
356  * via sam_hdr_destroy() when it is no longer needed.
357  */
358 HTSLIB_EXPORT
359 sam_hdr_t *bam_hdr_read(BGZF *fp);
360 
361 /// Writes the header to a BAM file.
362 /*!
363  * @param fp  File pointer
364  * @param h   Header pointer
365  * @return    0 on success, -1 on failure
366  *
367  * This function only works with BAM files.  Use sam_hdr_write() to
368  * write in any of the SAM, BAM or CRAM formats.
369  */
370 HTSLIB_EXPORT
371 int bam_hdr_write(BGZF *fp, const sam_hdr_t *h) HTS_RESULT_USED;
372 
373 /*!
374  * Frees the resources associated with a header.
375  */
376 HTSLIB_EXPORT
377 void sam_hdr_destroy(sam_hdr_t *h);
378 
379 /// Duplicate a header structure.
380 /*!
381  * @return  A valid pointer to new header on success, NULL on failure
382  *
383  * The sam_hdr_t struct returned by a successful call should be freed
384  * via sam_hdr_destroy() when it is no longer needed.
385  */
386 HTSLIB_EXPORT
387 sam_hdr_t *sam_hdr_dup(const sam_hdr_t *h0);
388 
389 /*!
390  * @abstract Old names for compatibility with existing code.
391  */
bam_hdr_init(void)392 static inline sam_hdr_t *bam_hdr_init(void) { return sam_hdr_init(); }
bam_hdr_destroy(sam_hdr_t * h)393 static inline void bam_hdr_destroy(sam_hdr_t *h) { sam_hdr_destroy(h); }
bam_hdr_dup(const sam_hdr_t * h0)394 static inline sam_hdr_t *bam_hdr_dup(const sam_hdr_t *h0) { return sam_hdr_dup(h0); }
395 
396 typedef htsFile samFile;
397 
398 /// Create a header from existing text.
399 /*!
400  * @param l_text    Length of text
401  * @param text      Header text
402  * @return A populated sam_hdr_t structure on success; NULL on failure.
403  * @note The text field of the returned header will be NULL, and the l_text
404  * field will be zero.
405  *
406  * The sam_hdr_t struct returned by a successful call should be freed
407  * via sam_hdr_destroy() when it is no longer needed.
408  */
409 HTSLIB_EXPORT
410 sam_hdr_t *sam_hdr_parse(size_t l_text, const char *text);
411 
412 /// Read a header from a SAM, BAM or CRAM file.
413 /*!
414  * @param fp    Pointer to a SAM, BAM or CRAM file handle
415  * @return  A populated sam_hdr_t struct on success; NULL on failure.
416  *
417  * The sam_hdr_t struct returned by a successful call should be freed
418  * via sam_hdr_destroy() when it is no longer needed.
419  */
420 HTSLIB_EXPORT
421 sam_hdr_t *sam_hdr_read(samFile *fp);
422 
423 /// Write a header to a SAM, BAM or CRAM file.
424 /*!
425  * @param fp    SAM, BAM or CRAM file header
426  * @param h     Header structure to write
427  * @return  0 on success; -1 on failure
428  */
429 HTSLIB_EXPORT
430 int sam_hdr_write(samFile *fp, const sam_hdr_t *h) HTS_RESULT_USED;
431 
432 /// Returns the current length of the header text.
433 /*!
434  * @return  >= 0 on success, SIZE_MAX on failure
435  */
436 HTSLIB_EXPORT
437 size_t sam_hdr_length(sam_hdr_t *h);
438 
439 /// Returns the text representation of the header.
440 /*!
441  * @return  valid char pointer on success, NULL on failure
442  *
443  * The returned string is part of the header structure.  It will remain
444  * valid until a call to a header API function causes the string to be
445  * invalidated, or the header is destroyed.
446  *
447  * The caller should not attempt to free or realloc this pointer.
448  */
449 HTSLIB_EXPORT
450 const char *sam_hdr_str(sam_hdr_t *h);
451 
452 /// Returns the number of references in the header.
453 /*!
454  * @return  >= 0 on success, -1 on failure
455  */
456 HTSLIB_EXPORT
457 int sam_hdr_nref(const sam_hdr_t *h);
458 
459 /* ==== Line level methods ==== */
460 
461 /// Add formatted lines to an existing header.
462 /*!
463  * @param lines  Full SAM header record, eg "@SQ\tSN:foo\tLN:100", with
464  *               optional new-line. If it contains more than 1 line then
465  *               multiple lines will be added in order
466  * @param len    The maximum length of lines (if an early NUL is not
467  *               encountered). len may be 0 if unknown, in which case
468  *               lines must be NUL-terminated
469  * @return       0 on success, -1 on failure
470  *
471  * The lines will be appended to the end of the existing header
472  * (apart from HD, which always comes first).
473  */
474 HTSLIB_EXPORT
475 int sam_hdr_add_lines(sam_hdr_t *h, const char *lines, size_t len);
476 
477 /// Adds a single line to an existing header.
478 /*!
479  * Specify type and one or more key,value pairs, ending with the NULL key.
480  * Eg. sam_hdr_add_line(h, "SQ", "ID", "foo", "LN", "100", NULL).
481  *
482  * @param type  Type of the added line. Eg. "SQ"
483  * @return      0 on success, -1 on failure
484  *
485  * The new line will be added immediately after any others of the same
486  * type, or at the end of the existing header if no lines of the
487  * given type currently exist.  The exception is HD lines, which always
488  * come first.  If an HD line already exists, it will be replaced.
489  */
490 HTSLIB_EXPORT
491 int sam_hdr_add_line(sam_hdr_t *h, const char *type, ...);
492 
493 /// Returns a complete line of formatted text for a given type and ID.
494 /*!
495  * @param type      Type of the searched line. Eg. "SQ"
496  * @param ID_key    Tag key defining the line. Eg. "SN"
497  * @param ID_value  Tag value associated with the key above. Eg. "ref1"
498  * @param ks        kstring to hold the result
499  * @return          0 on success;
500  *                 -1 if no matching line is found
501  *                 -2 on other failures
502  *
503  * Puts a complete line of formatted text for a specific header type/ID
504  * combination into @p ks. If ID_key is NULL then it returns the first line of
505  * the specified type.
506  *
507  * Any existing content in @p ks will be overwritten.
508  */
509 HTSLIB_EXPORT
510 int sam_hdr_find_line_id(sam_hdr_t *h, const char *type,
511                       const char *ID_key, const char *ID_val, kstring_t *ks);
512 
513 /// Returns a complete line of formatted text for a given type and index.
514 /*!
515  * @param type      Type of the searched line. Eg. "SQ"
516  * @param position  Index in lines of this type (zero-based)
517  * @param ks        kstring to hold the result
518  * @return          0 on success;
519  *                 -1 if no matching line is found
520  *                 -2 on other failures
521  *
522  * Puts a complete line of formatted text for a specific line into @p ks.
523  * The header line is selected using the @p type and @p position parameters.
524  *
525  * Any existing content in @p ks will be overwritten.
526  */
527 HTSLIB_EXPORT
528 int sam_hdr_find_line_pos(sam_hdr_t *h, const char *type,
529                           int pos, kstring_t *ks);
530 
531 /// Remove a line with given type / id from a header
532 /*!
533  * @param type      Type of the searched line. Eg. "SQ"
534  * @param ID_key    Tag key defining the line. Eg. "SN"
535  * @param ID_value  Tag value associated with the key above. Eg. "ref1"
536  * @return          0 on success, -1 on error
537  *
538  * Remove a line from the header by specifying a tag:value that uniquely
539  * identifies the line, i.e. the @SQ line containing "SN:ref1".
540  *
541  * \@SQ line is uniquely identified by the SN tag.
542  * \@RG line is uniquely identified by the ID tag.
543  * \@PG line is uniquely identified by the ID tag.
544  * Eg. sam_hdr_remove_line_id(h, "SQ", "SN", "ref1")
545  *
546  * If no key:value pair is specified, the type MUST be followed by a NULL argument and
547  * the first line of the type will be removed, if any.
548  * Eg. sam_hdr_remove_line_id(h, "SQ", NULL, NULL)
549  *
550  * @note Removing \@PG lines is currently unsupported.
551  */
552 HTSLIB_EXPORT
553 int sam_hdr_remove_line_id(sam_hdr_t *h, const char *type, const char *ID_key, const char *ID_value);
554 
555 /// Remove nth line of a given type from a header
556 /*!
557  * @param type     Type of the searched line. Eg. "SQ"
558  * @param position Index in lines of this type (zero-based). E.g. 3
559  * @return         0 on success, -1 on error
560  *
561  * Remove a line from the header by specifying the position in the type
562  * group, i.e. 3rd @SQ line.
563  */
564 HTSLIB_EXPORT
565 int sam_hdr_remove_line_pos(sam_hdr_t *h, const char *type, int position);
566 
567 /// Add or update tag key,value pairs in a header line.
568 /*!
569  * @param type      Type of the searched line. Eg. "SQ"
570  * @param ID_key    Tag key defining the line. Eg. "SN"
571  * @param ID_value  Tag value associated with the key above. Eg. "ref1"
572  * @return          0 on success, -1 on error
573  *
574  * Adds or updates tag key,value pairs in a header line.
575  * Eg. for adding M5 tags to @SQ lines or updating sort order for the
576  * @HD line.
577  *
578  * Specify multiple key,value pairs ending in NULL. Eg.
579  * sam_hdr_update_line(h, "RG", "ID", "rg1", "DS", "description", "PG", "samtools", NULL)
580  *
581  * Attempting to update the record name (i.e. @SQ SN or @RG ID) will
582  * work as long as the new name is not already in use, however doing this
583  * on a file opened for reading may produce unexpected results.
584  *
585  * Renaming an @RG record in this way will only change the header.  Alignment
586  * records written later will not be updated automatically even if they
587  * reference the old read group name.
588  *
589  * Attempting to change an @PG ID tag is not permitted.
590  */
591 HTSLIB_EXPORT
592 int sam_hdr_update_line(sam_hdr_t *h, const char *type,
593         const char *ID_key, const char *ID_value, ...);
594 
595 /// Remove all lines of a given type from a header, except the one matching an ID
596 /*!
597  * @param type      Type of the searched line. Eg. "SQ"
598  * @param ID_key    Tag key defining the line. Eg. "SN"
599  * @param ID_value  Tag value associated with the key above. Eg. "ref1"
600  * @return          0 on success, -1 on failure
601  *
602  * Remove all lines of type <type> from the header, except the one
603  * specified by tag:value, i.e. the @SQ line containing "SN:ref1".
604  *
605  * If no line matches the key:value ID, all lines of the given type are removed.
606  * To remove all lines of a given type, use NULL for both ID_key and ID_value.
607  */
608 HTSLIB_EXPORT
609 int sam_hdr_remove_except(sam_hdr_t *h, const char *type, const char *ID_key, const char *ID_value);
610 
611 /// Remove header lines of a given type, except those in a given ID set
612 /*!
613  * @param type  Type of the searched line. Eg. "RG"
614  * @param id    Tag key defining the line. Eg. "ID"
615  * @param rh    Hash set initialised by the caller with the values to be kept.
616  *              See description for how to create this. If @p rh is NULL, all
617  *              lines of this type will be removed.
618  * @return      0 on success, -1 on failure
619  *
620  * Remove all lines of type @p type from the header, except the ones
621  * specified in the hash set @p rh. If @p rh is NULL, all lines of
622  * this type will be removed.
623  * Declaration of @p rh is done using KHASH_SET_INIT_STR macro. Eg.
624  * @code{.c}
625  *              #include "htslib/khash.h"
626  *              KHASH_SET_INIT_STR(keep)
627  *              typedef khash_t(keep) *keephash_t;
628  *
629  *              void your_method() {
630  *                  samFile *sf = sam_open("alignment.bam", "r");
631  *                  sam_hdr_t *h = sam_hdr_read(sf);
632  *                  keephash_t rh = kh_init(keep);
633  *                  int ret = 0;
634  *                  kh_put(keep, rh, strdup("chr2"), &ret);
635  *                  kh_put(keep, rh, strdup("chr3"), &ret);
636  *                  if (sam_hdr_remove_lines(h, "SQ", "SN", rh) == -1)
637  *                      fprintf(stderr, "Error removing lines\n");
638  *                  khint_t k;
639  *                  for (k = 0; k < kh_end(rh); ++k)
640  *                     if (kh_exist(rh, k)) free((char*)kh_key(rh, k));
641  *                  kh_destroy(keep, rh);
642  *                  sam_hdr_destroy(h);
643  *                  sam_close(sf);
644  *              }
645  * @endcode
646  *
647  */
648 HTSLIB_EXPORT
649 int sam_hdr_remove_lines(sam_hdr_t *h, const char *type, const char *id, void *rh);
650 
651 /// Count the number of lines for a given header type
652 /*!
653  * @param h     BAM header
654  * @param type  Header type to count. Eg. "RG"
655  * @return  Number of lines of this type on success; -1 on failure
656  */
657 HTSLIB_EXPORT
658 int sam_hdr_count_lines(sam_hdr_t *h, const char *type);
659 
660 /// Index of the line for the types that have dedicated look-up tables (SQ, RG, PG)
661 /*!
662  * @param h     BAM header
663  * @param type  Type of the searched line. Eg. "RG"
664  * @param key   The value of the identifying key. Eg. "rg1"
665  * @return  0-based index on success; -1 if line does not exist; -2 on failure
666  */
667 HTSLIB_EXPORT
668 int sam_hdr_line_index(sam_hdr_t *bh, const char *type, const char *key);
669 
670 /// Id key of the line for the types that have dedicated look-up tables (SQ, RG, PG)
671 /*!
672  * @param h     BAM header
673  * @param type  Type of the searched line. Eg. "RG"
674  * @param pos   Zero-based index inside the type group. Eg. 2 (for the third RG line)
675  * @return  Valid key string on success; NULL on failure
676  */
677 HTSLIB_EXPORT
678 const char *sam_hdr_line_name(sam_hdr_t *bh, const char *type, int pos);
679 
680 /* ==== Key:val level methods ==== */
681 
682 /// Return the value associated with a key for a header line identified by ID_key:ID_val
683 /*!
684  * @param type      Type of the line to which the tag belongs. Eg. "SQ"
685  * @param ID_key    Tag key defining the line. Eg. "SN". Can be NULL, if looking for the first line.
686  * @param ID_value  Tag value associated with the key above. Eg. "ref1". Can be NULL, if ID_key is NULL.
687  * @param key       Key of the searched tag. Eg. "LN"
688  * @param ks        kstring where the value will be written
689  * @return          0 on success
690  *                 -1 if the requested tag does not exist
691  *                 -2 on other errors
692  *
693  * Looks for a specific key in a single SAM header line and writes the
694  * associated value into @p ks.  The header line is selected using the ID_key
695  * and ID_value parameters.  Any pre-existing content in @p ks will be
696  * overwritten.
697  */
698 HTSLIB_EXPORT
699 int sam_hdr_find_tag_id(sam_hdr_t *h, const char *type, const char *ID_key, const char *ID_value, const char *key, kstring_t *ks);
700 
701 /// Return the value associated with a key for a header line identified by position
702 /*!
703  * @param type      Type of the line to which the tag belongs. Eg. "SQ"
704  * @param position  Index in lines of this type (zero-based). E.g. 3
705  * @param key       Key of the searched tag. Eg. "LN"
706  * @param ks        kstring where the value will be written
707  * @return          0 on success
708  *                 -1 if the requested tag does not exist
709  *                 -2 on other errors
710  *
711  * Looks for a specific key in a single SAM header line and writes the
712  * associated value into @p ks.  The header line is selected using the @p type
713  * and @p position parameters.  Any pre-existing content in @p ks will be
714  * overwritten.
715  */
716 HTSLIB_EXPORT
717 int sam_hdr_find_tag_pos(sam_hdr_t *h, const char *type, int pos, const char *key, kstring_t *ks);
718 
719 /// Remove the key from the line identified by type, ID_key and ID_value.
720 /*!
721  * @param type      Type of the line to which the tag belongs. Eg. "SQ"
722  * @param ID_key    Tag key defining the line. Eg. "SN"
723  * @param ID_value  Tag value associated with the key above. Eg. "ref1"
724  * @param key       Key of the targeted tag. Eg. "M5"
725  * @return          1 if the key was removed; 0 if it was not present; -1 on error
726  */
727 HTSLIB_EXPORT
728 int sam_hdr_remove_tag_id(sam_hdr_t *h, const char *type, const char *ID_key, const char *ID_value, const char *key);
729 
730 /// Get the target id for a given reference sequence name
731 /*!
732  * @param ref  Reference name
733  * @return     Positive value on success,
734  *             -1 if unknown reference,
735  *             -2 if the header could not be parsed
736  *
737  * Looks up a reference sequence by name in the reference hash table
738  * and returns the numerical target id.
739  */
740 HTSLIB_EXPORT
741 int sam_hdr_name2tid(sam_hdr_t *h, const char *ref);
742 
743 /// Get the reference sequence name from a target index
744 /*!
745  * @param tid  Target index
746  * @return     Valid reference name on success, NULL on failure
747  *
748  * Fetch the reference sequence name from the target name array,
749  * using the numerical target id.
750  */
751 HTSLIB_EXPORT
752 const char *sam_hdr_tid2name(const sam_hdr_t *h, int tid);
753 
754 /// Get the reference sequence length from a target index
755 /*!
756  * @param tid  Target index
757  * @return     Strictly positive value on success, 0 on failure
758  *
759  * Fetch the reference sequence length from the target length array,
760  * using the numerical target id.
761  */
762 HTSLIB_EXPORT
763 hts_pos_t sam_hdr_tid2len(const sam_hdr_t *h, int tid);
764 
765 /// Alias of sam_hdr_name2tid(), for backwards compatibility.
766 /*!
767  * @param ref  Reference name
768  * @return     Positive value on success,
769  *             -1 if unknown reference,
770  *             -2 if the header could not be parsed
771  */
bam_name2id(sam_hdr_t * h,const char * ref)772 static inline int bam_name2id(sam_hdr_t *h, const char *ref) { return sam_hdr_name2tid(h, ref); }
773 
774 /// Generate a unique \@PG ID: value
775 /*!
776  * @param name  Name of the program. Eg. samtools
777  * @return      Valid ID on success, NULL on failure
778  *
779  * Returns a unique ID from a base name.  The string returned will remain
780  * valid until the next call to this function, or the header is destroyed.
781  * The caller should not attempt to free() or realloc() it.
782  */
783 HTSLIB_EXPORT
784 const char *sam_hdr_pg_id(sam_hdr_t *h, const char *name);
785 
786 /// Add an \@PG line.
787 /*!
788  * @param name  Name of the program. Eg. samtools
789  * @return      0 on success, -1 on failure
790  *
791  * If we wish complete control over this use sam_hdr_add_line() directly. This
792  * function uses that, but attempts to do a lot of tedious house work for
793  * you too.
794  *
795  * - It will generate a suitable ID if the supplied one clashes.
796  * - It will generate multiple \@PG records if we have multiple PG chains.
797  *
798  * Call it as per sam_hdr_add_line() with a series of key,value pairs ending
799  * in NULL.
800  */
801 HTSLIB_EXPORT
802 int sam_hdr_add_pg(sam_hdr_t *h, const char *name, ...);
803 
804 /*!
805  * A function to help with construction of CL tags in @PG records.
806  * Takes an argc, argv pair and returns a single space-separated string.
807  * This string should be deallocated by the calling function.
808  *
809  * @return
810  * Returns malloced char * on success;
811  *         NULL on failure
812  */
813 HTSLIB_EXPORT
814 char *stringify_argv(int argc, char *argv[]);
815 
816 /// Increments the reference count on a header
817 /*!
818  * This permits multiple files to share the same header, all calling
819  * sam_hdr_destroy when done, without causing errors for other open files.
820  */
821 HTSLIB_EXPORT
822 void sam_hdr_incr_ref(sam_hdr_t *h);
823 
824 /*
825  * Macros for changing the \@HD line. They eliminate the need to use NULL method arguments.
826  */
827 
828 /// Returns the SAM formatted text of the \@HD header line
829 #define sam_hdr_find_hd(h, ks) sam_hdr_find_line_id((h), "HD", NULL, NULL, (ks))
830 /// Returns the value associated with a given \@HD line tag
831 #define sam_hdr_find_tag_hd(h, key, ks) sam_hdr_find_tag_id((h), "HD", NULL, NULL, (key), (ks))
832 /// Adds or updates tags on the header \@HD line
833 #define sam_hdr_update_hd(h, ...) sam_hdr_update_line((h), "HD", NULL, NULL, __VA_ARGS__, NULL)
834 /// Removes the \@HD line tag with the given key
835 #define sam_hdr_remove_tag_hd(h, key) sam_hdr_remove_tag_id((h), "HD", NULL, NULL, (key))
836 
837 /* Alignment */
838 
839 /// Create a new bam1_t alignment structure
840 /**
841    @return An empty bam1_t structure on success, NULL on failure
842 
843    The bam1_t struct returned by a successful call should be freed
844    via bam_destroy1() when it is no longer needed.
845  */
846 HTSLIB_EXPORT
847 bam1_t *bam_init1(void);
848 
849 /// Destroy a bam1_t structure
850 /**
851    @param b  structure to destroy
852 
853    Does nothing if @p b is NULL.  If not, all memory associated with @p b
854    will be freed, along with the structure itself.  @p b should not be
855    accessed after calling this function.
856  */
857 HTSLIB_EXPORT
858 void bam_destroy1(bam1_t *b);
859 
860 #define BAM_USER_OWNS_STRUCT 1
861 #define BAM_USER_OWNS_DATA   2
862 
863 /// Set alignment record memory policy
864 /**
865    @param b       Alignment record
866    @param policy  Desired policy
867 
868    Allows the way HTSlib reallocates and frees bam1_t data to be
869    changed.  @policy can be set to the bitwise-or of the following
870    values:
871 
872    \li \c BAM_USER_OWNS_STRUCT
873    If this is set then bam_destroy1() will not try to free the bam1_t struct.
874 
875    \li \c BAM_USER_OWNS_DATA
876    If this is set, bam_destroy1() will not free the bam1_t::data pointer.
877    Also, functions which need to expand bam1_t::data memory will change
878    behaviour.  Instead of calling realloc() on the pointer, they will
879    allocate a new data buffer and copy any existing content in to it.
880    The existing memory will \b not be freed.  bam1_t::data will be
881    set to point to the new memory and the BAM_USER_OWNS_DATA flag will be
882    cleared.
883 
884    BAM_USER_OWNS_STRUCT allows bam_destroy1() to be called on bam1_t
885    structures that are members of an array.
886 
887    BAM_USER_OWNS_DATA can be used by applications that want more control
888    over where the variable-length parts of the bam record will be stored.
889    By preventing calls to free() and realloc(), it allows bam1_t::data
890    to hold pointers to memory that cannot be passed to those functions.
891 
892    Example:  Read a block of alignment records, storing the variable-length
893    data in a single buffer and the records in an array.  Stop when either
894    the array or the buffer is full.
895 
896    \code{.c}
897    #define MAX_RECS 1000
898    #define REC_LENGTH 400  // Average length estimate, to get buffer size
899    size_t bufsz = MAX_RECS * REC_LENGTH, nrecs, buff_used = 0;
900    bam1_t *recs = calloc(MAX_RECS, sizeof(bam1_t));
901    uint8_t *buffer = malloc(bufsz);
902    int res = 0, result = EXIT_FAILURE;
903    uint32_t new_m_data;
904 
905    if (!recs || !buffer) goto cleanup;
906    for (nrecs = 0; nrecs < MAX_RECS; nrecs++) {
907       bam_set_mempolicy(&recs[nrecs], BAM_USER_OWNS_STRUCT|BAM_USER_OWNS_DATA);
908 
909       // Set data pointer to unused part of buffer
910       recs[nrecs].data = &buffer[buff_used];
911 
912       // Set m_data to size of unused part of buffer.  On 64-bit platforms it
913       // will be necessary to limit this to UINT32_MAX due to the size of
914       // bam1_t::m_data (not done here as our buffer is only 400K).
915       recs[nrecs].m_data = bufsz - buff_used;
916 
917       // Read the record
918       res = sam_read1(file_handle, header, &recs[nrecs]);
919       if (res <= 0) break; // EOF or error
920 
921       // Check if the record data didn't fit - if not, stop reading
922       if ((bam_get_mempolicy(&recs[nrecs]) & BAM_USER_OWNS_DATA) == 0) {
923          nrecs++; // Include last record in count
924          break;
925       }
926 
927       // Adjust m_data to the space actually used.  If space is available,
928       // round up to eight bytes so the next record aligns nicely.
929       new_m_data = ((uint32_t) recs[nrecs].l_data + 7) & (~7U);
930       if (new_m_data < recs[nrecs].m_data) recs[nrecs].m_data = new_m_data;
931 
932       buff_used += recs[nrecs].m_data;
933    }
934    if (res < 0) goto cleanup;
935    result = EXIT_SUCCESS;
936 
937    // ... use data ...
938 
939  cleanup:
940    for (size_t i = 0; i < nrecs; i++)
941      bam_destroy1(i);
942    free(buffer);
943    free(recs);
944 
945    \endcode
946 */
bam_set_mempolicy(bam1_t * b,uint32_t policy)947 static inline void bam_set_mempolicy(bam1_t *b, uint32_t policy) {
948     b->mempolicy = policy;
949 }
950 
951 /// Get alignment record memory policy
952 /** @param b    Alignment record
953 
954     See bam_set_mempolicy()
955  */
bam_get_mempolicy(bam1_t * b)956 static inline uint32_t bam_get_mempolicy(bam1_t *b) {
957     return b->mempolicy;
958 }
959 
960 /// Read a BAM format alignment record
961 /**
962    @param fp   BGZF file being read
963    @param b    Destination for the alignment data
964    @return number of bytes read on success
965            -1 at end of file
966            < -1 on failure
967 
968    This function can only read BAM format files.  Most code should use
969    sam_read1() instead, which can be used with BAM, SAM and CRAM formats.
970 */
971 HTSLIB_EXPORT
972 int bam_read1(BGZF *fp, bam1_t *b) HTS_RESULT_USED;
973 
974 /// Write a BAM format alignment record
975 /**
976    @param fp  BGZF file being written
977    @param b   Alignment record to write
978    @return number of bytes written on success
979            -1 on error
980 
981    This function can only write BAM format files.  Most code should use
982    sam_write1() instead, which can be used with BAM, SAM and CRAM formats.
983 */
984 HTSLIB_EXPORT
985 int bam_write1(BGZF *fp, const bam1_t *b) HTS_RESULT_USED;
986 
987 /// Copy alignment record data
988 /**
989    @param bdst  Destination alignment record
990    @param bsrc  Source alignment record
991    @return bdst on success; NULL on failure
992  */
993 HTSLIB_EXPORT
994 bam1_t *bam_copy1(bam1_t *bdst, const bam1_t *bsrc) HTS_RESULT_USED;
995 
996 /// Create a duplicate alignment record
997 /**
998    @param bsrc  Source alignment record
999    @return Pointer to a new alignment record on success; NULL on failure
1000 
1001    The bam1_t struct returned by a successful call should be freed
1002    via bam_destroy1() when it is no longer needed.
1003  */
1004 HTSLIB_EXPORT
1005 bam1_t *bam_dup1(const bam1_t *bsrc);
1006 
1007 /// Sets all components of an alignment structure
1008 /**
1009    @param bam      Target alignment structure. Must be initialized by a call to bam_init1().
1010                    The data field will be reallocated automatically as needed.
1011    @param l_qname  Length of the query name. If set to 0, the placeholder query name "*" will be used.
1012    @param qname    Query name, may be NULL if l_qname = 0
1013    @param flag     Bitwise flag, a combination of the BAM_F* constants.
1014    @param tid      Chromosome ID, defined by sam_hdr_t (a.k.a. RNAME).
1015    @param pos      0-based leftmost coordinate.
1016    @param mapq     Mapping quality.
1017    @param n_cigar  Number of CIGAR operations.
1018    @param cigar    CIGAR data, may be NULL if n_cigar = 0.
1019    @param mtid     Chromosome ID of next read in template, defined by sam_hdr_t (a.k.a. RNEXT).
1020    @param mpos     0-based leftmost coordinate of next read in template (a.k.a. PNEXT).
1021    @param isize    Observed template length ("insert size") (a.k.a. TLEN).
1022    @param l_seq    Length of the query sequence (read) and sequence quality string.
1023    @param seq      Sequence, may be NULL if l_seq = 0.
1024    @param qual     Sequence quality, may be NULL.
1025    @param l_aux    Length to be reserved for auxiliary field data, may be 0.
1026 
1027    @return >= 0 on success (number of bytes written to bam->data), negative (with errno set) on failure.
1028 */
1029 HTSLIB_EXPORT
1030 int bam_set1(bam1_t *bam,
1031              size_t l_qname, const char *qname,
1032              uint16_t flag, int32_t tid, hts_pos_t pos, uint8_t mapq,
1033              size_t n_cigar, const uint32_t *cigar,
1034              int32_t mtid, hts_pos_t mpos, hts_pos_t isize,
1035              size_t l_seq, const char *seq, const char *qual,
1036              size_t l_aux);
1037 
1038 /// Calculate query length from CIGAR data
1039 /**
1040    @param n_cigar   Number of items in @p cigar
1041    @param cigar     CIGAR data
1042    @return Query length
1043 
1044    CIGAR data is stored as in the BAM format, i.e. (op_len << 4) | op
1045    where op_len is the length in bases and op is a value between 0 and 8
1046    representing one of the operations "MIDNSHP=X" (M = 0; X = 8)
1047 
1048    This function returns the sum of the lengths of the M, I, S, = and X
1049    operations in @p cigar (these are the operations that "consume" query
1050    bases).  All other operations (including invalid ones) are ignored.
1051 
1052    @note This return type of this function is hts_pos_t so that it can
1053    correctly return the length of CIGAR sequences including many long
1054    operations without overflow. However, other restrictions (notably the sizes
1055    of bam1_core_t::l_qseq and bam1_t::data) limit the maximum query sequence
1056    length supported by HTSlib to fewer than INT_MAX bases.
1057  */
1058 HTSLIB_EXPORT
1059 hts_pos_t bam_cigar2qlen(int n_cigar, const uint32_t *cigar);
1060 
1061 /// Calculate reference length from CIGAR data
1062 /**
1063    @param n_cigar   Number of items in @p cigar
1064    @param cigar     CIGAR data
1065    @return Reference length
1066 
1067    CIGAR data is stored as in the BAM format, i.e. (op_len << 4) | op
1068    where op_len is the length in bases and op is a value between 0 and 8
1069    representing one of the operations "MIDNSHP=X" (M = 0; X = 8)
1070 
1071    This function returns the sum of the lengths of the M, D, N, = and X
1072    operations in @p cigar (these are the operations that "consume" reference
1073    bases).  All other operations (including invalid ones) are ignored.
1074  */
1075 HTSLIB_EXPORT
1076 hts_pos_t bam_cigar2rlen(int n_cigar, const uint32_t *cigar);
1077 
1078 /*!
1079       @abstract Calculate the rightmost base position of an alignment on the
1080       reference genome.
1081 
1082       @param  b  pointer to an alignment
1083       @return    the coordinate of the first base after the alignment, 0-based
1084 
1085       @discussion For a mapped read, this is just b->core.pos + bam_cigar2rlen.
1086       For an unmapped read (either according to its flags or if it has no cigar
1087       string) or a read whose cigar string consumes no reference bases at all,
1088       we return b->core.pos + 1 by convention.
1089  */
1090 HTSLIB_EXPORT
1091 hts_pos_t bam_endpos(const bam1_t *b);
1092 
1093 HTSLIB_EXPORT
1094 int   bam_str2flag(const char *str);    /** returns negative value on error */
1095 
1096 HTSLIB_EXPORT
1097 char *bam_flag2str(int flag);   /** The string must be freed by the user */
1098 
1099 /*! @function
1100  @abstract  Set the name of the query
1101  @param  b  pointer to an alignment
1102  @return    0 on success, -1 on failure
1103  */
1104 HTSLIB_EXPORT
1105 int bam_set_qname(bam1_t *b, const char *qname);
1106 
1107 /*! @function
1108  @abstract  Parse a CIGAR string into a uint32_t array
1109  @param  in      [in]  pointer to the source string
1110  @param  end     [out] address of the pointer to the new end of the input string
1111                        can be NULL
1112  @param  a_cigar [in/out]  address of the destination uint32_t buffer
1113  @param  a_mem   [in/out]  address of the allocated number of buffer elements
1114  @return         number of processed CIGAR operators; -1 on error
1115  */
1116 HTSLIB_EXPORT
1117 ssize_t sam_parse_cigar(const char *in, char **end, uint32_t **a_cigar, size_t *a_mem);
1118 
1119 /*! @function
1120  @abstract  Parse a CIGAR string into a bam1_t struct
1121  @param  in      [in]  pointer to the source string
1122  @param  end     [out] address of the pointer to the new end of the input string
1123                        can be NULL
1124  @param  b       [in/out]  address of the destination bam1_t struct
1125  @return         number of processed CIGAR operators; -1 on error
1126  */
1127 HTSLIB_EXPORT
1128 ssize_t bam_parse_cigar(const char *in, char **end, bam1_t *b);
1129 
1130 /*************************
1131  *** BAM/CRAM indexing ***
1132  *************************/
1133 
1134 // These BAM iterator functions work only on BAM files.  To work with either
1135 // BAM or CRAM files use the sam_index_load() & sam_itr_*() functions.
1136 #define bam_itr_destroy(iter) hts_itr_destroy(iter)
1137 #define bam_itr_queryi(idx, tid, beg, end) sam_itr_queryi(idx, tid, beg, end)
1138 #define bam_itr_querys(idx, hdr, region) sam_itr_querys(idx, hdr, region)
1139 #define bam_itr_next(htsfp, itr, r) sam_itr_next((htsfp), (itr), (r))
1140 
1141 // Load/build .csi or .bai BAM index file.  Does not work with CRAM.
1142 // It is recommended to use the sam_index_* functions below instead.
1143 #define bam_index_load(fn) hts_idx_load((fn), HTS_FMT_BAI)
1144 #define bam_index_build(fn, min_shift) (sam_index_build((fn), (min_shift)))
1145 
1146 /// Initialise fp->idx for the current format type for SAM, BAM and CRAM types .
1147 /** @param fp        File handle for the data file being written.
1148     @param h         Bam header structured (needed for BAI and CSI).
1149     @param min_shift 0 for BAI, or larger for CSI (CSI defaults to 14).
1150     @param fnidx     Filename to write index to.  This pointer must remain valid
1151                      until after sam_idx_save is called.
1152     @return          0 on success, <0 on failure.
1153 
1154     @note This must be called after the header has been written, but before
1155           any other data.
1156 */
1157 HTSLIB_EXPORT
1158 int sam_idx_init(htsFile *fp, sam_hdr_t *h, int min_shift, const char *fnidx);
1159 
1160 /// Writes the index initialised with sam_idx_init to disk.
1161 /** @param fp        File handle for the data file being written.
1162     @return          0 on success, <0 on failure.
1163 */
1164 HTSLIB_EXPORT
1165 int sam_idx_save(htsFile *fp) HTS_RESULT_USED;
1166 
1167 /// Load a BAM (.csi or .bai) or CRAM (.crai) index file
1168 /** @param fp  File handle of the data file whose index is being opened
1169     @param fn  BAM/CRAM/etc filename to search alongside for the index file
1170     @return  The index, or NULL if an error occurred.
1171 
1172 Equivalent to sam_index_load3(fp, fn, NULL, HTS_IDX_SAVE_REMOTE);
1173 */
1174 HTSLIB_EXPORT
1175 hts_idx_t *sam_index_load(htsFile *fp, const char *fn);
1176 
1177 /// Load a specific BAM (.csi or .bai) or CRAM (.crai) index file
1178 /** @param fp     File handle of the data file whose index is being opened
1179     @param fn     BAM/CRAM/etc data file filename
1180     @param fnidx  Index filename, or NULL to search alongside @a fn
1181     @return  The index, or NULL if an error occurred.
1182 
1183 Equivalent to sam_index_load3(fp, fn, fnidx, HTS_IDX_SAVE_REMOTE);
1184 */
1185 HTSLIB_EXPORT
1186 hts_idx_t *sam_index_load2(htsFile *fp, const char *fn, const char *fnidx);
1187 
1188 /// Load or stream a BAM (.csi or .bai) or CRAM (.crai) index file
1189 /** @param fp     File handle of the data file whose index is being opened
1190     @param fn     BAM/CRAM/etc data file filename
1191     @param fnidx  Index filename, or NULL to search alongside @a fn
1192     @param flags  Flags to alter behaviour (see description)
1193     @return  The index, or NULL if an error occurred.
1194 
1195 The @p flags parameter can be set to a combination of the following values:
1196 
1197         HTS_IDX_SAVE_REMOTE   Save a local copy of any remote indexes
1198         HTS_IDX_SILENT_FAIL   Fail silently if the index is not present
1199 
1200 Note that HTS_IDX_SAVE_REMOTE has no effect for remote CRAM indexes.  They
1201 are always downloaded and never cached locally.
1202 
1203 The index struct returned by a successful call should be freed
1204 via hts_idx_destroy() when it is no longer needed.
1205 */
1206 HTSLIB_EXPORT
1207 hts_idx_t *sam_index_load3(htsFile *fp, const char *fn, const char *fnidx, int flags);
1208 
1209 /// Generate and save an index file
1210 /** @param fn        Input BAM/etc filename, to which .csi/etc will be added
1211     @param min_shift Positive to generate CSI, or 0 to generate BAI
1212     @return  0 if successful, or negative if an error occurred (usually -1; or
1213              -2: opening fn failed; -3: format not indexable; -4:
1214              failed to create and/or save the index)
1215 */
1216 HTSLIB_EXPORT
1217 int sam_index_build(const char *fn, int min_shift) HTS_RESULT_USED;
1218 
1219 /// Generate and save an index to a specific file
1220 /** @param fn        Input BAM/CRAM/etc filename
1221     @param fnidx     Output filename, or NULL to add .bai/.csi/etc to @a fn
1222     @param min_shift Positive to generate CSI, or 0 to generate BAI
1223     @return  0 if successful, or negative if an error occurred (see
1224              sam_index_build for error codes)
1225 */
1226 HTSLIB_EXPORT
1227 int sam_index_build2(const char *fn, const char *fnidx, int min_shift) HTS_RESULT_USED;
1228 
1229 /// Generate and save an index to a specific file
1230 /** @param fn        Input BAM/CRAM/etc filename
1231     @param fnidx     Output filename, or NULL to add .bai/.csi/etc to @a fn
1232     @param min_shift Positive to generate CSI, or 0 to generate BAI
1233     @param nthreads  Number of threads to use when building the index
1234     @return  0 if successful, or negative if an error occurred (see
1235              sam_index_build for error codes)
1236 */
1237 HTSLIB_EXPORT
1238 int sam_index_build3(const char *fn, const char *fnidx, int min_shift, int nthreads) HTS_RESULT_USED;
1239 
1240 /// Free a SAM iterator
1241 /// @param iter     Iterator to free
1242 #define sam_itr_destroy(iter) hts_itr_destroy(iter)
1243 
1244 /// Create a BAM/CRAM iterator
1245 /** @param idx     Index
1246     @param tid     Target id
1247     @param beg     Start position in target
1248     @param end     End position in target
1249     @return An iterator on success; NULL on failure
1250 
1251 The following special values (defined in htslib/hts.h)can be used for @p tid.
1252 When using one of these values, @p beg and @p end are ignored.
1253 
1254   HTS_IDX_NOCOOR iterates over unmapped reads sorted at the end of the file
1255   HTS_IDX_START  iterates over the entire file
1256   HTS_IDX_REST   iterates from the current position to the end of the file
1257   HTS_IDX_NONE   always returns "no more alignment records"
1258 
1259 When using HTS_IDX_REST or HTS_IDX_NONE, NULL can be passed in to @p idx.
1260  */
1261 HTSLIB_EXPORT
1262 hts_itr_t *sam_itr_queryi(const hts_idx_t *idx, int tid, hts_pos_t beg, hts_pos_t end);
1263 
1264 /// Create a SAM/BAM/CRAM iterator
1265 /** @param idx     Index
1266     @param hdr     Header
1267     @param region  Region specification
1268     @return An iterator on success; NULL on failure
1269 
1270 Regions are parsed by hts_parse_reg(), and take one of the following forms:
1271 
1272 region          | Outputs
1273 --------------- | -------------
1274 REF             | All reads with RNAME REF
1275 REF:            | All reads with RNAME REF
1276 REF:START       | Reads with RNAME REF overlapping START to end of REF
1277 REF:-END        | Reads with RNAME REF overlapping start of REF to END
1278 REF:START-END   | Reads with RNAME REF overlapping START to END
1279 .               | All reads from the start of the file
1280 *               | Unmapped reads at the end of the file (RNAME '*' in SAM)
1281 
1282 The form `REF:` should be used when the reference name itself contains a colon.
1283 
1284 Note that SAM files must be bgzf-compressed for iterators to work.
1285  */
1286 HTSLIB_EXPORT
1287 hts_itr_t *sam_itr_querys(const hts_idx_t *idx, sam_hdr_t *hdr, const char *region);
1288 
1289 /// Create a multi-region iterator
1290 /** @param idx       Index
1291     @param hdr       Header
1292     @param reglist   Array of regions to iterate over
1293     @param regcount  Number of items in reglist
1294 
1295 Each @p reglist entry should have the reference name in the `reg` field, an
1296 array of regions for that reference in `intervals` and the number of items
1297 in `intervals` should be stored in `count`.  No other fields need to be filled
1298 in.
1299 
1300 The iterator will return all reads overlapping the given regions.  If a read
1301 overlaps more than one region, it will only be returned once.
1302  */
1303 HTSLIB_EXPORT
1304 hts_itr_t *sam_itr_regions(const hts_idx_t *idx, sam_hdr_t *hdr, hts_reglist_t *reglist, unsigned int regcount);
1305 
1306 /// Create a multi-region iterator
1307 /** @param idx       Index
1308     @param hdr       Header
1309     @param regarray  Array of ref:interval region specifiers
1310     @param regcount  Number of items in regarray
1311 
1312 Each @p regarray entry is parsed by hts_parse_reg(), and takes one of the
1313 following forms:
1314 
1315 region          | Outputs
1316 --------------- | -------------
1317 REF             | All reads with RNAME REF
1318 REF:            | All reads with RNAME REF
1319 REF:START       | Reads with RNAME REF overlapping START to end of REF
1320 REF:-END        | Reads with RNAME REF overlapping start of REF to END
1321 REF:START-END   | Reads with RNAME REF overlapping START to END
1322 .               | All reads from the start of the file
1323 *               | Unmapped reads at the end of the file (RNAME '*' in SAM)
1324 
1325 The form `REF:` should be used when the reference name itself contains a colon.
1326 
1327 The iterator will return all reads overlapping the given regions.  If a read
1328 overlaps more than one region, it will only be returned once.
1329  */
1330 HTSLIB_EXPORT
1331 hts_itr_t *sam_itr_regarray(const hts_idx_t *idx, sam_hdr_t *hdr, char **regarray, unsigned int regcount);
1332 
1333 /// Get the next read from a SAM/BAM/CRAM iterator
1334 /** @param htsfp       Htsfile pointer for the input file
1335     @param itr         Iterator
1336     @param r           Pointer to a bam1_t struct
1337     @return >= 0 on success; -1 when there is no more data; < -1 on error
1338  */
sam_itr_next(htsFile * htsfp,hts_itr_t * itr,bam1_t * r)1339 static inline int sam_itr_next(htsFile *htsfp, hts_itr_t *itr, bam1_t *r) {
1340     if (!htsfp->is_bgzf && !htsfp->is_cram) {
1341         hts_log_error("%s not BGZF compressed", htsfp->fn ? htsfp->fn : "File");
1342         return -2;
1343     }
1344     if (!itr) {
1345         hts_log_error("Null iterator");
1346         return -2;
1347     }
1348 
1349     if (itr->multi)
1350         return hts_itr_multi_next(htsfp, itr, r);
1351     else
1352         return hts_itr_next(htsfp->is_bgzf ? htsfp->fp.bgzf : NULL, itr, r, htsfp);
1353 }
1354 
1355 /// Get the next read from a BAM/CRAM multi-iterator
1356 /** @param htsfp       Htsfile pointer for the input file
1357     @param itr         Iterator
1358     @param r           Pointer to a bam1_t struct
1359     @return >= 0 on success; -1 when there is no more data; < -1 on error
1360  */
1361 #define sam_itr_multi_next(htsfp, itr, r) sam_itr_next(htsfp, itr, r)
1362 
1363 HTSLIB_EXPORT
1364 const char *sam_parse_region(sam_hdr_t *h, const char *s, int *tid,
1365                              hts_pos_t *beg, hts_pos_t *end, int flags);
1366 
1367     /***************
1368      *** SAM I/O ***
1369      ***************/
1370 
1371     #define sam_open(fn, mode) (hts_open((fn), (mode)))
1372     #define sam_open_format(fn, mode, fmt) (hts_open_format((fn), (mode), (fmt)))
1373     #define sam_flush(fp) hts_flush((fp))
1374     #define sam_close(fp) hts_close(fp)
1375 
1376     HTSLIB_EXPORT
1377     int sam_open_mode(char *mode, const char *fn, const char *format);
1378 
1379     // A version of sam_open_mode that can handle ,key=value options.
1380     // The format string is allocated and returned, to be freed by the caller.
1381     // Prefix should be "r" or "w",
1382     HTSLIB_EXPORT
1383     char *sam_open_mode_opts(const char *fn,
1384                              const char *mode,
1385                              const char *format);
1386 
1387     HTSLIB_EXPORT
1388     int sam_hdr_change_HD(sam_hdr_t *h, const char *key, const char *val);
1389 
1390     HTSLIB_EXPORT
1391     int sam_parse1(kstring_t *s, sam_hdr_t *h, bam1_t *b) HTS_RESULT_USED;
1392     HTSLIB_EXPORT
1393     int sam_format1(const sam_hdr_t *h, const bam1_t *b, kstring_t *str) HTS_RESULT_USED;
1394 
1395 /// sam_read1 - Read a record from a file
1396 /** @param fp   Pointer to the source file
1397  *  @param h    Pointer to the header previously read (fully or partially)
1398  *  @param b    Pointer to the record placeholder
1399  *  @return >= 0 on successfully reading a new record, -1 on end of stream, < -1 on error
1400  */
1401     HTSLIB_EXPORT
1402     int sam_read1(samFile *fp, sam_hdr_t *h, bam1_t *b) HTS_RESULT_USED;
1403 /// sam_write1 - Write a record to a file
1404 /** @param fp    Pointer to the destination file
1405  *  @param h     Pointer to the header structure previously read
1406  *  @param b     Pointer to the record to be written
1407  *  @return >= 0 on successfully writing the record, -1 on error
1408  */
1409     HTSLIB_EXPORT
1410     int sam_write1(samFile *fp, const sam_hdr_t *h, const bam1_t *b) HTS_RESULT_USED;
1411 
1412 // Forward declaration, see hts_expr.h for full.
1413 struct hts_filter_t;
1414 
1415 /// sam_passes_filter - Checks whether a record passes an hts_filter.
1416 /** @param h      Pointer to the header structure previously read
1417  *  @param b      Pointer to the BAM record to be checked
1418  *  @param filt   Pointer to the filter, created from hts_filter_init.
1419  *  @return       1 if passes, 0 if not, and <0 on error.
1420  */
1421 HTSLIB_EXPORT
1422 int sam_passes_filter(const sam_hdr_t *h, const bam1_t *b,
1423                       struct hts_filter_t *filt);
1424 
1425     /*************************************
1426      *** Manipulating auxiliary fields ***
1427      *************************************/
1428 
1429 /// Converts a BAM aux tag to SAM format
1430 /*
1431  * @param b    Pointer to the bam record
1432  * @param key  Two letter tag key
1433  * @param type Single letter type code: ACcSsIifHZB.
1434  * @param tag  Tag data pointer, in BAM format
1435  * @param end  Pointer to end of bam record (largest extent of tag)
1436  * @param ks   kstring to write the formatted tag to
1437  *
1438  * @return pointer to end of tag on success,
1439  *         NULL on failure.
1440  *
1441  * @discussion The three separate parameters key, type, tag may be
1442  * derived from a s=bam_aux_get() query as s-2, *s and s+1.  However
1443  * it is recommended to use bam_aux_get_str in this situation.
1444  * The desire to split these parameters up is for potential processing
1445  * of non-BAM formats that encode using a BAM type mechanism
1446  * (such as the internal CRAM representation).
1447  */
sam_format_aux1(const uint8_t * key,const uint8_t type,const uint8_t * tag,const uint8_t * end,kstring_t * ks)1448 static inline const uint8_t *sam_format_aux1(const uint8_t *key,
1449                                              const uint8_t type,
1450                                              const uint8_t *tag,
1451                                              const uint8_t *end,
1452                                              kstring_t *ks) {
1453     int r = 0;
1454     const uint8_t *s = tag; // brevity and consistency with other code.
1455     r |= kputsn_((char*)key, 2, ks) < 0;
1456     r |= kputc_(':', ks) < 0;
1457     if (type == 'C') {
1458         r |= kputsn_("i:", 2, ks) < 0;
1459         r |= kputw(*s, ks) < 0;
1460         ++s;
1461     } else if (type == 'c') {
1462         r |= kputsn_("i:", 2, ks) < 0;
1463         r |= kputw(*(int8_t*)s, ks) < 0;
1464         ++s;
1465     } else if (type == 'S') {
1466         if (end - s >= 2) {
1467             r |= kputsn_("i:", 2, ks) < 0;
1468             r |= kputuw(le_to_u16(s), ks) < 0;
1469             s += 2;
1470         } else goto bad_aux;
1471     } else if (type == 's') {
1472         if (end - s >= 2) {
1473             r |= kputsn_("i:", 2, ks) < 0;
1474             r |= kputw(le_to_i16(s), ks) < 0;
1475             s += 2;
1476         } else goto bad_aux;
1477     } else if (type == 'I') {
1478         if (end - s >= 4) {
1479             r |= kputsn_("i:", 2, ks) < 0;
1480             r |= kputuw(le_to_u32(s), ks) < 0;
1481             s += 4;
1482         } else goto bad_aux;
1483     } else if (type == 'i') {
1484         if (end - s >= 4) {
1485             r |= kputsn_("i:", 2, ks) < 0;
1486             r |= kputw(le_to_i32(s), ks) < 0;
1487             s += 4;
1488         } else goto bad_aux;
1489     } else if (type == 'A') {
1490         r |= kputsn_("A:", 2, ks) < 0;
1491         r |= kputc_(*s, ks) < 0;
1492         ++s;
1493     } else if (type == 'f') {
1494         if (end - s >= 4) {
1495             ksprintf(ks, "f:%g", le_to_float(s));
1496             s += 4;
1497         } else goto bad_aux;
1498 
1499     } else if (type == 'd') {
1500         // NB: "d" is not an official type in the SAM spec.
1501         // However for unknown reasons samtools has always supported this.
1502         // We believe, HOPE, it is not in general usage and we do not
1503         // encourage it.
1504         if (end - s >= 8) {
1505             ksprintf(ks, "d:%g", le_to_double(s));
1506             s += 8;
1507         } else goto bad_aux;
1508     } else if (type == 'Z' || type == 'H') {
1509         r |= kputc_(type, ks) < 0;
1510         r |= kputc_(':', ks) < 0;
1511         while (s < end && *s) r |= kputc_(*s++, ks) < 0;
1512         if (s >= end)
1513             goto bad_aux;
1514         ++s;
1515     } else if (type == 'B') {
1516         uint8_t sub_type = *(s++);
1517         int sub_type_size;
1518 
1519         // or externalise sam.c's aux_type2size function?
1520         switch (sub_type) {
1521         case 'A': case 'c': case 'C':
1522             sub_type_size = 1;
1523             break;
1524         case 's': case 'S':
1525             sub_type_size = 2;
1526             break;
1527         case 'i': case 'I': case 'f':
1528             sub_type_size = 4;
1529             break;
1530         default:
1531             sub_type_size = 0;
1532             break;
1533         }
1534 
1535         uint32_t i, n;
1536         if (sub_type_size == 0 || end - s < 4)
1537             goto bad_aux;
1538         n = le_to_u32(s);
1539         s += 4; // now points to the start of the array
1540         if ((end - s) / sub_type_size < n)
1541             goto bad_aux;
1542         r |= kputsn_("B:", 2, ks) < 0;
1543         r |= kputc(sub_type, ks) < 0; // write the type
1544         switch (sub_type) {
1545         case 'c':
1546             if (ks_expand(ks, n*2) < 0) goto mem_err;
1547             for (i = 0; i < n; ++i) {
1548                 ks->s[ks->l++] = ',';
1549                 r |= kputw(*(int8_t*)s, ks) < 0;
1550                 ++s;
1551             }
1552             break;
1553         case 'C':
1554             if (ks_expand(ks, n*2) < 0) goto mem_err;
1555             for (i = 0; i < n; ++i) {
1556                 ks->s[ks->l++] = ',';
1557                 r |= kputuw(*(uint8_t*)s, ks) < 0;
1558                 ++s;
1559             }
1560             break;
1561         case 's':
1562             if (ks_expand(ks, n*4) < 0) goto mem_err;
1563             for (i = 0; i < n; ++i) {
1564                 ks->s[ks->l++] = ',';
1565                 r |= kputw(le_to_i16(s), ks) < 0;
1566                 s += 2;
1567             }
1568             break;
1569         case 'S':
1570             if (ks_expand(ks, n*4) < 0) goto mem_err;
1571             for (i = 0; i < n; ++i) {
1572                 ks->s[ks->l++] = ',';
1573                 r |= kputuw(le_to_u16(s), ks) < 0;
1574                 s += 2;
1575             }
1576             break;
1577         case 'i':
1578             if (ks_expand(ks, n*6) < 0) goto mem_err;
1579             for (i = 0; i < n; ++i) {
1580                 ks->s[ks->l++] = ',';
1581                 r |= kputw(le_to_i32(s), ks) < 0;
1582                 s += 4;
1583             }
1584             break;
1585         case 'I':
1586             if (ks_expand(ks, n*6) < 0) goto mem_err;
1587             for (i = 0; i < n; ++i) {
1588                 ks->s[ks->l++] = ',';
1589                 r |= kputuw(le_to_u32(s), ks) < 0;
1590                 s += 4;
1591             }
1592             break;
1593         case 'f':
1594             if (ks_expand(ks, n*8) < 0) goto mem_err;
1595             for (i = 0; i < n; ++i) {
1596                 ks->s[ks->l++] = ',';
1597                 r |= kputd(le_to_float(s), ks) < 0;
1598                 s += 4;
1599             }
1600             break;
1601         default:
1602             goto bad_aux;
1603         }
1604     } else { // Unknown type
1605         goto bad_aux;
1606     }
1607     return r ? NULL : s;
1608 
1609  bad_aux:
1610     errno = EINVAL;
1611     return NULL;
1612 
1613  mem_err:
1614     hts_log_error("Out of memory");
1615     errno = ENOMEM;
1616     return NULL;
1617 }
1618 
1619 /// Return a pointer to an aux record
1620 /** @param b   Pointer to the bam record
1621     @param tag Desired aux tag
1622     @return Pointer to the tag data, or NULL if tag is not present or on error
1623     If the tag is not present, this function returns NULL and sets errno to
1624     ENOENT.  If the bam record's aux data is corrupt (either a tag has an
1625     invalid type, or the last record is incomplete) then errno is set to
1626     EINVAL and NULL is returned.
1627  */
1628 HTSLIB_EXPORT
1629 uint8_t *bam_aux_get(const bam1_t *b, const char tag[2]);
1630 
1631 /// Return a SAM formatting string containing a BAM tag
1632 /** @param b   Pointer to the bam record
1633     @param tag Desired aux tag
1634     @param s   The kstring to write to.
1635 
1636     @return 1 on success,
1637             0 on no tag found with errno = ENOENT,
1638            -1 on error (errno will be either EINVAL or ENOMEM).
1639  */
bam_aux_get_str(const bam1_t * b,const char tag[2],kstring_t * s)1640 static inline int bam_aux_get_str(const bam1_t *b,
1641                                   const char tag[2],
1642                                   kstring_t *s) {
1643     const uint8_t *t = bam_aux_get(b, tag);
1644     if (!t)
1645         return errno == ENOENT ? 0 : -1;
1646 
1647     if (!sam_format_aux1(t-2, *t, t+1, b->data + b->l_data, s))
1648         return -1;
1649 
1650     return 1;
1651 }
1652 
1653 /// Get an integer aux value
1654 /** @param s Pointer to the tag data, as returned by bam_aux_get()
1655     @return The value, or 0 if the tag was not an integer type
1656     If the tag is not an integer type, errno is set to EINVAL.  This function
1657     will not return the value of floating-point tags.
1658 */
1659 HTSLIB_EXPORT
1660 int64_t bam_aux2i(const uint8_t *s);
1661 
1662 /// Get an integer aux value
1663 /** @param s Pointer to the tag data, as returned by bam_aux_get()
1664     @return The value, or 0 if the tag was not an integer type
1665     If the tag is not an numeric type, errno is set to EINVAL.  The value of
1666     integer flags will be returned cast to a double.
1667 */
1668 HTSLIB_EXPORT
1669 double bam_aux2f(const uint8_t *s);
1670 
1671 /// Get a character aux value
1672 /** @param s Pointer to the tag data, as returned by bam_aux_get().
1673     @return The value, or 0 if the tag was not a character ('A') type
1674     If the tag is not a character type, errno is set to EINVAL.
1675 */
1676 HTSLIB_EXPORT
1677 char bam_aux2A(const uint8_t *s);
1678 
1679 /// Get a string aux value
1680 /** @param s Pointer to the tag data, as returned by bam_aux_get().
1681     @return Pointer to the string, or NULL if the tag was not a string type
1682     If the tag is not a string type ('Z' or 'H'), errno is set to EINVAL.
1683 */
1684 HTSLIB_EXPORT
1685 char *bam_aux2Z(const uint8_t *s);
1686 
1687 /// Get the length of an array-type ('B') tag
1688 /** @param s Pointer to the tag data, as returned by bam_aux_get().
1689     @return The length of the array, or 0 if the tag is not an array type.
1690     If the tag is not an array type, errno is set to EINVAL.
1691  */
1692 HTSLIB_EXPORT
1693 uint32_t bam_auxB_len(const uint8_t *s);
1694 
1695 /// Get an integer value from an array-type tag
1696 /** @param s   Pointer to the tag data, as returned by bam_aux_get().
1697     @param idx 0-based Index into the array
1698     @return The idx'th value, or 0 on error.
1699     If the array is not an integer type, errno is set to EINVAL.  If idx
1700     is greater than or equal to  the value returned by bam_auxB_len(s),
1701     errno is set to ERANGE.  In both cases, 0 will be returned.
1702  */
1703 HTSLIB_EXPORT
1704 int64_t bam_auxB2i(const uint8_t *s, uint32_t idx);
1705 
1706 /// Get a floating-point value from an array-type tag
1707 /** @param s   Pointer to the tag data, as returned by bam_aux_get().
1708     @param idx 0-based Index into the array
1709     @return The idx'th value, or 0.0 on error.
1710     If the array is not a numeric type, errno is set to EINVAL.  This can
1711     only actually happen if the input record has an invalid type field.  If
1712     idx is greater than or equal to  the value returned by bam_auxB_len(s),
1713     errno is set to ERANGE.  In both cases, 0.0 will be returned.
1714  */
1715 HTSLIB_EXPORT
1716 double bam_auxB2f(const uint8_t *s, uint32_t idx);
1717 
1718 /// Append tag data to a bam record
1719 /* @param b    The bam record to append to.
1720    @param tag  Tag identifier
1721    @param type Tag data type
1722    @param len  Length of the data in bytes
1723    @param data The data to append
1724    @return 0 on success; -1 on failure.
1725 If there is not enough space to store the additional tag, errno is set to
1726 ENOMEM.  If the type is invalid, errno may be set to EINVAL.  errno is
1727 also set to EINVAL if the bam record's aux data is corrupt.
1728 */
1729 HTSLIB_EXPORT
1730 int bam_aux_append(bam1_t *b, const char tag[2], char type, int len, const uint8_t *data);
1731 
1732 /// Delete tag data from a bam record
1733 /* @param b The bam record to update
1734    @param s Pointer to the tag to delete, as returned by bam_aux_get().
1735    @return 0 on success; -1 on failure
1736    If the bam record's aux data is corrupt, errno is set to EINVAL and this
1737    function returns -1;
1738 */
1739 HTSLIB_EXPORT
1740 int bam_aux_del(bam1_t *b, uint8_t *s);
1741 
1742 /// Update or add a string-type tag
1743 /* @param b    The bam record to update
1744    @param tag  Tag identifier
1745    @param len  The length of the new string
1746    @param data The new string
1747    @return 0 on success, -1 on failure
1748    This function will not change the ordering of tags in the bam record.
1749    New tags will be appended to any existing aux records.
1750 
1751    If @p len is less than zero, the length of the input string will be
1752    calculated using strlen().  Otherwise exactly @p len bytes will be
1753    copied from @p data to make the new tag.  If these bytes do not
1754    include a terminating NUL character, one will be added.  (Note that
1755    versions of HTSlib up to 1.10.2 had different behaviour here and
1756    simply copied @p len bytes from data.  To generate a valid tag it
1757    was necessary to ensure the last character was a NUL, and include
1758    it in @p len.)
1759 
1760    On failure, errno may be set to one of the following values:
1761 
1762    EINVAL: The bam record's aux data is corrupt or an existing tag with the
1763    given ID is not of type 'Z'.
1764 
1765    ENOMEM: The bam data needs to be expanded and either the attempt to
1766    reallocate the data buffer failed or the resulting buffer would be
1767    longer than the maximum size allowed in a bam record (2Gbytes).
1768 */
1769 HTSLIB_EXPORT
1770 int bam_aux_update_str(bam1_t *b, const char tag[2], int len, const char *data);
1771 
1772 /// Update or add an integer tag
1773 /* @param b    The bam record to update
1774    @param tag  Tag identifier
1775    @param val  The new value
1776    @return 0 on success, -1 on failure
1777    This function will not change the ordering of tags in the bam record.
1778    New tags will be appended to any existing aux records.
1779 
1780    On failure, errno may be set to one of the following values:
1781 
1782    EINVAL: The bam record's aux data is corrupt or an existing tag with the
1783    given ID is not of an integer type (c, C, s, S, i or I).
1784 
1785    EOVERFLOW (or ERANGE on systems that do not have EOVERFLOW): val is
1786    outside the range that can be stored in an integer bam tag (-2147483647
1787    to 4294967295).
1788 
1789    ENOMEM: The bam data needs to be expanded and either the attempt to
1790    reallocate the data buffer failed or the resulting buffer would be
1791    longer than the maximum size allowed in a bam record (2Gbytes).
1792 */
1793 HTSLIB_EXPORT
1794 int bam_aux_update_int(bam1_t *b, const char tag[2], int64_t val);
1795 
1796 /// Update or add a floating-point tag
1797 /* @param b    The bam record to update
1798    @param tag  Tag identifier
1799    @param val  The new value
1800    @return 0 on success, -1 on failure
1801    This function will not change the ordering of tags in the bam record.
1802    New tags will be appended to any existing aux records.
1803 
1804    On failure, errno may be set to one of the following values:
1805 
1806    EINVAL: The bam record's aux data is corrupt or an existing tag with the
1807    given ID is not of a float type.
1808 
1809    ENOMEM: The bam data needs to be expanded and either the attempt to
1810    reallocate the data buffer failed or the resulting buffer would be
1811    longer than the maximum size allowed in a bam record (2Gbytes).
1812 */
1813 HTSLIB_EXPORT
1814 int bam_aux_update_float(bam1_t *b, const char tag[2], float val);
1815 
1816 /// Update or add an array tag
1817 /* @param b     The bam record to update
1818    @param tag   Tag identifier
1819    @param type  Data type (one of c, C, s, S, i, I or f)
1820    @param items Number of items
1821    @param data  Pointer to data
1822    @return 0 on success, -1 on failure
1823    The type parameter indicates the how the data is interpreted:
1824 
1825    Letter code | Data type | Item Size (bytes)
1826    ----------- | --------- | -----------------
1827    c           | int8_t    | 1
1828    C           | uint8_t   | 1
1829    s           | int16_t   | 2
1830    S           | uint16_t  | 2
1831    i           | int32_t   | 4
1832    I           | uint32_t  | 4
1833    f           | float     | 4
1834 
1835    This function will not change the ordering of tags in the bam record.
1836    New tags will be appended to any existing aux records.  The bam record
1837    will grow or shrink in order to accommodate the new data.
1838 
1839    The data parameter must not point to any data in the bam record itself or
1840    undefined behaviour may result.
1841 
1842    On failure, errno may be set to one of the following values:
1843 
1844    EINVAL: The bam record's aux data is corrupt, an existing tag with the
1845    given ID is not of an array type or the type parameter is not one of
1846    the values listed above.
1847 
1848    ENOMEM: The bam data needs to be expanded and either the attempt to
1849    reallocate the data buffer failed or the resulting buffer would be
1850    longer than the maximum size allowed in a bam record (2Gbytes).
1851 */
1852 HTSLIB_EXPORT
1853 int bam_aux_update_array(bam1_t *b, const char tag[2],
1854                          uint8_t type, uint32_t items, void *data);
1855 
1856 /**************************
1857  *** Pileup and Mpileup ***
1858  **************************/
1859 
1860 #if !defined(BAM_NO_PILEUP)
1861 
1862 /*! @typedef
1863  @abstract Generic pileup 'client data'.
1864 
1865  @discussion The pileup iterator allows setting a constructor and
1866  destructor function, which will be called every time a sequence is
1867  fetched and discarded.  This permits caching of per-sequence data in
1868  a tidy manner during the pileup process.  This union is the cached
1869  data to be manipulated by the "client" (the caller of pileup).
1870 */
1871 typedef union {
1872     void *p;
1873     int64_t i;
1874     double f;
1875 } bam_pileup_cd;
1876 
1877 /*! @typedef
1878  @abstract Structure for one alignment covering the pileup position.
1879  @field  b          pointer to the alignment
1880  @field  qpos       position of the read base at the pileup site, 0-based
1881  @field  indel      indel length; 0 for no indel, positive for ins and negative for del
1882  @field  level      the level of the read in the "viewer" mode
1883  @field  is_del     1 iff the base on the padded read is a deletion
1884  @field  is_head    1 iff this is the first base in the query sequence
1885  @field  is_tail    1 iff this is the last base in the query sequence
1886  @field  is_refskip 1 iff the base on the padded read is part of CIGAR N op
1887  @field  aux        (used by bcf_call_gap_prep())
1888  @field  cigar_ind  index of the CIGAR operator that has just been processed
1889 
1890  @discussion See also bam_plbuf_push() and bam_lplbuf_push(). The
1891  difference between the two functions is that the former does not
1892  set bam_pileup1_t::level, while the later does. Level helps the
1893  implementation of alignment viewers, but calculating this has some
1894  overhead.
1895  */
1896 typedef struct bam_pileup1_t {
1897     bam1_t *b;
1898     int32_t qpos;
1899     int indel, level;
1900     uint32_t is_del:1, is_head:1, is_tail:1, is_refskip:1, /* reserved */ :1, aux:27;
1901     bam_pileup_cd cd; // generic per-struct data, owned by caller.
1902     int cigar_ind;
1903 } bam_pileup1_t;
1904 
1905 typedef int (*bam_plp_auto_f)(void *data, bam1_t *b);
1906 
1907 struct bam_plp_s;
1908 typedef struct bam_plp_s *bam_plp_t;
1909 
1910 struct bam_mplp_s;
1911 typedef struct bam_mplp_s *bam_mplp_t;
1912 
1913     /**
1914      *  bam_plp_init() - sets an iterator over multiple
1915      *  @func:      see mplp_func in bam_plcmd.c in samtools for an example. Expected return
1916      *              status: 0 on success, -1 on end, < -1 on non-recoverable errors
1917      *  @data:      user data to pass to @func
1918      *
1919      *  The struct returned by a successful call should be freed
1920      *  via bam_plp_destroy() when it is no longer needed.
1921      */
1922     HTSLIB_EXPORT
1923     bam_plp_t bam_plp_init(bam_plp_auto_f func, void *data);
1924 
1925     HTSLIB_EXPORT
1926     void bam_plp_destroy(bam_plp_t iter);
1927 
1928     HTSLIB_EXPORT
1929     int bam_plp_push(bam_plp_t iter, const bam1_t *b);
1930 
1931     HTSLIB_EXPORT
1932     const bam_pileup1_t *bam_plp_next(bam_plp_t iter, int *_tid, int *_pos, int *_n_plp);
1933 
1934     HTSLIB_EXPORT
1935     const bam_pileup1_t *bam_plp_auto(bam_plp_t iter, int *_tid, int *_pos, int *_n_plp);
1936 
1937     HTSLIB_EXPORT
1938     const bam_pileup1_t *bam_plp64_next(bam_plp_t iter, int *_tid, hts_pos_t *_pos, int *_n_plp);
1939 
1940     HTSLIB_EXPORT
1941     const bam_pileup1_t *bam_plp64_auto(bam_plp_t iter, int *_tid, hts_pos_t *_pos, int *_n_plp);
1942 
1943     HTSLIB_EXPORT
1944     void bam_plp_set_maxcnt(bam_plp_t iter, int maxcnt);
1945 
1946     HTSLIB_EXPORT
1947     void bam_plp_reset(bam_plp_t iter);
1948 
1949     /**
1950      *  bam_plp_constructor() - sets a callback to initialise any per-pileup1_t fields.
1951      *  @plp:       The bam_plp_t initialised using bam_plp_init.
1952      *  @func:      The callback function itself.  When called, it is given
1953      *              the data argument (specified in bam_plp_init), the bam
1954      *              structure and a pointer to a locally allocated
1955      *              bam_pileup_cd union.  This union will also be present in
1956      *              each bam_pileup1_t created.
1957      *              The callback function should have a negative return
1958      *              value to indicate an error. (Similarly for destructor.)
1959      */
1960     HTSLIB_EXPORT
1961     void bam_plp_constructor(bam_plp_t plp,
1962                              int (*func)(void *data, const bam1_t *b, bam_pileup_cd *cd));
1963     HTSLIB_EXPORT
1964     void bam_plp_destructor(bam_plp_t plp,
1965                             int (*func)(void *data, const bam1_t *b, bam_pileup_cd *cd));
1966 
1967     /// Get pileup padded insertion sequence
1968     /**
1969      * @param p       pileup data
1970      * @param ins     the kstring where the insertion sequence will be written
1971      * @param del_len location for deletion length
1972      * @return the length of insertion string on success; -1 on failure.
1973      *
1974      * Fills out the kstring with the padded insertion sequence for the current
1975      * location in 'p'.  If this is not an insertion site, the string is blank.
1976      *
1977      * If del_len is not NULL, the location pointed to is set to the length of
1978      * any deletion immediately following the insertion, or zero if none.
1979      */
1980     HTSLIB_EXPORT
1981     int bam_plp_insertion(const bam_pileup1_t *p, kstring_t *ins, int *del_len) HTS_RESULT_USED;
1982 
1983 
1984     /*! @typedef
1985      @abstract An opaque type used for caching base modification state between
1986      successive calls to bam_mods_* functions.
1987     */
1988     typedef struct hts_base_mod_state hts_base_mod_state;
1989 
1990     /// Get pileup padded insertion sequence, including base modifications
1991     /**
1992      * @param p       pileup data
1993      * @param m       state data for the base modification finder
1994      * @param ins     the kstring where the insertion sequence will be written
1995      * @param del_len location for deletion length
1996      * @return the number of insertion string on success, with string length
1997      *         being accessable via ins->l; -1 on failure.
1998      *
1999      * Fills out the kstring with the padded insertion sequence for the current
2000      * location in 'p'.  If this is not an insertion site, the string is blank.
2001      *
2002      * The modification state needs to have been previously initialised using
2003      * bam_parse_basemod.  It is permitted to be passed in as NULL, in which
2004      * case this function outputs identically to bam_plp_insertion.
2005      *
2006      * If del_len is not NULL, the location pointed to is set to the length of
2007      * any deletion immediately following the insertion, or zero if none.
2008      */
2009     HTSLIB_EXPORT
2010     int bam_plp_insertion_mod(const bam_pileup1_t *p, hts_base_mod_state *m,
2011                               kstring_t *ins, int *del_len) HTS_RESULT_USED;
2012 
2013     /// Create a new bam_mplp_t structure
2014     /** The struct returned by a successful call should be freed
2015      *  via bam_mplp_destroy() when it is no longer needed.
2016      */
2017     HTSLIB_EXPORT
2018     bam_mplp_t bam_mplp_init(int n, bam_plp_auto_f func, void **data);
2019 
2020     /// Set up mpileup overlap detection
2021     /**
2022      * @param iter    mpileup iterator
2023      * @return 0 on success; a negative value on error
2024      *
2025      *  If called, mpileup will detect overlapping
2026      *  read pairs and for each base pair set the base quality of the
2027      *  lower-quality base to zero, thus effectively discarding it from
2028      *  calling. If the two bases are identical, the quality of the other base
2029      *  is increased to the sum of their qualities (capped at 200), otherwise
2030      *  it is multiplied by 0.8.
2031      */
2032     HTSLIB_EXPORT
2033     int bam_mplp_init_overlaps(bam_mplp_t iter);
2034 
2035     HTSLIB_EXPORT
2036     void bam_mplp_destroy(bam_mplp_t iter);
2037 
2038     HTSLIB_EXPORT
2039     void bam_mplp_set_maxcnt(bam_mplp_t iter, int maxcnt);
2040 
2041     HTSLIB_EXPORT
2042     int bam_mplp_auto(bam_mplp_t iter, int *_tid, int *_pos, int *n_plp, const bam_pileup1_t **plp);
2043 
2044     HTSLIB_EXPORT
2045     int bam_mplp64_auto(bam_mplp_t iter, int *_tid, hts_pos_t *_pos, int *n_plp, const bam_pileup1_t **plp);
2046 
2047     HTSLIB_EXPORT
2048     void bam_mplp_reset(bam_mplp_t iter);
2049 
2050     HTSLIB_EXPORT
2051     void bam_mplp_constructor(bam_mplp_t iter,
2052                               int (*func)(void *data, const bam1_t *b, bam_pileup_cd *cd));
2053 
2054     HTSLIB_EXPORT
2055     void bam_mplp_destructor(bam_mplp_t iter,
2056                              int (*func)(void *data, const bam1_t *b, bam_pileup_cd *cd));
2057 
2058 #endif // ~!defined(BAM_NO_PILEUP)
2059 
2060 
2061 /***********************************
2062  * BAQ calculation and realignment *
2063  ***********************************/
2064 
2065 HTSLIB_EXPORT
2066 int sam_cap_mapq(bam1_t *b, const char *ref, hts_pos_t ref_len, int thres);
2067 
2068 // Used as flag parameter in sam_prob_realn.
2069 enum htsRealnFlags {
2070     BAQ_APPLY = 1,
2071     BAQ_EXTEND = 2,
2072     BAQ_REDO = 4,
2073 
2074     // Platform subfield, in bit position 3 onwards
2075     BAQ_AUTO = 0<<3,
2076     BAQ_ILLUMINA = 1<<3,
2077     BAQ_PACBIOCCS = 2<<3,
2078     BAQ_PACBIO = 3<<3,
2079     BAQ_ONT = 4<<3,
2080     BAQ_GENAPSYS = 5<<3
2081 };
2082 
2083 /// Calculate BAQ scores
2084 /** @param b   BAM record
2085     @param ref     Reference sequence
2086     @param ref_len Reference sequence length
2087     @param flag    Flags, see description
2088     @return 0 on success \n
2089            -1 if the read was unmapped, zero length, had no quality values, did not have at least one M, X or = CIGAR operator, or included a reference skip. \n
2090            -3 if BAQ alignment has already been done and does not need to be applied, or has already been applied. \n
2091            -4 if alignment failed (most likely due to running out of memory)
2092 
2093 This function calculates base alignment quality (BAQ) values using the method
2094 described in "Improving SNP discovery by base alignment quality", Heng Li,
2095 Bioinformatics, Volume 27, Issue 8 (https://doi.org/10.1093/bioinformatics/btr076).
2096 
2097 The @param flag value can be generated using the htsRealnFlags enum, but for
2098 backwards compatibilty reasons is retained as an "int".  An example usage
2099 of the enum could be this, equivalent to flag 19:
2100 
2101     sam_prob_realn(b, ref, len, BAQ_APPLY | BAQ_EXTEND | BAQ_PACBIOCCS);
2102 
2103 The following @param flag bits can be used:
2104 
2105 Bit 0 (BAQ_APPLY): Adjust the quality values using the BAQ values
2106 
2107  If set, the data in the BQ:Z tag is used to adjust the quality values, and
2108  the BQ:Z tag is renamed to ZQ:Z.
2109 
2110  If clear, and a ZQ:Z tag is present, the quality values are reverted using
2111  the data in the tag, and the tag is renamed to BQ:Z.
2112 
2113 Bit 1 (BAQ_EXTEND): Use "extended" BAQ.
2114 
2115  Changes the BAQ calculation to increase sensitivity at the expense of
2116  reduced specificity.
2117 
2118 Bit 2 (BAQ_REDO): Recalculate BAQ, even if a BQ tag is present.
2119 
2120  Force BAQ to be recalculated.  Note that a ZQ:Z tag will always disable
2121  recalculation.
2122 
2123 Bits 3-10: Choose parameters tailored to a specific instrument type.
2124 
2125  One of BAQ_AUTO, BAQ_ILLUMINA, BAQ_PACBIOCCS, BAQ_PACBIO, BAQ_ONT and
2126  BAQ_GENAPSYS.  The BAQ parameter tuning are still a work in progress and
2127  at the time of writing mainly consist of Illumina vs long-read technology
2128  adjustments.
2129 
2130 @bug
2131 If the input read has both BQ:Z and ZQ:Z tags, the ZQ:Z one will be removed.
2132 Depending on what previous processing happened, this may or may not be the
2133 correct thing to do.  It would be wise to avoid this situation if possible.
2134 */
2135 HTSLIB_EXPORT
2136 int sam_prob_realn(bam1_t *b, const char *ref, hts_pos_t ref_len, int flag);
2137 
2138 // ---------------------------
2139 // Base modification retrieval
2140 
2141 /*! @typedef
2142  @abstract Holds a single base modification.
2143  @field modified_base     The short base code (m, h, etc) or -ChEBI (negative)
2144  @field canonical_base    The canonical base referred to in the MM tag.
2145                           One of A, C, G, T or N.  Note this may not be the
2146                           explicit base recorded in the SEQ column (esp. if N).
2147  @field stran             0 or 1, indicating + or - strand from MM tag.
2148  @field qual              Quality code (256*probability), or -1 if unknown
2149 
2150  @discussion
2151  Note this doesn't hold any location data or information on which other
2152  modifications may be possible at this site.
2153 */
2154 typedef struct hts_base_mod {
2155     int modified_base;
2156     int canonical_base;
2157     int strand;
2158     int qual;
2159 } hts_base_mod;
2160 
2161 /// Allocates an hts_base_mode_state.
2162 /**
2163  * @return An hts_base_mode_state pointer on success,
2164  *         NULL on failure.
2165  *
2166  * This just allocates the memory.  The initialisation of the contents is
2167  * done using bam_parse_basemod.  Successive calls may be made to that
2168  * without the need to free and allocate a new state.
2169  *
2170  * The state be destroyed using the hts_base_mode_state_free function.
2171  */
2172 HTSLIB_EXPORT
2173 hts_base_mod_state *hts_base_mod_state_alloc(void);
2174 
2175 /// Destroys an  hts_base_mode_state.
2176 /**
2177  * @param state    The base modification state pointer.
2178  *
2179  * The should have previously been created by hts_base_mode_state_alloc.
2180  */
2181 HTSLIB_EXPORT
2182 void hts_base_mod_state_free(hts_base_mod_state *state);
2183 
2184 /// Parses the Mm and Ml tags out of a bam record.
2185 /**
2186  * @param b        BAM alignment record
2187  * @param state    The base modification state pointer.
2188  * @return 0 on success,
2189  *         -1 on failure.
2190  *
2191  * This fills out the contents of the modification state, resetting the
2192  * iterator location to the first sequence base.
2193  */
2194 HTSLIB_EXPORT
2195 int bam_parse_basemod(const bam1_t *b, hts_base_mod_state *state);
2196 
2197 /// Returns modification status for the next base position in the query seq.
2198 /**
2199  * @param b        BAM alignment record
2200  * @param state    The base modification state pointer.
2201  * @param mods     A supplied array for returning base modifications
2202  * @param n_mods   The size of the mods array
2203  * @return The number of modifications found on success,
2204  *         -1 on failure.
2205  *
2206  * This is intended to be used as an iterator, with one call per location
2207  * along the query sequence.
2208  *
2209  * If no modifications are found, the returned value is zero.
2210  * If more than n_mods modifications are found, the total found is returned.
2211  * Note this means the caller needs to check whether this is higher than
2212  * n_mods.
2213  */
2214 HTSLIB_EXPORT
2215 int bam_mods_at_next_pos(const bam1_t *b, hts_base_mod_state *state,
2216                          hts_base_mod *mods, int n_mods);
2217 
2218 /// Finds the next location containing base modifications and returns them
2219 /**
2220  * @param b        BAM alignment record
2221  * @param state    The base modification state pointer.
2222  * @param mods     A supplied array for returning base modifications
2223  * @param n_mods   The size of the mods array
2224  * @return The number of modifications found on success,
2225  *         0 if no more modifications are present,
2226  *         -1 on failure.
2227  *
2228  * Unlike bam_mods_at_next_pos this skips ahead to the next site
2229  * with modifications.
2230  *
2231  * If more than n_mods modifications are found, the total found is returned.
2232  * Note this means the caller needs to check whether this is higher than
2233  * n_mods.
2234  */
2235 HTSLIB_EXPORT
2236 int bam_next_basemod(const bam1_t *b, hts_base_mod_state *state,
2237                      hts_base_mod *mods, int n_mods, int *pos);
2238 
2239 /// Returns modification status for a specific query position.
2240 /**
2241  * @param b        BAM alignment record
2242  * @param state    The base modification state pointer.
2243  * @param mods     A supplied array for returning base modifications
2244  * @param n_mods   The size of the mods array
2245  * @return The number of modifications found on success,
2246  *         -1 on failure.
2247  *
2248  * Note if called multipled times, qpos must be higher than the previous call.
2249  * Hence this is suitable for use from a pileup iterator.  If more random
2250  * access is required, bam_parse_basemod must be called each time to reset
2251  * the state although this has an efficiency cost.
2252  *
2253  * If no modifications are found, the returned value is zero.
2254  * If more than n_mods modifications are found, the total found is returned.
2255  * Note this means the caller needs to check whether this is higher than
2256  * n_mods.
2257  */
2258 HTSLIB_EXPORT
2259 int bam_mods_at_qpos(const bam1_t *b, int qpos, hts_base_mod_state *state,
2260                      hts_base_mod *mods, int n_mods);
2261 
2262 
2263 #ifdef __cplusplus
2264 }
2265 #endif
2266 
2267 #endif
2268