1 /// @file htslib/hts.h
2 /// Format-neutral I/O, indexing, and iterator API functions.
3 /*
4     Copyright (C) 2012-2021 Genome Research Ltd.
5     Copyright (C) 2010, 2012 Broad Institute.
6     Portions copyright (C) 2003-2006, 2008-2010 by Heng Li <lh3@live.co.uk>
7 
8     Author: Heng Li <lh3@sanger.ac.uk>
9 
10 Permission is hereby granted, free of charge, to any person obtaining a copy
11 of this software and associated documentation files (the "Software"), to deal
12 in the Software without restriction, including without limitation the rights
13 to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
14 copies of the Software, and to permit persons to whom the Software is
15 furnished to do so, subject to the following conditions:
16 
17 The above copyright notice and this permission notice shall be included in
18 all copies or substantial portions of the Software.
19 
20 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
21 IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
22 FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
23 THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
24 LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
25 FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
26 DEALINGS IN THE SOFTWARE.  */
27 
28 #ifndef HTSLIB_HTS_H
29 #define HTSLIB_HTS_H
30 
31 #include <stddef.h>
32 #include <stdint.h>
33 #include <inttypes.h>
34 
35 #include "hts_defs.h"
36 #include "hts_log.h"
37 #include "kstring.h"
38 #include "kroundup.h"
39 
40 #ifdef __cplusplus
41 extern "C" {
42 #endif
43 
44 // Separator used to split HTS_PATH (for plugins); REF_PATH (cram references)
45 #if defined(_WIN32) || defined(__MSYS__)
46 #define HTS_PATH_SEPARATOR_CHAR ';'
47 #define HTS_PATH_SEPARATOR_STR  ";"
48 #else
49 #define HTS_PATH_SEPARATOR_CHAR ':'
50 #define HTS_PATH_SEPARATOR_STR  ":"
51 #endif
52 
53 #ifndef HTS_BGZF_TYPEDEF
54 typedef struct BGZF BGZF;
55 #define HTS_BGZF_TYPEDEF
56 #endif
57 struct cram_fd;
58 struct hFILE;
59 struct hts_tpool;
60 struct sam_hdr_t;
61 
62 /**
63  * @hideinitializer
64  * Deprecated macro to expand a dynamic array of a given type
65  *
66  * @param         type_t The type of the array elements
67  * @param[in]     n      Requested number of elements of type type_t
68  * @param[in,out] m      Size of memory allocated
69  * @param[in,out] ptr    Pointer to the array
70  *
71  * @discussion
72  * Do not use this macro.  Use hts_resize() instead as allows allocation
73  * failures to be handled more gracefully.
74  *
75  * The array *ptr will be expanded if necessary so that it can hold @p n
76  * or more elements.  If the array is expanded then the new size will be
77  * written to @p m and the value in @p ptr may change.
78  *
79  * It must be possible to take the address of @p ptr and @p m must be usable
80  * as an lvalue.
81  *
82  * @bug
83  * If the memory allocation fails, this will call exit(1).  This is
84  * not ideal behaviour in a library.
85  */
86 #define hts_expand(type_t, n, m, ptr) do {                              \
87         if ((n) > (m)) {                                                \
88             size_t hts_realloc_or_die(size_t, size_t, size_t, size_t,   \
89                                       int, void **, const char *);      \
90             (m) = hts_realloc_or_die((n) >= 1 ? (n) : 1, (m), sizeof(m), \
91                                      sizeof(type_t),  0,                \
92                                      (void **)&(ptr), __func__);        \
93         }                                                               \
94     } while (0)
95 
96 /**
97  * @hideinitializer
98  * Macro to expand a dynamic array, zeroing any newly-allocated memory
99  *
100  * @param         type_t The type of the array elements
101  * @param[in]     n      Requested number of elements of type type_t
102  * @param[in,out] m      Size of memory allocated
103  * @param[in,out] ptr    Pointer to the array
104  *
105  * @discussion
106  * Do not use this macro.  Use hts_resize() instead as allows allocation
107  * failures to be handled more gracefully.
108  *
109  * As for hts_expand(), except the bytes that make up the array elements
110  * between the old and new values of @p m are set to zero using memset().
111  *
112  * @bug
113  * If the memory allocation fails, this will call exit(1).  This is
114  * not ideal behaviour in a library.
115  */
116 
117 
118 #define hts_expand0(type_t, n, m, ptr) do {                             \
119         if ((n) > (m)) {                                                \
120             size_t hts_realloc_or_die(size_t, size_t, size_t, size_t,   \
121                                       int, void **, const char *);      \
122             (m) = hts_realloc_or_die((n) >= 1 ? (n) : 1, (m), sizeof(m), \
123                                      sizeof(type_t), 1,                 \
124                                      (void **)&(ptr), __func__);        \
125         }                                                               \
126     } while (0)
127 
128 // For internal use (by hts_resize()) only
129 HTSLIB_EXPORT
130 int hts_resize_array_(size_t, size_t, size_t, void *, void **, int,
131                       const char *);
132 
133 #define HTS_RESIZE_CLEAR 1
134 
135 /**
136  * @hideinitializer
137  * Macro to expand a dynamic array of a given type
138  *
139  * @param         type_t    The type of the array elements
140  * @param[in]     num       Requested number of elements of type type_t
141  * @param[in,out] size_ptr  Pointer to where the size (in elements) of the
142                             array is stored.
143  * @param[in,out] ptr       Location of the pointer to the array
144  * @param[in]     flags     Option flags
145  *
146  * @return        0 for success, or negative if an error occurred.
147  *
148  * @discussion
149  * The array *ptr will be expanded if necessary so that it can hold @p num
150  * or more elements.  If the array is expanded then the new size will be
151  * written to @p *size_ptr and the value in @p *ptr may change.
152  *
153  * If ( @p flags & HTS_RESIZE_CLEAR ) is set, any newly allocated memory will
154  * be cleared.
155  */
156 
157 #define hts_resize(type_t, num, size_ptr, ptr, flags)       \
158     ((num) > (*(size_ptr))                                  \
159      ? hts_resize_array_(sizeof(type_t), (num),             \
160                          sizeof(*(size_ptr)), (size_ptr),   \
161                          (void **)(ptr), (flags), __func__) \
162      : 0)
163 
164 /// Release resources when dlclosing a dynamically loaded HTSlib
165 /** @discussion
166  *  Normally HTSlib cleans up automatically when your program exits,
167  *  whether that is via exit(3) or returning from main(). However if you
168  *  have dlopen(3)ed HTSlib and wish to close it before your main program
169  *  exits, you must call hts_lib_shutdown() before dlclose(3).
170 */
171 HTSLIB_EXPORT
172 void hts_lib_shutdown(void);
173 
174 /**
175  * Wrapper function for free(). Enables memory deallocation across DLL
176  * boundary. Should be used by all applications, which are compiled
177  * with a different standard library than htslib and call htslib
178  * methods that return dynamically allocated data.
179  */
180 HTSLIB_EXPORT
181 void hts_free(void *ptr);
182 
183 /************
184  * File I/O *
185  ************/
186 
187 // Add new entries only at the end (but before the *_maximum entry)
188 // of these enums, as their numbering is part of the htslib ABI.
189 
190 enum htsFormatCategory {
191     unknown_category,
192     sequence_data,    // Sequence data -- SAM, BAM, CRAM, etc
193     variant_data,     // Variant calling data -- VCF, BCF, etc
194     index_file,       // Index file associated with some data file
195     region_list,      // Coordinate intervals or regions -- BED, etc
196     category_maximum = 32767
197 };
198 
199 enum htsExactFormat {
200     unknown_format,
201     binary_format, text_format,
202     sam, bam, bai, cram, crai, vcf, bcf, csi, gzi, tbi, bed,
203     htsget,
204     json HTS_DEPRECATED_ENUM("Use htsExactFormat 'htsget' instead") = htsget,
205     empty_format,  // File is empty (or empty after decompression)
206     fasta_format, fastq_format, fai_format, fqi_format,
207     hts_crypt4gh_format,
208     d4_format,
209     format_maximum = 32767
210 };
211 
212 enum htsCompression {
213     no_compression, gzip, bgzf, custom, bzip2_compression, razf_compression,
214     xz_compression, zstd_compression,
215     compression_maximum = 32767
216 };
217 
218 typedef struct htsFormat {
219     enum htsFormatCategory category;
220     enum htsExactFormat format;
221     struct { short major, minor; } version;
222     enum htsCompression compression;
223     short compression_level;  // currently unused
224     void *specific;  // format specific options; see struct hts_opt.
225 } htsFormat;
226 
227 struct hts_idx_t;
228 typedef struct hts_idx_t hts_idx_t;
229 struct hts_filter_t;
230 
231 /**
232  * @brief File handle returned by hts_open() etc.
233  * This structure should be considered opaque by end users. There should be
234  * no need to access most fields directly in user code, and in cases where
235  * it is desirable accessor functions such as hts_get_format() are provided.
236  */
237 // Maintainers note htsFile cannot be an incomplete struct because some of its
238 // fields are part of libhts.so's ABI (hence these fields must not be moved):
239 //  - fp is used in the public sam_itr_next()/etc macros
240 //  - is_bin is used directly in samtools <= 1.1 and bcftools <= 1.1
241 //  - is_write and is_cram are used directly in samtools <= 1.1
242 //  - fp is used directly in samtools (up to and including current develop)
243 //  - line is used directly in bcftools (up to and including current develop)
244 //  - is_bgzf and is_cram flags indicate which fp union member to use.
245 //    Note is_bgzf being set does not indicate the flag is BGZF compressed,
246 //    nor even whether it is compressed at all (eg on naked BAMs).
247 typedef struct htsFile {
248     uint32_t is_bin:1, is_write:1, is_be:1, is_cram:1, is_bgzf:1, dummy:27;
249     int64_t lineno;
250     kstring_t line;
251     char *fn, *fn_aux;
252     union {
253         BGZF *bgzf;
254         struct cram_fd *cram;
255         struct hFILE *hfile;
256     } fp;
257     void *state;  // format specific state information
258     htsFormat format;
259     hts_idx_t *idx;
260     const char *fnidx;
261     struct sam_hdr_t *bam_header;
262     struct hts_filter_t *filter;
263 } htsFile;
264 
265 // A combined thread pool and queue allocation size.
266 // The pool should already be defined, but qsize may be zero to
267 // indicate an appropriate queue size is taken from the pool.
268 //
269 // Reasons for explicitly setting it could be where many more file
270 // descriptors are in use than threads, so keeping memory low is
271 // important.
272 typedef struct htsThreadPool {
273     struct hts_tpool *pool; // The shared thread pool itself
274     int qsize;    // Size of I/O queue to use for this fp
275 } htsThreadPool;
276 
277 // REQUIRED_FIELDS
278 enum sam_fields {
279     SAM_QNAME = 0x00000001,
280     SAM_FLAG  = 0x00000002,
281     SAM_RNAME = 0x00000004,
282     SAM_POS   = 0x00000008,
283     SAM_MAPQ  = 0x00000010,
284     SAM_CIGAR = 0x00000020,
285     SAM_RNEXT = 0x00000040,
286     SAM_PNEXT = 0x00000080,
287     SAM_TLEN  = 0x00000100,
288     SAM_SEQ   = 0x00000200,
289     SAM_QUAL  = 0x00000400,
290     SAM_AUX   = 0x00000800,
291     SAM_RGAUX = 0x00001000,
292 };
293 
294 // Mostly CRAM only, but this could also include other format options
295 enum hts_fmt_option {
296     // CRAM specific
297     CRAM_OPT_DECODE_MD,
298     CRAM_OPT_PREFIX,
299     CRAM_OPT_VERBOSITY,  // obsolete, use hts_set_log_level() instead
300     CRAM_OPT_SEQS_PER_SLICE,
301     CRAM_OPT_SLICES_PER_CONTAINER,
302     CRAM_OPT_RANGE,
303     CRAM_OPT_VERSION,    // rename to cram_version?
304     CRAM_OPT_EMBED_REF,
305     CRAM_OPT_IGNORE_MD5,
306     CRAM_OPT_REFERENCE,  // make general
307     CRAM_OPT_MULTI_SEQ_PER_SLICE,
308     CRAM_OPT_NO_REF,
309     CRAM_OPT_USE_BZIP2,
310     CRAM_OPT_SHARED_REF,
311     CRAM_OPT_NTHREADS,   // deprecated, use HTS_OPT_NTHREADS
312     CRAM_OPT_THREAD_POOL,// make general
313     CRAM_OPT_USE_LZMA,
314     CRAM_OPT_USE_RANS,
315     CRAM_OPT_REQUIRED_FIELDS,
316     CRAM_OPT_LOSSY_NAMES,
317     CRAM_OPT_BASES_PER_SLICE,
318     CRAM_OPT_STORE_MD,
319     CRAM_OPT_STORE_NM,
320     CRAM_OPT_RANGE_NOSEEK, // CRAM_OPT_RANGE minus the seek
321     CRAM_OPT_USE_TOK,
322     CRAM_OPT_USE_FQZ,
323     CRAM_OPT_USE_ARITH,
324     CRAM_OPT_POS_DELTA,  // force delta for AP, even on non-pos sorted data
325 
326     // General purpose
327     HTS_OPT_COMPRESSION_LEVEL = 100,
328     HTS_OPT_NTHREADS,
329     HTS_OPT_THREAD_POOL,
330     HTS_OPT_CACHE_SIZE,
331     HTS_OPT_BLOCK_SIZE,
332     HTS_OPT_FILTER,
333     HTS_OPT_PROFILE,
334 
335     // Fastq
336 
337     // Boolean.
338     // Read / Write CASAVA 1.8 format.
339     // See https://emea.support.illumina.com/content/dam/illumina-support/documents/documentation/software_documentation/bcl2fastq/bcl2fastq_letterbooklet_15038058brpmi.pdf
340     //
341     // The CASAVA tag matches \d:[YN]:\d+:[ACGTN]+
342     // The first \d is read 1/2 (1 or 2), [YN] is QC-PASS/FAIL flag,
343     // \d+ is a control number, and the sequence at the end is
344     // for barcode sequence.  Barcodes are read into the aux tag defined
345     // by FASTQ_OPT_BARCODE ("BC" by default).
346     FASTQ_OPT_CASAVA = 1000,
347 
348     // String.
349     // Whether to read / write extra SAM format aux tags from the fastq
350     // identifier line.  For reading this can simply be "1" to request
351     // decoding aux tags.  For writing it is a comma separated list of aux
352     // tag types to be written out.
353     FASTQ_OPT_AUX,
354 
355     // Boolean.
356     // Whether to add /1 and /2 to read identifiers when writing FASTQ.
357     // These come from the BAM_FREAD1 or BAM_FREAD2 flags.
358     // (Detecting the /1 and /2 is automatic when reading fastq.)
359     FASTQ_OPT_RNUM,
360 
361     // Two character string.
362     // Barcode aux tag for CASAVA; defaults to "BC".
363     FASTQ_OPT_BARCODE,
364 
365     // Process SRA and ENA read names which pointlessly move the original
366     // name to the second field and insert a constructed <run>.<number>
367     // name in its place.
368     FASTQ_OPT_NAME2,
369 };
370 
371 // Profile options for encoding; primarily used at present in CRAM
372 // but also usable in BAM as a synonym for deflate compression levels.
373 enum hts_profile_option {
374     HTS_PROFILE_FAST,
375     HTS_PROFILE_NORMAL,
376     HTS_PROFILE_SMALL,
377     HTS_PROFILE_ARCHIVE,
378 };
379 
380 // For backwards compatibility
381 #define cram_option hts_fmt_option
382 
383 typedef struct hts_opt {
384     char *arg;                // string form, strdup()ed
385     enum hts_fmt_option opt;  // tokenised key
386     union {                   // ... and value
387         int i;
388         char *s;
389     } val;
390     struct hts_opt *next;
391 } hts_opt;
392 
393 #define HTS_FILE_OPTS_INIT {{0},0}
394 
395 /*
396  * Explicit index file name delimiter, see below
397  */
398 #define HTS_IDX_DELIM "##idx##"
399 
400 
401 /**********************
402  * Exported functions *
403  **********************/
404 
405 /*
406  * Parses arg and appends it to the option list.
407  *
408  * Returns 0 on success;
409  *        -1 on failure.
410  */
411 HTSLIB_EXPORT
412 int hts_opt_add(hts_opt **opts, const char *c_arg);
413 
414 /*
415  * Applies an hts_opt option list to a given htsFile.
416  *
417  * Returns 0 on success
418  *        -1 on failure
419  */
420 HTSLIB_EXPORT
421 int hts_opt_apply(htsFile *fp, hts_opt *opts);
422 
423 /*
424  * Frees an hts_opt list.
425  */
426 HTSLIB_EXPORT
427 void hts_opt_free(hts_opt *opts);
428 
429 /*
430  * Accepts a string file format (sam, bam, cram, vcf, bam) optionally
431  * followed by a comma separated list of key=value options and splits
432  * these up into the fields of htsFormat struct.
433  *
434  * Returns 0 on success
435  *        -1 on failure.
436  */
437 HTSLIB_EXPORT
438 int hts_parse_format(htsFormat *opt, const char *str);
439 
440 /*
441  * Tokenise options as (key(=value)?,)*(key(=value)?)?
442  * NB: No provision for ',' appearing in the value!
443  * Add backslashing rules?
444  *
445  * This could be used as part of a general command line option parser or
446  * as a string concatenated onto the file open mode.
447  *
448  * Returns 0 on success
449  *        -1 on failure.
450  */
451 HTSLIB_EXPORT
452 int hts_parse_opt_list(htsFormat *opt, const char *str);
453 
454 /*! @abstract Table for converting a nucleotide character to 4-bit encoding.
455 The input character may be either an IUPAC ambiguity code, '=' for 0, or
456 '0'/'1'/'2'/'3' for a result of 1/2/4/8.  The result is encoded as 1/2/4/8
457 for A/C/G/T or combinations of these bits for ambiguous bases.
458 */
459 extern const unsigned char seq_nt16_table[256];
460 
461 /*! @abstract Table for converting a 4-bit encoded nucleotide to an IUPAC
462 ambiguity code letter (or '=' when given 0).
463 */
464 extern const char seq_nt16_str[];
465 
466 /*! @abstract Table for converting a 4-bit encoded nucleotide to about 2 bits.
467 Returns 0/1/2/3 for 1/2/4/8 (i.e., A/C/G/T), or 4 otherwise (0 or ambiguous).
468 */
469 extern const int seq_nt16_int[];
470 
471 /*!
472   @abstract  Get the htslib version number
473   @return    For released versions, a string like "N.N[.N]"; or git describe
474   output if using a library built within a Git repository.
475 */
476 HTSLIB_EXPORT
477 const char *hts_version(void);
478 
479 /*!
480   @abstract  Compile-time HTSlib version number, for use in #if checks
481   @return    For released versions X.Y[.Z], an integer of the form XYYYZZ;
482   useful for preprocessor conditionals such as
483       #if HTS_VERSION >= 101000  // Check for v1.10 or later
484 */
485 // Maintainers: Bump this in the final stage of preparing a new release.
486 // Immediately after release, bump ZZ to 90 to distinguish in-development
487 // Git repository builds from the release; you may wish to increment this
488 // further when significant features are merged.
489 #define HTS_VERSION 101400
490 
491 /*! @abstract Introspection on the features enabled in htslib
492  *
493  * @return a bitfield of HTS_FEATURE_* macros.
494  */
495 HTSLIB_EXPORT
496 unsigned int hts_features(void);
497 
498 HTSLIB_EXPORT
499 const char *hts_test_feature(unsigned int id);
500 
501 /*! @abstract Introspection on the features enabled in htslib, string form
502  *
503  * @return a string describing htslib build features
504  */
505 HTSLIB_EXPORT
506 const char *hts_feature_string(void);
507 
508 // Whether ./configure was used or vanilla Makefile
509 #define HTS_FEATURE_CONFIGURE    1
510 
511 // Whether --enable-plugins was used
512 #define HTS_FEATURE_PLUGINS      2
513 
514 // Transport specific
515 #define HTS_FEATURE_LIBCURL      (1u<<10)
516 #define HTS_FEATURE_S3           (1u<<11)
517 #define HTS_FEATURE_GCS          (1u<<12)
518 
519 // Compression options
520 #define HTS_FEATURE_LIBDEFLATE   (1u<<20)
521 #define HTS_FEATURE_LZMA         (1u<<21)
522 #define HTS_FEATURE_BZIP2        (1u<<22)
523 #define HTS_FEATURE_HTSCODECS    (1u<<23) // htscodecs library version
524 
525 // Build params
526 #define HTS_FEATURE_CC           (1u<<27)
527 #define HTS_FEATURE_CFLAGS       (1u<<28)
528 #define HTS_FEATURE_CPPFLAGS     (1u<<29)
529 #define HTS_FEATURE_LDFLAGS      (1u<<30)
530 
531 
532 /*!
533   @abstract    Determine format by peeking at the start of a file
534   @param fp    File opened for reading, positioned at the beginning
535   @param fmt   Format structure that will be filled out on return
536   @return      0 for success, or negative if an error occurred.
537 */
538 HTSLIB_EXPORT
539 int hts_detect_format(struct hFILE *fp, htsFormat *fmt);
540 
541 /*!
542   @abstract    Get a human-readable description of the file format
543   @param fmt   Format structure holding type, version, compression, etc.
544   @return      Description string, to be freed by the caller after use.
545 */
546 HTSLIB_EXPORT
547 char *hts_format_description(const htsFormat *format);
548 
549 /*!
550   @abstract       Open a sequence data (SAM/BAM/CRAM) or variant data (VCF/BCF)
551                   or possibly-compressed textual line-orientated file
552   @param fn       The file name or "-" for stdin/stdout. For indexed files
553                   with a non-standard naming, the file name can include the
554                   name of the index file delimited with HTS_IDX_DELIM
555   @param mode     Mode matching / [rwa][bcefFguxz0-9]* /
556   @discussion
557       With 'r' opens for reading; any further format mode letters are ignored
558       as the format is detected by checking the first few bytes or BGZF blocks
559       of the file.  With 'w' or 'a' opens for writing or appending, with format
560       specifier letters:
561         b  binary format (BAM, BCF, etc) rather than text (SAM, VCF, etc)
562         c  CRAM format
563         f  FASTQ format
564         F  FASTA format
565         g  gzip compressed
566         u  uncompressed
567         z  bgzf compressed
568         [0-9]  zlib compression level
569       and with non-format option letters (for any of 'r'/'w'/'a'):
570         e  close the file on exec(2) (opens with O_CLOEXEC, where supported)
571         x  create the file exclusively (opens with O_EXCL, where supported)
572       Note that there is a distinction between 'u' and '0': the first yields
573       plain uncompressed output whereas the latter outputs uncompressed data
574       wrapped in the zlib format.
575   @example
576       [rw]b  .. compressed BCF, BAM, FAI
577       [rw]bu .. uncompressed BCF
578       [rw]z  .. compressed VCF
579       [rw]   .. uncompressed VCF
580 */
581 HTSLIB_EXPORT
582 htsFile *hts_open(const char *fn, const char *mode);
583 
584 /*!
585   @abstract       Open a SAM/BAM/CRAM/VCF/BCF/etc file
586   @param fn       The file name or "-" for stdin/stdout
587   @param mode     Open mode, as per hts_open()
588   @param fmt      Optional format specific parameters
589   @discussion
590       See hts_open() for description of fn and mode.
591       // TODO Update documentation for s/opts/fmt/
592       Opts contains a format string (sam, bam, cram, vcf, bcf) which will,
593       if defined, override mode.  Opts also contains a linked list of hts_opt
594       structures to apply to the open file handle.  These can contain things
595       like pointers to the reference or information on compression levels,
596       block sizes, etc.
597 */
598 HTSLIB_EXPORT
599 htsFile *hts_open_format(const char *fn, const char *mode, const htsFormat *fmt);
600 
601 /*!
602   @abstract       Open an existing stream as a SAM/BAM/CRAM/VCF/BCF/etc file
603   @param fn       The already-open file handle
604   @param mode     Open mode, as per hts_open()
605 */
606 HTSLIB_EXPORT
607 htsFile *hts_hopen(struct hFILE *fp, const char *fn, const char *mode);
608 
609 /*!
610   @abstract  For output streams, flush any buffered data
611   @param fp  The file handle to be flushed
612   @return    0 for success, or negative if an error occurred.
613   @since     1.14
614 */
615 HTSLIB_EXPORT
616 int hts_flush(htsFile *fp);
617 
618 /*!
619   @abstract  Close a file handle, flushing buffered data for output streams
620   @param fp  The file handle to be closed
621   @return    0 for success, or negative if an error occurred.
622 */
623 HTSLIB_EXPORT
624 int hts_close(htsFile *fp);
625 
626 /*!
627   @abstract  Returns the file's format information
628   @param fp  The file handle
629   @return    Read-only pointer to the file's htsFormat.
630 */
631 HTSLIB_EXPORT
632 const htsFormat *hts_get_format(htsFile *fp);
633 
634 /*!
635   @ abstract      Returns a string containing the file format extension.
636   @ param format  Format structure containing the file type.
637   @ return        A string ("sam", "bam", etc) or "?" for unknown formats.
638  */
639 HTSLIB_EXPORT
640 const char *hts_format_file_extension(const htsFormat *format);
641 
642 /*!
643   @abstract  Sets a specified CRAM option on the open file handle.
644   @param fp  The file handle open the open file.
645   @param opt The CRAM_OPT_* option.
646   @param ... Optional arguments, dependent on the option used.
647   @return    0 for success, or negative if an error occurred.
648 */
649 HTSLIB_EXPORT
650 int hts_set_opt(htsFile *fp, enum hts_fmt_option opt, ...);
651 
652 /*!
653   @abstract         Read a line (and its \n or \r\n terminator) from a file
654   @param fp         The file handle
655   @param delimiter  Unused, but must be '\n' (or KS_SEP_LINE)
656   @param str        The line (not including the terminator) is written here
657   @return           Length of the string read;
658                     -1 on end-of-file; <= -2 on error
659 */
660 HTSLIB_EXPORT
661 int hts_getline(htsFile *fp, int delimiter, kstring_t *str);
662 
663 HTSLIB_EXPORT
664 char **hts_readlines(const char *fn, int *_n);
665 /*!
666     @abstract       Parse comma-separated list or read list from a file
667     @param list     File name or comma-separated list
668     @param is_file
669     @param _n       Size of the output array (number of items read)
670     @return         NULL on failure or pointer to newly allocated array of
671                     strings
672 */
673 HTSLIB_EXPORT
674 char **hts_readlist(const char *fn, int is_file, int *_n);
675 
676 /*!
677   @abstract  Create extra threads to aid compress/decompression for this file
678   @param fp  The file handle
679   @param n   The number of worker threads to create
680   @return    0 for success, or negative if an error occurred.
681   @notes     This function creates non-shared threads for use solely by fp.
682              The hts_set_thread_pool function is the recommended alternative.
683 */
684 HTSLIB_EXPORT
685 int hts_set_threads(htsFile *fp, int n);
686 
687 /*!
688   @abstract  Create extra threads to aid compress/decompression for this file
689   @param fp  The file handle
690   @param p   A pool of worker threads, previously allocated by hts_create_threads().
691   @return    0 for success, or negative if an error occurred.
692 */
693 HTSLIB_EXPORT
694 int hts_set_thread_pool(htsFile *fp, htsThreadPool *p);
695 
696 /*!
697   @abstract  Adds a cache of decompressed blocks, potentially speeding up seeks.
698              This may not work for all file types (currently it is bgzf only).
699   @param fp  The file handle
700   @param n   The size of cache, in bytes
701 */
702 HTSLIB_EXPORT
703 void hts_set_cache_size(htsFile *fp, int n);
704 
705 /*!
706   @abstract  Set .fai filename for a file opened for reading
707   @return    0 for success, negative on failure
708   @discussion
709       Called before *_hdr_read(), this provides the name of a .fai file
710       used to provide a reference list if the htsFile contains no @SQ headers.
711 */
712 HTSLIB_EXPORT
713 int hts_set_fai_filename(htsFile *fp, const char *fn_aux);
714 
715 
716 /*!
717   @abstract  Sets a filter expression
718   @return    0 for success, negative on failure
719   @discussion
720       To clear an existing filter, specifying expr as NULL.
721 */
722 HTSLIB_EXPORT
723 int hts_set_filter_expression(htsFile *fp, const char *expr);
724 
725 /*!
726   @abstract  Determine whether a given htsFile contains a valid EOF block
727   @return    3 for a non-EOF checkable filetype;
728              2 for an unseekable file type where EOF cannot be checked;
729              1 for a valid EOF block;
730              0 for if the EOF marker is absent when it should be present;
731             -1 (with errno set) on failure
732   @discussion
733       Check if the BGZF end-of-file (EOF) marker is present
734 */
735 HTSLIB_EXPORT
736 int hts_check_EOF(htsFile *fp);
737 
738 /************
739  * Indexing *
740  ************/
741 
742 /*!
743 These HTS_IDX_* macros are used as special tid values for hts_itr_query()/etc,
744 producing iterators operating as follows:
745  - HTS_IDX_NOCOOR iterates over unmapped reads sorted at the end of the file
746  - HTS_IDX_START  iterates over the entire file
747  - HTS_IDX_REST   iterates from the current position to the end of the file
748  - HTS_IDX_NONE   always returns "no more alignment records"
749 When one of these special tid values is used, beg and end are ignored.
750 When REST or NONE is used, idx is also ignored and may be NULL.
751 */
752 #define HTS_IDX_NOCOOR (-2)
753 #define HTS_IDX_START  (-3)
754 #define HTS_IDX_REST   (-4)
755 #define HTS_IDX_NONE   (-5)
756 
757 #define HTS_FMT_CSI 0
758 #define HTS_FMT_BAI 1
759 #define HTS_FMT_TBI 2
760 #define HTS_FMT_CRAI 3
761 #define HTS_FMT_FAI 4
762 
763 // Almost INT64_MAX, but when cast into a 32-bit int it's
764 // also INT_MAX instead of -1.  This avoids bugs with old code
765 // using the new hts_pos_t data type.
766 #define HTS_POS_MAX ((((int64_t)INT_MAX)<<32)|INT_MAX)
767 #define HTS_POS_MIN INT64_MIN
768 #define PRIhts_pos PRId64
769 typedef int64_t hts_pos_t;
770 
771 // For comparison with previous release:
772 //
773 // #define HTS_POS_MAX INT_MAX
774 // #define HTS_POS_MIN INT_MIN
775 // #define PRIhts_pos PRId32
776 // typedef int32_t hts_pos_t;
777 
778 typedef struct hts_pair_pos_t {
779    hts_pos_t beg, end;
780 } hts_pair_pos_t;
781 
782 typedef hts_pair_pos_t hts_pair32_t;  // For backwards compatibility
783 
784 typedef struct hts_pair64_t {
785     uint64_t u, v;
786 } hts_pair64_t;
787 
788 typedef struct hts_pair64_max_t {
789     uint64_t u, v;
790     uint64_t max;
791 } hts_pair64_max_t;
792 
793 typedef struct hts_reglist_t {
794     const char *reg;
795     hts_pair_pos_t *intervals;
796     int tid;
797     uint32_t count;
798     hts_pos_t min_beg, max_end;
799 } hts_reglist_t;
800 
801 typedef int hts_readrec_func(BGZF *fp, void *data, void *r, int *tid, hts_pos_t *beg, hts_pos_t *end);
802 typedef int hts_seek_func(void *fp, int64_t offset, int where);
803 typedef int64_t hts_tell_func(void *fp);
804 
805 /**
806  * @brief File iterator that can handle multiple target regions.
807  * This structure should be considered opaque by end users.
808  * It does both the stepping inside the file and the filtering of alignments.
809  * It can operate in single or multi-region mode, and depending on this,
810  * it uses different fields.
811  *
812  * read_rest (1) - read everything from the current offset, without filtering
813  * finished  (1) - no more iterations
814  * is_cram   (1) - current file has CRAM format
815  * nocoor    (1) - read all unmapped reads
816  *
817  * multi     (1) - multi-region moode
818  * reg_list  - List of target regions
819  * n_reg     - Size of the above list
820  * curr_reg  - List index of the current region of search
821  * curr_intv - Interval index inside the current region; points to a (beg, end)
822  * end       - Used for CRAM files, to preserve the max end coordinate
823  *
824  * multi     (0) - single-region mode
825  * tid       - Reference id of the target region
826  * beg       - Start position of the target region
827  * end       - End position of the target region
828  *
829  * Common fields:
830  * off        - List of file offsets computed from the index
831  * n_off      - Size of the above list
832  * i          - List index of the current file offset
833  * curr_off   - File offset for the next file read
834  * curr_tid   - Reference id of the current alignment
835  * curr_beg   - Start position of the current alignment
836  * curr_end   - End position of the current alignment
837  * nocoor_off - File offset where the unmapped reads start
838  *
839  * readrec    - File specific function that reads an alignment
840  * seek       - File specific function for changing the file offset
841  * tell       - File specific function for indicating the file offset
842  */
843 
844 typedef struct hts_itr_t {
845     uint32_t read_rest:1, finished:1, is_cram:1, nocoor:1, multi:1, dummy:27;
846     int tid, n_off, i, n_reg;
847     hts_pos_t beg, end;
848     hts_reglist_t *reg_list;
849     int curr_tid, curr_reg, curr_intv;
850     hts_pos_t curr_beg, curr_end;
851     uint64_t curr_off, nocoor_off;
852     hts_pair64_max_t *off;
853     hts_readrec_func *readrec;
854     hts_seek_func *seek;
855     hts_tell_func *tell;
856     struct {
857         int n, m;
858         int *a;
859     } bins;
860 } hts_itr_t;
861 
862 typedef hts_itr_t hts_itr_multi_t;
863 
864 /// Compute the first bin on a given level
865 #define hts_bin_first(l) (((1<<(((l)<<1) + (l))) - 1) / 7)
866 /// Compute the parent bin of a given bin
867 #define hts_bin_parent(b) (((b) - 1) >> 3)
868 
869 ///////////////////////////////////////////////////////////
870 // Low-level API for building indexes.
871 
872 /// Create a BAI/CSI/TBI type index structure
873 /** @param n          Initial number of targets
874     @param fmt        Format, one of HTS_FMT_CSI, HTS_FMT_BAI or HTS_FMT_TBI
875     @param offset0    Initial file offset
876     @param min_shift  Number of bits for the minimal interval
877     @param n_lvls     Number of levels in the binning index
878     @return An initialised hts_idx_t struct on success; NULL on failure
879 
880 The struct returned by a successful call should be freed via hts_idx_destroy()
881 when it is no longer needed.
882 */
883 HTSLIB_EXPORT
884 hts_idx_t *hts_idx_init(int n, int fmt, uint64_t offset0, int min_shift, int n_lvls);
885 
886 /// Free a BAI/CSI/TBI type index
887 /** @param idx   Index structure to free
888  */
889 HTSLIB_EXPORT
890 void hts_idx_destroy(hts_idx_t *idx);
891 
892 /// Push an index entry
893 /** @param idx        Index
894     @param tid        Target id
895     @param beg        Range start (zero-based)
896     @param end        Range end (zero-based, half-open)
897     @param offset     File offset
898     @param is_mapped  Range corresponds to a mapped read
899     @return 0 on success; -1 on failure
900 
901 The @p is_mapped parameter is used to update the n_mapped / n_unmapped counts
902 stored in the meta-data bin.
903  */
904 HTSLIB_EXPORT
905 int hts_idx_push(hts_idx_t *idx, int tid, hts_pos_t beg, hts_pos_t end, uint64_t offset, int is_mapped);
906 
907 /// Finish building an index
908 /** @param idx          Index
909     @param final_offset Last file offset
910     @return 0 on success; non-zero on failure.
911 */
912 HTSLIB_EXPORT
913 int hts_idx_finish(hts_idx_t *idx, uint64_t final_offset);
914 
915 /// Returns index format
916 /** @param idx   Index
917     @return One of HTS_FMT_CSI, HTS_FMT_BAI or HTS_FMT_TBI
918 */
919 HTSLIB_EXPORT
920 int hts_idx_fmt(hts_idx_t *idx);
921 
922 /// Add name to TBI index meta-data
923 /** @param idx   Index
924     @param tid   Target identifier
925     @param name  Target name
926     @return Index number of name in names list on success; -1 on failure.
927 */
928 HTSLIB_EXPORT
929 int hts_idx_tbi_name(hts_idx_t *idx, int tid, const char *name);
930 
931 // Index loading and saving
932 
933 /// Save an index to a file
934 /** @param idx  Index to be written
935     @param fn   Input BAM/BCF/etc filename, to which .bai/.csi/etc will be added
936     @param fmt  One of the HTS_FMT_* index formats
937     @return  0 if successful, or negative if an error occurred.
938 */
939 HTSLIB_EXPORT
940 int hts_idx_save(const hts_idx_t *idx, const char *fn, int fmt) HTS_RESULT_USED;
941 
942 /// Save an index to a specific file
943 /** @param idx    Index to be written
944     @param fn     Input BAM/BCF/etc filename
945     @param fnidx  Output filename, or NULL to add .bai/.csi/etc to @a fn
946     @param fmt    One of the HTS_FMT_* index formats
947     @return  0 if successful, or negative if an error occurred.
948 */
949 HTSLIB_EXPORT
950 int hts_idx_save_as(const hts_idx_t *idx, const char *fn, const char *fnidx, int fmt) HTS_RESULT_USED;
951 
952 /// Load an index file
953 /** @param fn   BAM/BCF/etc filename, to which .bai/.csi/etc will be added or
954                 the extension substituted, to search for an existing index file.
955                 In case of a non-standard naming, the file name can include the
956                 name of the index file delimited with HTS_IDX_DELIM.
957     @param fmt  One of the HTS_FMT_* index formats
958     @return  The index, or NULL if an error occurred.
959 
960 If @p fn contains the string "##idx##" (HTS_IDX_DELIM), the part before
961 the delimiter will be used as the name of the data file and the part after
962 it will be used as the name of the index.
963 
964 Otherwise, this function tries to work out the index name as follows:
965 
966   It will try appending ".csi" to @p fn
967   It will try substituting an existing suffix (e.g. .bam, .vcf) with ".csi"
968   Then, if @p fmt is HTS_FMT_BAI:
969     It will try appending ".bai" to @p fn
970     To will substituting the existing suffix (e.g. .bam) with ".bai"
971   else if @p fmt is HTS_FMT_TBI:
972     It will try appending ".tbi" to @p fn
973     To will substituting the existing suffix (e.g. .vcf) with ".tbi"
974 
975 If the index file is remote (served over a protocol like https), first a check
976 is made to see is a locally cached copy is available.  This is done for all
977 of the possible names listed above.  If a cached copy is not available then
978 the index will be downloaded and stored in the current working directory,
979 with the same name as the remote index.
980 
981     Equivalent to hts_idx_load3(fn, NULL, fmt, HTS_IDX_SAVE_REMOTE);
982 */
983 HTSLIB_EXPORT
984 hts_idx_t *hts_idx_load(const char *fn, int fmt);
985 
986 /// Load a specific index file
987 /** @param fn     Input BAM/BCF/etc filename
988     @param fnidx  The input index filename
989     @return  The index, or NULL if an error occurred.
990 
991     Equivalent to hts_idx_load3(fn, fnidx, 0, 0);
992 
993     This function will not attempt to save index files locally.
994 */
995 HTSLIB_EXPORT
996 hts_idx_t *hts_idx_load2(const char *fn, const char *fnidx);
997 
998 /// Load a specific index file
999 /** @param fn     Input BAM/BCF/etc filename
1000     @param fnidx  The input index filename
1001     @param fmt    One of the HTS_FMT_* index formats
1002     @param flags  Flags to alter behaviour (see description)
1003     @return  The index, or NULL if an error occurred.
1004 
1005     If @p fnidx is NULL, the index name will be derived from @p fn in the
1006     same way as hts_idx_load().
1007 
1008     If @p fnidx is not NULL, @p fmt is ignored.
1009 
1010     The @p flags parameter can be set to a combination of the following
1011     values:
1012 
1013         HTS_IDX_SAVE_REMOTE   Save a local copy of any remote indexes
1014         HTS_IDX_SILENT_FAIL   Fail silently if the index is not present
1015 
1016     The index struct returned by a successful call should be freed
1017     via hts_idx_destroy() when it is no longer needed.
1018 */
1019 HTSLIB_EXPORT
1020 hts_idx_t *hts_idx_load3(const char *fn, const char *fnidx, int fmt, int flags);
1021 
1022 /// Flags for hts_idx_load3() ( and also sam_idx_load3(), tbx_idx_load3() )
1023 #define HTS_IDX_SAVE_REMOTE 1
1024 #define HTS_IDX_SILENT_FAIL 2
1025 
1026 ///////////////////////////////////////////////////////////
1027 // Functions for accessing meta-data stored in indexes
1028 
1029 typedef const char *(*hts_id2name_f)(void*, int);
1030 
1031 /// Get extra index meta-data
1032 /** @param idx    The index
1033     @param l_meta Pointer to where the length of the extra data is stored
1034     @return Pointer to the extra data if present; NULL otherwise
1035 
1036     Indexes (both .tbi and .csi) made by tabix include extra data about
1037     the indexed file.  The returns a pointer to this data.  Note that the
1038     data is stored exactly as it is in the index.  Callers need to interpret
1039     the results themselves, including knowing what sort of data to expect;
1040     byte swapping etc.
1041 */
1042 HTSLIB_EXPORT
1043 uint8_t *hts_idx_get_meta(hts_idx_t *idx, uint32_t *l_meta);
1044 
1045 /// Set extra index meta-data
1046 /** @param idx     The index
1047     @param l_meta  Length of data
1048     @param meta    Pointer to the extra data
1049     @param is_copy If not zero, a copy of the data is taken
1050     @return 0 on success; -1 on failure (out of memory).
1051 
1052     Sets the data that is returned by hts_idx_get_meta().
1053 
1054     If is_copy != 0, a copy of the input data is taken.  If not, ownership of
1055     the data pointed to by *meta passes to the index.
1056 */
1057 HTSLIB_EXPORT
1058 int hts_idx_set_meta(hts_idx_t *idx, uint32_t l_meta, uint8_t *meta, int is_copy);
1059 
1060 /// Get number of mapped and unmapped reads from an index
1061 /** @param      idx      Index
1062     @param      tid      Target ID
1063     @param[out] mapped   Location to store number of mapped reads
1064     @param[out] unmapped Location to store number of unmapped reads
1065     @return 0 on success; -1 on failure (data not available)
1066 
1067     BAI and CSI indexes store information on the number of reads for each
1068     target that were mapped or unmapped (unmapped reads will generally have
1069     a paired read that is mapped to the target).  This function returns this
1070     information if it is available.
1071 
1072     @note Cram CRAI indexes do not include this information.
1073 */
1074 HTSLIB_EXPORT
1075 int hts_idx_get_stat(const hts_idx_t* idx, int tid, uint64_t* mapped, uint64_t* unmapped);
1076 
1077 /// Return the number of unplaced reads from an index
1078 /** @param idx    Index
1079     @return Unplaced reads count
1080 
1081     Unplaced reads are not linked to any reference (e.g. RNAME is '*' in SAM
1082     files).
1083 */
1084 HTSLIB_EXPORT
1085 uint64_t hts_idx_get_n_no_coor(const hts_idx_t* idx);
1086 
1087 /// Return a list of target names from an index
1088 /** @param      idx    Index
1089     @param[out] n      Location to store the number of targets
1090     @param      getid  Callback function to get the name for a target ID
1091     @param      hdr    Header from indexed file
1092     @return An array of pointers to the names on success; NULL on failure
1093 
1094     @note The names are pointers into the header data structure.  When cleaning
1095     up, only the array should be freed, not the names.
1096  */
1097 HTSLIB_EXPORT
1098 const char **hts_idx_seqnames(const hts_idx_t *idx, int *n, hts_id2name_f getid, void *hdr); // free only the array, not the values
1099 
1100 /// Return the number of targets from an index
1101 /** @param      idx    Index
1102     @return The number of targets
1103  */
1104 HTSLIB_EXPORT
1105 int hts_idx_nseq(const hts_idx_t *idx);
1106 
1107 ///////////////////////////////////////////////////////////
1108 // Region parsing
1109 
1110 #define HTS_PARSE_THOUSANDS_SEP 1  ///< Ignore ',' separators within numbers
1111 #define HTS_PARSE_ONE_COORD     2  ///< chr:pos means chr:pos-pos and not chr:pos-end
1112 #define HTS_PARSE_LIST          4  ///< Expect a comma separated list of regions. (Disables HTS_PARSE_THOUSANDS_SEP)
1113 
1114 /// Parse a numeric string
1115 /** The number may be expressed in scientific notation, and optionally may
1116     contain commas in the integer part (before any decimal point or E notation).
1117     @param str     String to be parsed
1118     @param strend  If non-NULL, set on return to point to the first character
1119                    in @a str after those forming the parsed number
1120     @param flags   Or'ed-together combination of HTS_PARSE_* flags
1121     @return  Converted value of the parsed number.
1122 
1123     When @a strend is NULL, a warning will be printed (if hts_verbose is HTS_LOG_WARNING
1124     or more) if there are any trailing characters after the number.
1125 */
1126 HTSLIB_EXPORT
1127 long long hts_parse_decimal(const char *str, char **strend, int flags);
1128 
1129 typedef int (*hts_name2id_f)(void*, const char*);
1130 
1131 /// Parse a "CHR:START-END"-style region string
1132 /** @param str  String to be parsed
1133     @param beg  Set on return to the 0-based start of the region
1134     @param end  Set on return to the 1-based end of the region
1135     @return  Pointer to the colon or '\0' after the reference sequence name,
1136              or NULL if @a str could not be parsed.
1137 
1138     NOTE: For compatibility with hts_parse_reg only.
1139     Please use hts_parse_region instead.
1140 */
1141 HTSLIB_EXPORT
1142 const char *hts_parse_reg64(const char *str, hts_pos_t *beg, hts_pos_t *end);
1143 
1144 /// Parse a "CHR:START-END"-style region string
1145 /** @param str  String to be parsed
1146     @param beg  Set on return to the 0-based start of the region
1147     @param end  Set on return to the 1-based end of the region
1148     @return  Pointer to the colon or '\0' after the reference sequence name,
1149              or NULL if @a str could not be parsed.
1150 */
1151 HTSLIB_EXPORT
1152 const char *hts_parse_reg(const char *str, int *beg, int *end);
1153 
1154 /// Parse a "CHR:START-END"-style region string
1155 /** @param str   String to be parsed
1156     @param tid   Set on return (if not NULL) to be reference index (-1 if invalid)
1157     @param beg   Set on return to the 0-based start of the region
1158     @param end   Set on return to the 1-based end of the region
1159     @param getid Function pointer.  Called if not NULL to set tid.
1160     @param hdr   Caller data passed to getid.
1161     @param flags Bitwise HTS_PARSE_* flags listed above.
1162     @return      Pointer to the byte after the end of the entire region
1163                  specifier (including any trailing comma) on success,
1164                  or NULL if @a str could not be parsed.
1165 
1166     A variant of hts_parse_reg which is reference-id aware.  It uses
1167     the iterator name2id callbacks to validate the region tokenisation works.
1168 
1169     This is necessary due to GRCh38 HLA additions which have reference names
1170     like "HLA-DRB1*12:17".
1171 
1172     To work around ambiguous parsing issues, eg both "chr1" and "chr1:100-200"
1173     are reference names, quote using curly braces.
1174     Thus "{chr1}:100-200" and "{chr1:100-200}" disambiguate the above example.
1175 
1176     Flags are used to control how parsing works, and can be one of the below.
1177 
1178     HTS_PARSE_THOUSANDS_SEP:
1179         Ignore commas in numbers.  For example with this flag 1,234,567
1180         is interpreted as 1234567.
1181 
1182     HTS_PARSE_LIST:
1183         If present, the region is assmed to be a comma separated list and
1184         position parsing will not contain commas (this implicitly
1185         clears HTS_PARSE_THOUSANDS_SEP in the call to hts_parse_decimal).
1186         On success the return pointer will be the start of the next region, ie
1187         the character after the comma.  (If *ret != '\0' then the caller can
1188         assume another region is present in the list.)
1189 
1190         If not set then positions may contain commas.  In this case the return
1191         value should point to the end of the string, or NULL on failure.
1192 
1193     HTS_PARSE_ONE_COORD:
1194         If present, X:100 is treated as the single base pair region X:100-100.
1195         In this case X:-100 is shorthand for X:1-100 and X:100- is X:100-<end>.
1196         (This is the standard bcftools region convention.)
1197 
1198         When not set X:100 is considered to be X:100-<end> where <end> is
1199         the end of chromosome X (set to INT_MAX here).  X:100- and X:-100 are
1200         invalid.
1201         (This is the standard samtools region convention.)
1202 
1203     Note the supplied string expects 1 based inclusive coordinates, but the
1204     returned coordinates start from 0 and are half open, so pos0 is valid
1205     for use in e.g. "for (pos0 = beg; pos0 < end; pos0++) {...}"
1206 
1207     If NULL is returned, the value in tid mat give additional information
1208     about the error:
1209 
1210         -2   Failed to parse @p hdr; or out of memory
1211         -1   The reference in @p str has mismatched braces, or does not
1212              exist in @p hdr
1213         >= 0 The specified range in @p str could not be parsed
1214 */
1215 HTSLIB_EXPORT
1216 const char *hts_parse_region(const char *s, int *tid, hts_pos_t *beg,
1217                              hts_pos_t *end, hts_name2id_f getid, void *hdr,
1218                              int flags);
1219 
1220 
1221 ///////////////////////////////////////////////////////////
1222 // Generic iterators
1223 //
1224 // These functions provide the low-level infrastructure for iterators.
1225 // Wrappers around these are used to make iterators for specific file types.
1226 // See:
1227 //     htslib/sam.h  for SAM/BAM/CRAM iterators
1228 //     htslib/vcf.h  for VCF/BCF iterators
1229 //     htslib/tbx.h  for files indexed by tabix
1230 
1231 /// Create a single-region iterator
1232 /** @param idx      Index
1233     @param tid      Target ID
1234     @param beg      Start of region
1235     @param end      End of region
1236     @param readrec  Callback to read a record from the input file
1237     @return An iterator on success; NULL on failure
1238 
1239     The iterator struct returned by a successful call should be freed
1240     via hts_itr_destroy() when it is no longer needed.
1241  */
1242 HTSLIB_EXPORT
1243 hts_itr_t *hts_itr_query(const hts_idx_t *idx, int tid, hts_pos_t beg, hts_pos_t end, hts_readrec_func *readrec);
1244 
1245 /// Free an iterator
1246 /** @param iter   Iterator to free
1247  */
1248 HTSLIB_EXPORT
1249 void hts_itr_destroy(hts_itr_t *iter);
1250 
1251 typedef hts_itr_t *hts_itr_query_func(const hts_idx_t *idx, int tid, hts_pos_t beg, hts_pos_t end, hts_readrec_func *readrec);
1252 
1253 /// Create a single-region iterator from a text region specification
1254 /** @param idx       Index
1255     @param reg       Region specifier
1256     @param getid     Callback function to return the target ID for a name
1257     @param hdr       Input file header
1258     @param itr_query Callback function returning an iterator for a numeric tid,
1259                      start and end position
1260     @param readrec   Callback to read a record from the input file
1261     @return An iterator on success; NULL on error
1262 
1263     The iterator struct returned by a successful call should be freed
1264     via hts_itr_destroy() when it is no longer needed.
1265  */
1266 HTSLIB_EXPORT
1267 hts_itr_t *hts_itr_querys(const hts_idx_t *idx, const char *reg, hts_name2id_f getid, void *hdr, hts_itr_query_func *itr_query, hts_readrec_func *readrec);
1268 
1269 /// Return the next record from an iterator
1270 /** @param fp      Input file handle
1271     @param iter    Iterator
1272     @param r       Pointer to record placeholder
1273     @param data    Data passed to the readrec callback
1274     @return >= 0 on success, -1 when there is no more data, < -1 on error
1275  */
1276 HTSLIB_EXPORT
1277 int hts_itr_next(BGZF *fp, hts_itr_t *iter, void *r, void *data) HTS_RESULT_USED;
1278 
1279 /**********************************
1280  * Iterator with multiple regions *
1281  **********************************/
1282 
1283 typedef int hts_itr_multi_query_func(const hts_idx_t *idx, hts_itr_t *itr);
1284 HTSLIB_EXPORT
1285 int hts_itr_multi_bam(const hts_idx_t *idx, hts_itr_t *iter);
1286 HTSLIB_EXPORT
1287 int hts_itr_multi_cram(const hts_idx_t *idx, hts_itr_t *iter);
1288 
1289 /// Create a multi-region iterator from a region list
1290 /** @param idx          Index
1291     @param reglist      Region list
1292     @param count        Number of items in region list
1293     @param getid        Callback to convert names to target IDs
1294     @param hdr          Indexed file header (passed to getid)
1295     @param itr_specific Filetype-specific callback function
1296     @param readrec      Callback to read an input file record
1297     @param seek         Callback to seek in the input file
1298     @param tell         Callback to return current input file location
1299     @return An iterator on success; NULL on failure
1300 
1301     The iterator struct returned by a successful call should be freed
1302     via hts_itr_destroy() when it is no longer needed.
1303  */
1304 HTSLIB_EXPORT
1305 hts_itr_t *hts_itr_regions(const hts_idx_t *idx, hts_reglist_t *reglist, int count, hts_name2id_f getid, void *hdr, hts_itr_multi_query_func *itr_specific, hts_readrec_func *readrec, hts_seek_func *seek, hts_tell_func *tell);
1306 
1307 /// Return the next record from an iterator
1308 /** @param fp      Input file handle
1309     @param iter    Iterator
1310     @param r       Pointer to record placeholder
1311     @return >= 0 on success, -1 when there is no more data, < -1 on error
1312  */
1313 HTSLIB_EXPORT
1314 int hts_itr_multi_next(htsFile *fd, hts_itr_t *iter, void *r);
1315 
1316 /// Create a region list from a char array
1317 /** @param argv      Char array of target:interval elements, e.g. chr1:2500-3600, chr1:5100, chr2
1318     @param argc      Number of items in the array
1319     @param r_count   Pointer to the number of items in the resulting region list
1320     @param hdr       Header for the sam/bam/cram file
1321     @param getid     Callback to convert target names to target ids.
1322     @return  A region list on success, NULL on failure
1323 
1324     The hts_reglist_t struct returned by a successful call should be freed
1325     via hts_reglist_free() when it is no longer needed.
1326  */
1327 HTSLIB_EXPORT
1328 hts_reglist_t *hts_reglist_create(char **argv, int argc, int *r_count, void *hdr,  hts_name2id_f getid);
1329 
1330 /// Free a region list
1331 /** @param reglist    Region list
1332     @param count      Number of items in the list
1333  */
1334 HTSLIB_EXPORT
1335 void hts_reglist_free(hts_reglist_t *reglist, int count);
1336 
1337 /// Free a multi-region iterator
1338 /** @param iter   Iterator to free
1339  */
1340 #define hts_itr_multi_destroy(iter) hts_itr_destroy(iter)
1341 
1342 
1343     /**
1344      * hts_file_type() - Convenience function to determine file type
1345      * DEPRECATED:  This function has been replaced by hts_detect_format().
1346      * It and these FT_* macros will be removed in a future HTSlib release.
1347      */
1348     #define FT_UNKN   0
1349     #define FT_GZ     1
1350     #define FT_VCF    2
1351     #define FT_VCF_GZ (FT_GZ|FT_VCF)
1352     #define FT_BCF    (1<<2)
1353     #define FT_BCF_GZ (FT_GZ|FT_BCF)
1354     #define FT_STDIN  (1<<3)
1355     HTSLIB_EXPORT
1356     int hts_file_type(const char *fname);
1357 
1358 
1359 /***************************
1360  * Revised MAQ error model *
1361  ***************************/
1362 
1363 struct errmod_t;
1364 typedef struct errmod_t errmod_t;
1365 
1366 HTSLIB_EXPORT
1367 errmod_t *errmod_init(double depcorr);
1368 HTSLIB_EXPORT
1369 void errmod_destroy(errmod_t *em);
1370 
1371 /*
1372     n: number of bases
1373     m: maximum base
1374     bases[i]: qual:6, strand:1, base:4
1375     q[i*m+j]: phred-scaled likelihood of (i,j)
1376  */
1377 HTSLIB_EXPORT
1378 int errmod_cal(const errmod_t *em, int n, int m, uint16_t *bases, float *q);
1379 
1380 
1381 /*****************************************************
1382  * Probabilistic banded glocal alignment             *
1383  * See https://doi.org/10.1093/bioinformatics/btr076 *
1384  *****************************************************/
1385 
1386 typedef struct probaln_par_t {
1387     float d, e;
1388     int bw;
1389 } probaln_par_t;
1390 
1391 /// Perform probabilistic banded glocal alignment
1392 /** @param      ref     Reference sequence
1393     @param      l_ref   Length of reference
1394     @param      query   Query sequence
1395     @param      l_query Length of query sequence
1396     @param      iqual   Query base qualities
1397     @param      c       Alignment parameters
1398     @param[out] state   Output alignment
1399     @param[out] q    Phred scaled posterior probability of state[i] being wrong
1400     @return     Phred-scaled likelihood score, or INT_MIN on failure.
1401 
1402 The reference and query sequences are coded using integers 0,1,2,3,4 for
1403 bases A,C,G,T,N respectively (N here is for any ambiguity code).
1404 
1405 On output, state and q are arrays of length l_query. The higher 30
1406 bits give the reference position the query base is matched to and the
1407 lower two bits can be 0 (an alignment match) or 1 (an
1408 insertion). q[i] gives the phred scaled posterior probability of
1409 state[i] being wrong.
1410 
1411 On failure, errno will be set to EINVAL if the values of l_ref or l_query
1412 were invalid; or ENOMEM if a memory allocation failed.
1413 */
1414 
1415 HTSLIB_EXPORT
1416 int probaln_glocal(const uint8_t *ref, int l_ref, const uint8_t *query, int l_query, const uint8_t *iqual, const probaln_par_t *c, int *state, uint8_t *q);
1417 
1418 
1419     /**********************
1420      * MD5 implementation *
1421      **********************/
1422 
1423     struct hts_md5_context;
1424     typedef struct hts_md5_context hts_md5_context;
1425 
1426     /*! @abstract   Initialises an MD5 context.
1427      *  @discussion
1428      *    The expected use is to allocate an hts_md5_context using
1429      *    hts_md5_init().  This pointer is then passed into one or more calls
1430      *    of hts_md5_update() to compute successive internal portions of the
1431      *    MD5 sum, which can then be externalised as a full 16-byte MD5sum
1432      *    calculation by calling hts_md5_final().  This can then be turned
1433      *    into ASCII via hts_md5_hex().
1434      *
1435      *    To dealloate any resources created by hts_md5_init() call the
1436      *    hts_md5_destroy() function.
1437      *
1438      *  @return     hts_md5_context pointer on success, NULL otherwise.
1439      */
1440     HTSLIB_EXPORT
1441     hts_md5_context *hts_md5_init(void);
1442 
1443     /*! @abstract Updates the context with the MD5 of the data. */
1444     HTSLIB_EXPORT
1445     void hts_md5_update(hts_md5_context *ctx, const void *data, unsigned long size);
1446 
1447     /*! @abstract Computes the final 128-bit MD5 hash from the given context */
1448     HTSLIB_EXPORT
1449     void hts_md5_final(unsigned char *digest, hts_md5_context *ctx);
1450 
1451     /*! @abstract Resets an md5_context to the initial state, as returned
1452      *            by hts_md5_init().
1453      */
1454     HTSLIB_EXPORT
1455     void hts_md5_reset(hts_md5_context *ctx);
1456 
1457     /*! @abstract Converts a 128-bit MD5 hash into a 33-byte nul-termninated
1458      *            hex string.
1459      */
1460     HTSLIB_EXPORT
1461     void hts_md5_hex(char *hex, const unsigned char *digest);
1462 
1463     /*! @abstract Deallocates any memory allocated by hts_md5_init. */
1464     HTSLIB_EXPORT
1465     void hts_md5_destroy(hts_md5_context *ctx);
1466 
hts_reg2bin(hts_pos_t beg,hts_pos_t end,int min_shift,int n_lvls)1467 static inline int hts_reg2bin(hts_pos_t beg, hts_pos_t end, int min_shift, int n_lvls)
1468 {
1469     int l, s = min_shift, t = ((1<<((n_lvls<<1) + n_lvls)) - 1) / 7;
1470     for (--end, l = n_lvls; l > 0; --l, s += 3, t -= 1<<((l<<1)+l))
1471         if (beg>>s == end>>s) return t + (beg>>s);
1472     return 0;
1473 }
1474 
1475 /// Compute the level of a bin in a binning index
hts_bin_level(int bin)1476 static inline int hts_bin_level(int bin) {
1477     int l, b;
1478     for (l = 0, b = bin; b; ++l, b = hts_bin_parent(b));
1479     return l;
1480 }
1481 
1482 //! Compute the corresponding entry into the linear index of a given bin from
1483 //! a binning index
1484 /*!
1485  *  @param bin    The bin number
1486  *  @param n_lvls The index depth (number of levels - 0 based)
1487  *  @return       The integer offset into the linear index
1488  *
1489  *  Explanation of the return value formula:
1490  *  Each bin on level l covers exp(2, (n_lvls - l)*3 + min_shift) base pairs.
1491  *  A linear index entry covers exp(2, min_shift) base pairs.
1492  */
hts_bin_bot(int bin,int n_lvls)1493 static inline int hts_bin_bot(int bin, int n_lvls)
1494 {
1495     int l = hts_bin_level(bin);
1496     return (bin - hts_bin_first(l)) << (n_lvls - l) * 3;
1497 }
1498 
1499 /**************
1500  * Endianness *
1501  **************/
1502 
ed_is_big(void)1503 static inline int ed_is_big(void)
1504 {
1505     long one= 1;
1506     return !(*((char *)(&one)));
1507 }
ed_swap_2(uint16_t v)1508 static inline uint16_t ed_swap_2(uint16_t v)
1509 {
1510     return (uint16_t)(((v & 0x00FF00FFU) << 8) | ((v & 0xFF00FF00U) >> 8));
1511 }
ed_swap_2p(void * x)1512 static inline void *ed_swap_2p(void *x)
1513 {
1514     *(uint16_t*)x = ed_swap_2(*(uint16_t*)x);
1515     return x;
1516 }
ed_swap_4(uint32_t v)1517 static inline uint32_t ed_swap_4(uint32_t v)
1518 {
1519     v = ((v & 0x0000FFFFU) << 16) | (v >> 16);
1520     return ((v & 0x00FF00FFU) << 8) | ((v & 0xFF00FF00U) >> 8);
1521 }
ed_swap_4p(void * x)1522 static inline void *ed_swap_4p(void *x)
1523 {
1524     *(uint32_t*)x = ed_swap_4(*(uint32_t*)x);
1525     return x;
1526 }
ed_swap_8(uint64_t v)1527 static inline uint64_t ed_swap_8(uint64_t v)
1528 {
1529     v = ((v & 0x00000000FFFFFFFFLLU) << 32) | (v >> 32);
1530     v = ((v & 0x0000FFFF0000FFFFLLU) << 16) | ((v & 0xFFFF0000FFFF0000LLU) >> 16);
1531     return ((v & 0x00FF00FF00FF00FFLLU) << 8) | ((v & 0xFF00FF00FF00FF00LLU) >> 8);
1532 }
ed_swap_8p(void * x)1533 static inline void *ed_swap_8p(void *x)
1534 {
1535     *(uint64_t*)x = ed_swap_8(*(uint64_t*)x);
1536     return x;
1537 }
1538 
1539 #ifdef __cplusplus
1540 }
1541 #endif
1542 
1543 #endif
1544