1 /// @file htslib/hts.h
2 /// Format-neutral I/O, indexing, and iterator API functions.
3 /*
4 Copyright (C) 2012-2021 Genome Research Ltd.
5 Copyright (C) 2010, 2012 Broad Institute.
6 Portions copyright (C) 2003-2006, 2008-2010 by Heng Li <lh3@live.co.uk>
7
8 Author: Heng Li <lh3@sanger.ac.uk>
9
10 Permission is hereby granted, free of charge, to any person obtaining a copy
11 of this software and associated documentation files (the "Software"), to deal
12 in the Software without restriction, including without limitation the rights
13 to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
14 copies of the Software, and to permit persons to whom the Software is
15 furnished to do so, subject to the following conditions:
16
17 The above copyright notice and this permission notice shall be included in
18 all copies or substantial portions of the Software.
19
20 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
21 IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
22 FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
23 THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
24 LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
25 FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
26 DEALINGS IN THE SOFTWARE. */
27
28 #ifndef HTSLIB_HTS_H
29 #define HTSLIB_HTS_H
30
31 #include <stddef.h>
32 #include <stdint.h>
33 #include <inttypes.h>
34
35 #include "hts_defs.h"
36 #include "hts_log.h"
37 #include "kstring.h"
38 #include "kroundup.h"
39
40 #ifdef __cplusplus
41 extern "C" {
42 #endif
43
44 // Separator used to split HTS_PATH (for plugins); REF_PATH (cram references)
45 #if defined(_WIN32) || defined(__MSYS__)
46 #define HTS_PATH_SEPARATOR_CHAR ';'
47 #define HTS_PATH_SEPARATOR_STR ";"
48 #else
49 #define HTS_PATH_SEPARATOR_CHAR ':'
50 #define HTS_PATH_SEPARATOR_STR ":"
51 #endif
52
53 #ifndef HTS_BGZF_TYPEDEF
54 typedef struct BGZF BGZF;
55 #define HTS_BGZF_TYPEDEF
56 #endif
57 struct cram_fd;
58 struct hFILE;
59 struct hts_tpool;
60 struct sam_hdr_t;
61
62 /**
63 * @hideinitializer
64 * Deprecated macro to expand a dynamic array of a given type
65 *
66 * @param type_t The type of the array elements
67 * @param[in] n Requested number of elements of type type_t
68 * @param[in,out] m Size of memory allocated
69 * @param[in,out] ptr Pointer to the array
70 *
71 * @discussion
72 * Do not use this macro. Use hts_resize() instead as allows allocation
73 * failures to be handled more gracefully.
74 *
75 * The array *ptr will be expanded if necessary so that it can hold @p n
76 * or more elements. If the array is expanded then the new size will be
77 * written to @p m and the value in @p ptr may change.
78 *
79 * It must be possible to take the address of @p ptr and @p m must be usable
80 * as an lvalue.
81 *
82 * @bug
83 * If the memory allocation fails, this will call exit(1). This is
84 * not ideal behaviour in a library.
85 */
86 #define hts_expand(type_t, n, m, ptr) do { \
87 if ((n) > (m)) { \
88 size_t hts_realloc_or_die(size_t, size_t, size_t, size_t, \
89 int, void **, const char *); \
90 (m) = hts_realloc_or_die((n) >= 1 ? (n) : 1, (m), sizeof(m), \
91 sizeof(type_t), 0, \
92 (void **)&(ptr), __func__); \
93 } \
94 } while (0)
95
96 /**
97 * @hideinitializer
98 * Macro to expand a dynamic array, zeroing any newly-allocated memory
99 *
100 * @param type_t The type of the array elements
101 * @param[in] n Requested number of elements of type type_t
102 * @param[in,out] m Size of memory allocated
103 * @param[in,out] ptr Pointer to the array
104 *
105 * @discussion
106 * Do not use this macro. Use hts_resize() instead as allows allocation
107 * failures to be handled more gracefully.
108 *
109 * As for hts_expand(), except the bytes that make up the array elements
110 * between the old and new values of @p m are set to zero using memset().
111 *
112 * @bug
113 * If the memory allocation fails, this will call exit(1). This is
114 * not ideal behaviour in a library.
115 */
116
117
118 #define hts_expand0(type_t, n, m, ptr) do { \
119 if ((n) > (m)) { \
120 size_t hts_realloc_or_die(size_t, size_t, size_t, size_t, \
121 int, void **, const char *); \
122 (m) = hts_realloc_or_die((n) >= 1 ? (n) : 1, (m), sizeof(m), \
123 sizeof(type_t), 1, \
124 (void **)&(ptr), __func__); \
125 } \
126 } while (0)
127
128 // For internal use (by hts_resize()) only
129 HTSLIB_EXPORT
130 int hts_resize_array_(size_t, size_t, size_t, void *, void **, int,
131 const char *);
132
133 #define HTS_RESIZE_CLEAR 1
134
135 /**
136 * @hideinitializer
137 * Macro to expand a dynamic array of a given type
138 *
139 * @param type_t The type of the array elements
140 * @param[in] num Requested number of elements of type type_t
141 * @param[in,out] size_ptr Pointer to where the size (in elements) of the
142 array is stored.
143 * @param[in,out] ptr Location of the pointer to the array
144 * @param[in] flags Option flags
145 *
146 * @return 0 for success, or negative if an error occurred.
147 *
148 * @discussion
149 * The array *ptr will be expanded if necessary so that it can hold @p num
150 * or more elements. If the array is expanded then the new size will be
151 * written to @p *size_ptr and the value in @p *ptr may change.
152 *
153 * If ( @p flags & HTS_RESIZE_CLEAR ) is set, any newly allocated memory will
154 * be cleared.
155 */
156
157 #define hts_resize(type_t, num, size_ptr, ptr, flags) \
158 ((num) > (*(size_ptr)) \
159 ? hts_resize_array_(sizeof(type_t), (num), \
160 sizeof(*(size_ptr)), (size_ptr), \
161 (void **)(ptr), (flags), __func__) \
162 : 0)
163
164 /// Release resources when dlclosing a dynamically loaded HTSlib
165 /** @discussion
166 * Normally HTSlib cleans up automatically when your program exits,
167 * whether that is via exit(3) or returning from main(). However if you
168 * have dlopen(3)ed HTSlib and wish to close it before your main program
169 * exits, you must call hts_lib_shutdown() before dlclose(3).
170 */
171 HTSLIB_EXPORT
172 void hts_lib_shutdown(void);
173
174 /**
175 * Wrapper function for free(). Enables memory deallocation across DLL
176 * boundary. Should be used by all applications, which are compiled
177 * with a different standard library than htslib and call htslib
178 * methods that return dynamically allocated data.
179 */
180 HTSLIB_EXPORT
181 void hts_free(void *ptr);
182
183 /************
184 * File I/O *
185 ************/
186
187 // Add new entries only at the end (but before the *_maximum entry)
188 // of these enums, as their numbering is part of the htslib ABI.
189
190 enum htsFormatCategory {
191 unknown_category,
192 sequence_data, // Sequence data -- SAM, BAM, CRAM, etc
193 variant_data, // Variant calling data -- VCF, BCF, etc
194 index_file, // Index file associated with some data file
195 region_list, // Coordinate intervals or regions -- BED, etc
196 category_maximum = 32767
197 };
198
199 enum htsExactFormat {
200 unknown_format,
201 binary_format, text_format,
202 sam, bam, bai, cram, crai, vcf, bcf, csi, gzi, tbi, bed,
203 htsget,
204 json HTS_DEPRECATED_ENUM("Use htsExactFormat 'htsget' instead") = htsget,
205 empty_format, // File is empty (or empty after decompression)
206 fasta_format, fastq_format, fai_format, fqi_format,
207 hts_crypt4gh_format,
208 d4_format,
209 format_maximum = 32767
210 };
211
212 enum htsCompression {
213 no_compression, gzip, bgzf, custom, bzip2_compression, razf_compression,
214 xz_compression, zstd_compression,
215 compression_maximum = 32767
216 };
217
218 typedef struct htsFormat {
219 enum htsFormatCategory category;
220 enum htsExactFormat format;
221 struct { short major, minor; } version;
222 enum htsCompression compression;
223 short compression_level; // currently unused
224 void *specific; // format specific options; see struct hts_opt.
225 } htsFormat;
226
227 struct hts_idx_t;
228 typedef struct hts_idx_t hts_idx_t;
229 struct hts_filter_t;
230
231 /**
232 * @brief File handle returned by hts_open() etc.
233 * This structure should be considered opaque by end users. There should be
234 * no need to access most fields directly in user code, and in cases where
235 * it is desirable accessor functions such as hts_get_format() are provided.
236 */
237 // Maintainers note htsFile cannot be an incomplete struct because some of its
238 // fields are part of libhts.so's ABI (hence these fields must not be moved):
239 // - fp is used in the public sam_itr_next()/etc macros
240 // - is_bin is used directly in samtools <= 1.1 and bcftools <= 1.1
241 // - is_write and is_cram are used directly in samtools <= 1.1
242 // - fp is used directly in samtools (up to and including current develop)
243 // - line is used directly in bcftools (up to and including current develop)
244 // - is_bgzf and is_cram flags indicate which fp union member to use.
245 // Note is_bgzf being set does not indicate the flag is BGZF compressed,
246 // nor even whether it is compressed at all (eg on naked BAMs).
247 typedef struct htsFile {
248 uint32_t is_bin:1, is_write:1, is_be:1, is_cram:1, is_bgzf:1, dummy:27;
249 int64_t lineno;
250 kstring_t line;
251 char *fn, *fn_aux;
252 union {
253 BGZF *bgzf;
254 struct cram_fd *cram;
255 struct hFILE *hfile;
256 } fp;
257 void *state; // format specific state information
258 htsFormat format;
259 hts_idx_t *idx;
260 const char *fnidx;
261 struct sam_hdr_t *bam_header;
262 struct hts_filter_t *filter;
263 } htsFile;
264
265 // A combined thread pool and queue allocation size.
266 // The pool should already be defined, but qsize may be zero to
267 // indicate an appropriate queue size is taken from the pool.
268 //
269 // Reasons for explicitly setting it could be where many more file
270 // descriptors are in use than threads, so keeping memory low is
271 // important.
272 typedef struct htsThreadPool {
273 struct hts_tpool *pool; // The shared thread pool itself
274 int qsize; // Size of I/O queue to use for this fp
275 } htsThreadPool;
276
277 // REQUIRED_FIELDS
278 enum sam_fields {
279 SAM_QNAME = 0x00000001,
280 SAM_FLAG = 0x00000002,
281 SAM_RNAME = 0x00000004,
282 SAM_POS = 0x00000008,
283 SAM_MAPQ = 0x00000010,
284 SAM_CIGAR = 0x00000020,
285 SAM_RNEXT = 0x00000040,
286 SAM_PNEXT = 0x00000080,
287 SAM_TLEN = 0x00000100,
288 SAM_SEQ = 0x00000200,
289 SAM_QUAL = 0x00000400,
290 SAM_AUX = 0x00000800,
291 SAM_RGAUX = 0x00001000,
292 };
293
294 // Mostly CRAM only, but this could also include other format options
295 enum hts_fmt_option {
296 // CRAM specific
297 CRAM_OPT_DECODE_MD,
298 CRAM_OPT_PREFIX,
299 CRAM_OPT_VERBOSITY, // obsolete, use hts_set_log_level() instead
300 CRAM_OPT_SEQS_PER_SLICE,
301 CRAM_OPT_SLICES_PER_CONTAINER,
302 CRAM_OPT_RANGE,
303 CRAM_OPT_VERSION, // rename to cram_version?
304 CRAM_OPT_EMBED_REF,
305 CRAM_OPT_IGNORE_MD5,
306 CRAM_OPT_REFERENCE, // make general
307 CRAM_OPT_MULTI_SEQ_PER_SLICE,
308 CRAM_OPT_NO_REF,
309 CRAM_OPT_USE_BZIP2,
310 CRAM_OPT_SHARED_REF,
311 CRAM_OPT_NTHREADS, // deprecated, use HTS_OPT_NTHREADS
312 CRAM_OPT_THREAD_POOL,// make general
313 CRAM_OPT_USE_LZMA,
314 CRAM_OPT_USE_RANS,
315 CRAM_OPT_REQUIRED_FIELDS,
316 CRAM_OPT_LOSSY_NAMES,
317 CRAM_OPT_BASES_PER_SLICE,
318 CRAM_OPT_STORE_MD,
319 CRAM_OPT_STORE_NM,
320 CRAM_OPT_RANGE_NOSEEK, // CRAM_OPT_RANGE minus the seek
321 CRAM_OPT_USE_TOK,
322 CRAM_OPT_USE_FQZ,
323 CRAM_OPT_USE_ARITH,
324 CRAM_OPT_POS_DELTA, // force delta for AP, even on non-pos sorted data
325
326 // General purpose
327 HTS_OPT_COMPRESSION_LEVEL = 100,
328 HTS_OPT_NTHREADS,
329 HTS_OPT_THREAD_POOL,
330 HTS_OPT_CACHE_SIZE,
331 HTS_OPT_BLOCK_SIZE,
332 HTS_OPT_FILTER,
333 HTS_OPT_PROFILE,
334
335 // Fastq
336
337 // Boolean.
338 // Read / Write CASAVA 1.8 format.
339 // See https://emea.support.illumina.com/content/dam/illumina-support/documents/documentation/software_documentation/bcl2fastq/bcl2fastq_letterbooklet_15038058brpmi.pdf
340 //
341 // The CASAVA tag matches \d:[YN]:\d+:[ACGTN]+
342 // The first \d is read 1/2 (1 or 2), [YN] is QC-PASS/FAIL flag,
343 // \d+ is a control number, and the sequence at the end is
344 // for barcode sequence. Barcodes are read into the aux tag defined
345 // by FASTQ_OPT_BARCODE ("BC" by default).
346 FASTQ_OPT_CASAVA = 1000,
347
348 // String.
349 // Whether to read / write extra SAM format aux tags from the fastq
350 // identifier line. For reading this can simply be "1" to request
351 // decoding aux tags. For writing it is a comma separated list of aux
352 // tag types to be written out.
353 FASTQ_OPT_AUX,
354
355 // Boolean.
356 // Whether to add /1 and /2 to read identifiers when writing FASTQ.
357 // These come from the BAM_FREAD1 or BAM_FREAD2 flags.
358 // (Detecting the /1 and /2 is automatic when reading fastq.)
359 FASTQ_OPT_RNUM,
360
361 // Two character string.
362 // Barcode aux tag for CASAVA; defaults to "BC".
363 FASTQ_OPT_BARCODE,
364
365 // Process SRA and ENA read names which pointlessly move the original
366 // name to the second field and insert a constructed <run>.<number>
367 // name in its place.
368 FASTQ_OPT_NAME2,
369 };
370
371 // Profile options for encoding; primarily used at present in CRAM
372 // but also usable in BAM as a synonym for deflate compression levels.
373 enum hts_profile_option {
374 HTS_PROFILE_FAST,
375 HTS_PROFILE_NORMAL,
376 HTS_PROFILE_SMALL,
377 HTS_PROFILE_ARCHIVE,
378 };
379
380 // For backwards compatibility
381 #define cram_option hts_fmt_option
382
383 typedef struct hts_opt {
384 char *arg; // string form, strdup()ed
385 enum hts_fmt_option opt; // tokenised key
386 union { // ... and value
387 int i;
388 char *s;
389 } val;
390 struct hts_opt *next;
391 } hts_opt;
392
393 #define HTS_FILE_OPTS_INIT {{0},0}
394
395 /*
396 * Explicit index file name delimiter, see below
397 */
398 #define HTS_IDX_DELIM "##idx##"
399
400
401 /**********************
402 * Exported functions *
403 **********************/
404
405 /*
406 * Parses arg and appends it to the option list.
407 *
408 * Returns 0 on success;
409 * -1 on failure.
410 */
411 HTSLIB_EXPORT
412 int hts_opt_add(hts_opt **opts, const char *c_arg);
413
414 /*
415 * Applies an hts_opt option list to a given htsFile.
416 *
417 * Returns 0 on success
418 * -1 on failure
419 */
420 HTSLIB_EXPORT
421 int hts_opt_apply(htsFile *fp, hts_opt *opts);
422
423 /*
424 * Frees an hts_opt list.
425 */
426 HTSLIB_EXPORT
427 void hts_opt_free(hts_opt *opts);
428
429 /*
430 * Accepts a string file format (sam, bam, cram, vcf, bam) optionally
431 * followed by a comma separated list of key=value options and splits
432 * these up into the fields of htsFormat struct.
433 *
434 * Returns 0 on success
435 * -1 on failure.
436 */
437 HTSLIB_EXPORT
438 int hts_parse_format(htsFormat *opt, const char *str);
439
440 /*
441 * Tokenise options as (key(=value)?,)*(key(=value)?)?
442 * NB: No provision for ',' appearing in the value!
443 * Add backslashing rules?
444 *
445 * This could be used as part of a general command line option parser or
446 * as a string concatenated onto the file open mode.
447 *
448 * Returns 0 on success
449 * -1 on failure.
450 */
451 HTSLIB_EXPORT
452 int hts_parse_opt_list(htsFormat *opt, const char *str);
453
454 /*! @abstract Table for converting a nucleotide character to 4-bit encoding.
455 The input character may be either an IUPAC ambiguity code, '=' for 0, or
456 '0'/'1'/'2'/'3' for a result of 1/2/4/8. The result is encoded as 1/2/4/8
457 for A/C/G/T or combinations of these bits for ambiguous bases.
458 */
459 extern const unsigned char seq_nt16_table[256];
460
461 /*! @abstract Table for converting a 4-bit encoded nucleotide to an IUPAC
462 ambiguity code letter (or '=' when given 0).
463 */
464 extern const char seq_nt16_str[];
465
466 /*! @abstract Table for converting a 4-bit encoded nucleotide to about 2 bits.
467 Returns 0/1/2/3 for 1/2/4/8 (i.e., A/C/G/T), or 4 otherwise (0 or ambiguous).
468 */
469 extern const int seq_nt16_int[];
470
471 /*!
472 @abstract Get the htslib version number
473 @return For released versions, a string like "N.N[.N]"; or git describe
474 output if using a library built within a Git repository.
475 */
476 HTSLIB_EXPORT
477 const char *hts_version(void);
478
479 /*!
480 @abstract Compile-time HTSlib version number, for use in #if checks
481 @return For released versions X.Y[.Z], an integer of the form XYYYZZ;
482 useful for preprocessor conditionals such as
483 #if HTS_VERSION >= 101000 // Check for v1.10 or later
484 */
485 // Maintainers: Bump this in the final stage of preparing a new release.
486 // Immediately after release, bump ZZ to 90 to distinguish in-development
487 // Git repository builds from the release; you may wish to increment this
488 // further when significant features are merged.
489 #define HTS_VERSION 101400
490
491 /*! @abstract Introspection on the features enabled in htslib
492 *
493 * @return a bitfield of HTS_FEATURE_* macros.
494 */
495 HTSLIB_EXPORT
496 unsigned int hts_features(void);
497
498 HTSLIB_EXPORT
499 const char *hts_test_feature(unsigned int id);
500
501 /*! @abstract Introspection on the features enabled in htslib, string form
502 *
503 * @return a string describing htslib build features
504 */
505 HTSLIB_EXPORT
506 const char *hts_feature_string(void);
507
508 // Whether ./configure was used or vanilla Makefile
509 #define HTS_FEATURE_CONFIGURE 1
510
511 // Whether --enable-plugins was used
512 #define HTS_FEATURE_PLUGINS 2
513
514 // Transport specific
515 #define HTS_FEATURE_LIBCURL (1u<<10)
516 #define HTS_FEATURE_S3 (1u<<11)
517 #define HTS_FEATURE_GCS (1u<<12)
518
519 // Compression options
520 #define HTS_FEATURE_LIBDEFLATE (1u<<20)
521 #define HTS_FEATURE_LZMA (1u<<21)
522 #define HTS_FEATURE_BZIP2 (1u<<22)
523 #define HTS_FEATURE_HTSCODECS (1u<<23) // htscodecs library version
524
525 // Build params
526 #define HTS_FEATURE_CC (1u<<27)
527 #define HTS_FEATURE_CFLAGS (1u<<28)
528 #define HTS_FEATURE_CPPFLAGS (1u<<29)
529 #define HTS_FEATURE_LDFLAGS (1u<<30)
530
531
532 /*!
533 @abstract Determine format by peeking at the start of a file
534 @param fp File opened for reading, positioned at the beginning
535 @param fmt Format structure that will be filled out on return
536 @return 0 for success, or negative if an error occurred.
537 */
538 HTSLIB_EXPORT
539 int hts_detect_format(struct hFILE *fp, htsFormat *fmt);
540
541 /*!
542 @abstract Get a human-readable description of the file format
543 @param fmt Format structure holding type, version, compression, etc.
544 @return Description string, to be freed by the caller after use.
545 */
546 HTSLIB_EXPORT
547 char *hts_format_description(const htsFormat *format);
548
549 /*!
550 @abstract Open a sequence data (SAM/BAM/CRAM) or variant data (VCF/BCF)
551 or possibly-compressed textual line-orientated file
552 @param fn The file name or "-" for stdin/stdout. For indexed files
553 with a non-standard naming, the file name can include the
554 name of the index file delimited with HTS_IDX_DELIM
555 @param mode Mode matching / [rwa][bcefFguxz0-9]* /
556 @discussion
557 With 'r' opens for reading; any further format mode letters are ignored
558 as the format is detected by checking the first few bytes or BGZF blocks
559 of the file. With 'w' or 'a' opens for writing or appending, with format
560 specifier letters:
561 b binary format (BAM, BCF, etc) rather than text (SAM, VCF, etc)
562 c CRAM format
563 f FASTQ format
564 F FASTA format
565 g gzip compressed
566 u uncompressed
567 z bgzf compressed
568 [0-9] zlib compression level
569 and with non-format option letters (for any of 'r'/'w'/'a'):
570 e close the file on exec(2) (opens with O_CLOEXEC, where supported)
571 x create the file exclusively (opens with O_EXCL, where supported)
572 Note that there is a distinction between 'u' and '0': the first yields
573 plain uncompressed output whereas the latter outputs uncompressed data
574 wrapped in the zlib format.
575 @example
576 [rw]b .. compressed BCF, BAM, FAI
577 [rw]bu .. uncompressed BCF
578 [rw]z .. compressed VCF
579 [rw] .. uncompressed VCF
580 */
581 HTSLIB_EXPORT
582 htsFile *hts_open(const char *fn, const char *mode);
583
584 /*!
585 @abstract Open a SAM/BAM/CRAM/VCF/BCF/etc file
586 @param fn The file name or "-" for stdin/stdout
587 @param mode Open mode, as per hts_open()
588 @param fmt Optional format specific parameters
589 @discussion
590 See hts_open() for description of fn and mode.
591 // TODO Update documentation for s/opts/fmt/
592 Opts contains a format string (sam, bam, cram, vcf, bcf) which will,
593 if defined, override mode. Opts also contains a linked list of hts_opt
594 structures to apply to the open file handle. These can contain things
595 like pointers to the reference or information on compression levels,
596 block sizes, etc.
597 */
598 HTSLIB_EXPORT
599 htsFile *hts_open_format(const char *fn, const char *mode, const htsFormat *fmt);
600
601 /*!
602 @abstract Open an existing stream as a SAM/BAM/CRAM/VCF/BCF/etc file
603 @param fn The already-open file handle
604 @param mode Open mode, as per hts_open()
605 */
606 HTSLIB_EXPORT
607 htsFile *hts_hopen(struct hFILE *fp, const char *fn, const char *mode);
608
609 /*!
610 @abstract For output streams, flush any buffered data
611 @param fp The file handle to be flushed
612 @return 0 for success, or negative if an error occurred.
613 @since 1.14
614 */
615 HTSLIB_EXPORT
616 int hts_flush(htsFile *fp);
617
618 /*!
619 @abstract Close a file handle, flushing buffered data for output streams
620 @param fp The file handle to be closed
621 @return 0 for success, or negative if an error occurred.
622 */
623 HTSLIB_EXPORT
624 int hts_close(htsFile *fp);
625
626 /*!
627 @abstract Returns the file's format information
628 @param fp The file handle
629 @return Read-only pointer to the file's htsFormat.
630 */
631 HTSLIB_EXPORT
632 const htsFormat *hts_get_format(htsFile *fp);
633
634 /*!
635 @ abstract Returns a string containing the file format extension.
636 @ param format Format structure containing the file type.
637 @ return A string ("sam", "bam", etc) or "?" for unknown formats.
638 */
639 HTSLIB_EXPORT
640 const char *hts_format_file_extension(const htsFormat *format);
641
642 /*!
643 @abstract Sets a specified CRAM option on the open file handle.
644 @param fp The file handle open the open file.
645 @param opt The CRAM_OPT_* option.
646 @param ... Optional arguments, dependent on the option used.
647 @return 0 for success, or negative if an error occurred.
648 */
649 HTSLIB_EXPORT
650 int hts_set_opt(htsFile *fp, enum hts_fmt_option opt, ...);
651
652 /*!
653 @abstract Read a line (and its \n or \r\n terminator) from a file
654 @param fp The file handle
655 @param delimiter Unused, but must be '\n' (or KS_SEP_LINE)
656 @param str The line (not including the terminator) is written here
657 @return Length of the string read;
658 -1 on end-of-file; <= -2 on error
659 */
660 HTSLIB_EXPORT
661 int hts_getline(htsFile *fp, int delimiter, kstring_t *str);
662
663 HTSLIB_EXPORT
664 char **hts_readlines(const char *fn, int *_n);
665 /*!
666 @abstract Parse comma-separated list or read list from a file
667 @param list File name or comma-separated list
668 @param is_file
669 @param _n Size of the output array (number of items read)
670 @return NULL on failure or pointer to newly allocated array of
671 strings
672 */
673 HTSLIB_EXPORT
674 char **hts_readlist(const char *fn, int is_file, int *_n);
675
676 /*!
677 @abstract Create extra threads to aid compress/decompression for this file
678 @param fp The file handle
679 @param n The number of worker threads to create
680 @return 0 for success, or negative if an error occurred.
681 @notes This function creates non-shared threads for use solely by fp.
682 The hts_set_thread_pool function is the recommended alternative.
683 */
684 HTSLIB_EXPORT
685 int hts_set_threads(htsFile *fp, int n);
686
687 /*!
688 @abstract Create extra threads to aid compress/decompression for this file
689 @param fp The file handle
690 @param p A pool of worker threads, previously allocated by hts_create_threads().
691 @return 0 for success, or negative if an error occurred.
692 */
693 HTSLIB_EXPORT
694 int hts_set_thread_pool(htsFile *fp, htsThreadPool *p);
695
696 /*!
697 @abstract Adds a cache of decompressed blocks, potentially speeding up seeks.
698 This may not work for all file types (currently it is bgzf only).
699 @param fp The file handle
700 @param n The size of cache, in bytes
701 */
702 HTSLIB_EXPORT
703 void hts_set_cache_size(htsFile *fp, int n);
704
705 /*!
706 @abstract Set .fai filename for a file opened for reading
707 @return 0 for success, negative on failure
708 @discussion
709 Called before *_hdr_read(), this provides the name of a .fai file
710 used to provide a reference list if the htsFile contains no @SQ headers.
711 */
712 HTSLIB_EXPORT
713 int hts_set_fai_filename(htsFile *fp, const char *fn_aux);
714
715
716 /*!
717 @abstract Sets a filter expression
718 @return 0 for success, negative on failure
719 @discussion
720 To clear an existing filter, specifying expr as NULL.
721 */
722 HTSLIB_EXPORT
723 int hts_set_filter_expression(htsFile *fp, const char *expr);
724
725 /*!
726 @abstract Determine whether a given htsFile contains a valid EOF block
727 @return 3 for a non-EOF checkable filetype;
728 2 for an unseekable file type where EOF cannot be checked;
729 1 for a valid EOF block;
730 0 for if the EOF marker is absent when it should be present;
731 -1 (with errno set) on failure
732 @discussion
733 Check if the BGZF end-of-file (EOF) marker is present
734 */
735 HTSLIB_EXPORT
736 int hts_check_EOF(htsFile *fp);
737
738 /************
739 * Indexing *
740 ************/
741
742 /*!
743 These HTS_IDX_* macros are used as special tid values for hts_itr_query()/etc,
744 producing iterators operating as follows:
745 - HTS_IDX_NOCOOR iterates over unmapped reads sorted at the end of the file
746 - HTS_IDX_START iterates over the entire file
747 - HTS_IDX_REST iterates from the current position to the end of the file
748 - HTS_IDX_NONE always returns "no more alignment records"
749 When one of these special tid values is used, beg and end are ignored.
750 When REST or NONE is used, idx is also ignored and may be NULL.
751 */
752 #define HTS_IDX_NOCOOR (-2)
753 #define HTS_IDX_START (-3)
754 #define HTS_IDX_REST (-4)
755 #define HTS_IDX_NONE (-5)
756
757 #define HTS_FMT_CSI 0
758 #define HTS_FMT_BAI 1
759 #define HTS_FMT_TBI 2
760 #define HTS_FMT_CRAI 3
761 #define HTS_FMT_FAI 4
762
763 // Almost INT64_MAX, but when cast into a 32-bit int it's
764 // also INT_MAX instead of -1. This avoids bugs with old code
765 // using the new hts_pos_t data type.
766 #define HTS_POS_MAX ((((int64_t)INT_MAX)<<32)|INT_MAX)
767 #define HTS_POS_MIN INT64_MIN
768 #define PRIhts_pos PRId64
769 typedef int64_t hts_pos_t;
770
771 // For comparison with previous release:
772 //
773 // #define HTS_POS_MAX INT_MAX
774 // #define HTS_POS_MIN INT_MIN
775 // #define PRIhts_pos PRId32
776 // typedef int32_t hts_pos_t;
777
778 typedef struct hts_pair_pos_t {
779 hts_pos_t beg, end;
780 } hts_pair_pos_t;
781
782 typedef hts_pair_pos_t hts_pair32_t; // For backwards compatibility
783
784 typedef struct hts_pair64_t {
785 uint64_t u, v;
786 } hts_pair64_t;
787
788 typedef struct hts_pair64_max_t {
789 uint64_t u, v;
790 uint64_t max;
791 } hts_pair64_max_t;
792
793 typedef struct hts_reglist_t {
794 const char *reg;
795 hts_pair_pos_t *intervals;
796 int tid;
797 uint32_t count;
798 hts_pos_t min_beg, max_end;
799 } hts_reglist_t;
800
801 typedef int hts_readrec_func(BGZF *fp, void *data, void *r, int *tid, hts_pos_t *beg, hts_pos_t *end);
802 typedef int hts_seek_func(void *fp, int64_t offset, int where);
803 typedef int64_t hts_tell_func(void *fp);
804
805 /**
806 * @brief File iterator that can handle multiple target regions.
807 * This structure should be considered opaque by end users.
808 * It does both the stepping inside the file and the filtering of alignments.
809 * It can operate in single or multi-region mode, and depending on this,
810 * it uses different fields.
811 *
812 * read_rest (1) - read everything from the current offset, without filtering
813 * finished (1) - no more iterations
814 * is_cram (1) - current file has CRAM format
815 * nocoor (1) - read all unmapped reads
816 *
817 * multi (1) - multi-region moode
818 * reg_list - List of target regions
819 * n_reg - Size of the above list
820 * curr_reg - List index of the current region of search
821 * curr_intv - Interval index inside the current region; points to a (beg, end)
822 * end - Used for CRAM files, to preserve the max end coordinate
823 *
824 * multi (0) - single-region mode
825 * tid - Reference id of the target region
826 * beg - Start position of the target region
827 * end - End position of the target region
828 *
829 * Common fields:
830 * off - List of file offsets computed from the index
831 * n_off - Size of the above list
832 * i - List index of the current file offset
833 * curr_off - File offset for the next file read
834 * curr_tid - Reference id of the current alignment
835 * curr_beg - Start position of the current alignment
836 * curr_end - End position of the current alignment
837 * nocoor_off - File offset where the unmapped reads start
838 *
839 * readrec - File specific function that reads an alignment
840 * seek - File specific function for changing the file offset
841 * tell - File specific function for indicating the file offset
842 */
843
844 typedef struct hts_itr_t {
845 uint32_t read_rest:1, finished:1, is_cram:1, nocoor:1, multi:1, dummy:27;
846 int tid, n_off, i, n_reg;
847 hts_pos_t beg, end;
848 hts_reglist_t *reg_list;
849 int curr_tid, curr_reg, curr_intv;
850 hts_pos_t curr_beg, curr_end;
851 uint64_t curr_off, nocoor_off;
852 hts_pair64_max_t *off;
853 hts_readrec_func *readrec;
854 hts_seek_func *seek;
855 hts_tell_func *tell;
856 struct {
857 int n, m;
858 int *a;
859 } bins;
860 } hts_itr_t;
861
862 typedef hts_itr_t hts_itr_multi_t;
863
864 /// Compute the first bin on a given level
865 #define hts_bin_first(l) (((1<<(((l)<<1) + (l))) - 1) / 7)
866 /// Compute the parent bin of a given bin
867 #define hts_bin_parent(b) (((b) - 1) >> 3)
868
869 ///////////////////////////////////////////////////////////
870 // Low-level API for building indexes.
871
872 /// Create a BAI/CSI/TBI type index structure
873 /** @param n Initial number of targets
874 @param fmt Format, one of HTS_FMT_CSI, HTS_FMT_BAI or HTS_FMT_TBI
875 @param offset0 Initial file offset
876 @param min_shift Number of bits for the minimal interval
877 @param n_lvls Number of levels in the binning index
878 @return An initialised hts_idx_t struct on success; NULL on failure
879
880 The struct returned by a successful call should be freed via hts_idx_destroy()
881 when it is no longer needed.
882 */
883 HTSLIB_EXPORT
884 hts_idx_t *hts_idx_init(int n, int fmt, uint64_t offset0, int min_shift, int n_lvls);
885
886 /// Free a BAI/CSI/TBI type index
887 /** @param idx Index structure to free
888 */
889 HTSLIB_EXPORT
890 void hts_idx_destroy(hts_idx_t *idx);
891
892 /// Push an index entry
893 /** @param idx Index
894 @param tid Target id
895 @param beg Range start (zero-based)
896 @param end Range end (zero-based, half-open)
897 @param offset File offset
898 @param is_mapped Range corresponds to a mapped read
899 @return 0 on success; -1 on failure
900
901 The @p is_mapped parameter is used to update the n_mapped / n_unmapped counts
902 stored in the meta-data bin.
903 */
904 HTSLIB_EXPORT
905 int hts_idx_push(hts_idx_t *idx, int tid, hts_pos_t beg, hts_pos_t end, uint64_t offset, int is_mapped);
906
907 /// Finish building an index
908 /** @param idx Index
909 @param final_offset Last file offset
910 @return 0 on success; non-zero on failure.
911 */
912 HTSLIB_EXPORT
913 int hts_idx_finish(hts_idx_t *idx, uint64_t final_offset);
914
915 /// Returns index format
916 /** @param idx Index
917 @return One of HTS_FMT_CSI, HTS_FMT_BAI or HTS_FMT_TBI
918 */
919 HTSLIB_EXPORT
920 int hts_idx_fmt(hts_idx_t *idx);
921
922 /// Add name to TBI index meta-data
923 /** @param idx Index
924 @param tid Target identifier
925 @param name Target name
926 @return Index number of name in names list on success; -1 on failure.
927 */
928 HTSLIB_EXPORT
929 int hts_idx_tbi_name(hts_idx_t *idx, int tid, const char *name);
930
931 // Index loading and saving
932
933 /// Save an index to a file
934 /** @param idx Index to be written
935 @param fn Input BAM/BCF/etc filename, to which .bai/.csi/etc will be added
936 @param fmt One of the HTS_FMT_* index formats
937 @return 0 if successful, or negative if an error occurred.
938 */
939 HTSLIB_EXPORT
940 int hts_idx_save(const hts_idx_t *idx, const char *fn, int fmt) HTS_RESULT_USED;
941
942 /// Save an index to a specific file
943 /** @param idx Index to be written
944 @param fn Input BAM/BCF/etc filename
945 @param fnidx Output filename, or NULL to add .bai/.csi/etc to @a fn
946 @param fmt One of the HTS_FMT_* index formats
947 @return 0 if successful, or negative if an error occurred.
948 */
949 HTSLIB_EXPORT
950 int hts_idx_save_as(const hts_idx_t *idx, const char *fn, const char *fnidx, int fmt) HTS_RESULT_USED;
951
952 /// Load an index file
953 /** @param fn BAM/BCF/etc filename, to which .bai/.csi/etc will be added or
954 the extension substituted, to search for an existing index file.
955 In case of a non-standard naming, the file name can include the
956 name of the index file delimited with HTS_IDX_DELIM.
957 @param fmt One of the HTS_FMT_* index formats
958 @return The index, or NULL if an error occurred.
959
960 If @p fn contains the string "##idx##" (HTS_IDX_DELIM), the part before
961 the delimiter will be used as the name of the data file and the part after
962 it will be used as the name of the index.
963
964 Otherwise, this function tries to work out the index name as follows:
965
966 It will try appending ".csi" to @p fn
967 It will try substituting an existing suffix (e.g. .bam, .vcf) with ".csi"
968 Then, if @p fmt is HTS_FMT_BAI:
969 It will try appending ".bai" to @p fn
970 To will substituting the existing suffix (e.g. .bam) with ".bai"
971 else if @p fmt is HTS_FMT_TBI:
972 It will try appending ".tbi" to @p fn
973 To will substituting the existing suffix (e.g. .vcf) with ".tbi"
974
975 If the index file is remote (served over a protocol like https), first a check
976 is made to see is a locally cached copy is available. This is done for all
977 of the possible names listed above. If a cached copy is not available then
978 the index will be downloaded and stored in the current working directory,
979 with the same name as the remote index.
980
981 Equivalent to hts_idx_load3(fn, NULL, fmt, HTS_IDX_SAVE_REMOTE);
982 */
983 HTSLIB_EXPORT
984 hts_idx_t *hts_idx_load(const char *fn, int fmt);
985
986 /// Load a specific index file
987 /** @param fn Input BAM/BCF/etc filename
988 @param fnidx The input index filename
989 @return The index, or NULL if an error occurred.
990
991 Equivalent to hts_idx_load3(fn, fnidx, 0, 0);
992
993 This function will not attempt to save index files locally.
994 */
995 HTSLIB_EXPORT
996 hts_idx_t *hts_idx_load2(const char *fn, const char *fnidx);
997
998 /// Load a specific index file
999 /** @param fn Input BAM/BCF/etc filename
1000 @param fnidx The input index filename
1001 @param fmt One of the HTS_FMT_* index formats
1002 @param flags Flags to alter behaviour (see description)
1003 @return The index, or NULL if an error occurred.
1004
1005 If @p fnidx is NULL, the index name will be derived from @p fn in the
1006 same way as hts_idx_load().
1007
1008 If @p fnidx is not NULL, @p fmt is ignored.
1009
1010 The @p flags parameter can be set to a combination of the following
1011 values:
1012
1013 HTS_IDX_SAVE_REMOTE Save a local copy of any remote indexes
1014 HTS_IDX_SILENT_FAIL Fail silently if the index is not present
1015
1016 The index struct returned by a successful call should be freed
1017 via hts_idx_destroy() when it is no longer needed.
1018 */
1019 HTSLIB_EXPORT
1020 hts_idx_t *hts_idx_load3(const char *fn, const char *fnidx, int fmt, int flags);
1021
1022 /// Flags for hts_idx_load3() ( and also sam_idx_load3(), tbx_idx_load3() )
1023 #define HTS_IDX_SAVE_REMOTE 1
1024 #define HTS_IDX_SILENT_FAIL 2
1025
1026 ///////////////////////////////////////////////////////////
1027 // Functions for accessing meta-data stored in indexes
1028
1029 typedef const char *(*hts_id2name_f)(void*, int);
1030
1031 /// Get extra index meta-data
1032 /** @param idx The index
1033 @param l_meta Pointer to where the length of the extra data is stored
1034 @return Pointer to the extra data if present; NULL otherwise
1035
1036 Indexes (both .tbi and .csi) made by tabix include extra data about
1037 the indexed file. The returns a pointer to this data. Note that the
1038 data is stored exactly as it is in the index. Callers need to interpret
1039 the results themselves, including knowing what sort of data to expect;
1040 byte swapping etc.
1041 */
1042 HTSLIB_EXPORT
1043 uint8_t *hts_idx_get_meta(hts_idx_t *idx, uint32_t *l_meta);
1044
1045 /// Set extra index meta-data
1046 /** @param idx The index
1047 @param l_meta Length of data
1048 @param meta Pointer to the extra data
1049 @param is_copy If not zero, a copy of the data is taken
1050 @return 0 on success; -1 on failure (out of memory).
1051
1052 Sets the data that is returned by hts_idx_get_meta().
1053
1054 If is_copy != 0, a copy of the input data is taken. If not, ownership of
1055 the data pointed to by *meta passes to the index.
1056 */
1057 HTSLIB_EXPORT
1058 int hts_idx_set_meta(hts_idx_t *idx, uint32_t l_meta, uint8_t *meta, int is_copy);
1059
1060 /// Get number of mapped and unmapped reads from an index
1061 /** @param idx Index
1062 @param tid Target ID
1063 @param[out] mapped Location to store number of mapped reads
1064 @param[out] unmapped Location to store number of unmapped reads
1065 @return 0 on success; -1 on failure (data not available)
1066
1067 BAI and CSI indexes store information on the number of reads for each
1068 target that were mapped or unmapped (unmapped reads will generally have
1069 a paired read that is mapped to the target). This function returns this
1070 information if it is available.
1071
1072 @note Cram CRAI indexes do not include this information.
1073 */
1074 HTSLIB_EXPORT
1075 int hts_idx_get_stat(const hts_idx_t* idx, int tid, uint64_t* mapped, uint64_t* unmapped);
1076
1077 /// Return the number of unplaced reads from an index
1078 /** @param idx Index
1079 @return Unplaced reads count
1080
1081 Unplaced reads are not linked to any reference (e.g. RNAME is '*' in SAM
1082 files).
1083 */
1084 HTSLIB_EXPORT
1085 uint64_t hts_idx_get_n_no_coor(const hts_idx_t* idx);
1086
1087 /// Return a list of target names from an index
1088 /** @param idx Index
1089 @param[out] n Location to store the number of targets
1090 @param getid Callback function to get the name for a target ID
1091 @param hdr Header from indexed file
1092 @return An array of pointers to the names on success; NULL on failure
1093
1094 @note The names are pointers into the header data structure. When cleaning
1095 up, only the array should be freed, not the names.
1096 */
1097 HTSLIB_EXPORT
1098 const char **hts_idx_seqnames(const hts_idx_t *idx, int *n, hts_id2name_f getid, void *hdr); // free only the array, not the values
1099
1100 /// Return the number of targets from an index
1101 /** @param idx Index
1102 @return The number of targets
1103 */
1104 HTSLIB_EXPORT
1105 int hts_idx_nseq(const hts_idx_t *idx);
1106
1107 ///////////////////////////////////////////////////////////
1108 // Region parsing
1109
1110 #define HTS_PARSE_THOUSANDS_SEP 1 ///< Ignore ',' separators within numbers
1111 #define HTS_PARSE_ONE_COORD 2 ///< chr:pos means chr:pos-pos and not chr:pos-end
1112 #define HTS_PARSE_LIST 4 ///< Expect a comma separated list of regions. (Disables HTS_PARSE_THOUSANDS_SEP)
1113
1114 /// Parse a numeric string
1115 /** The number may be expressed in scientific notation, and optionally may
1116 contain commas in the integer part (before any decimal point or E notation).
1117 @param str String to be parsed
1118 @param strend If non-NULL, set on return to point to the first character
1119 in @a str after those forming the parsed number
1120 @param flags Or'ed-together combination of HTS_PARSE_* flags
1121 @return Converted value of the parsed number.
1122
1123 When @a strend is NULL, a warning will be printed (if hts_verbose is HTS_LOG_WARNING
1124 or more) if there are any trailing characters after the number.
1125 */
1126 HTSLIB_EXPORT
1127 long long hts_parse_decimal(const char *str, char **strend, int flags);
1128
1129 typedef int (*hts_name2id_f)(void*, const char*);
1130
1131 /// Parse a "CHR:START-END"-style region string
1132 /** @param str String to be parsed
1133 @param beg Set on return to the 0-based start of the region
1134 @param end Set on return to the 1-based end of the region
1135 @return Pointer to the colon or '\0' after the reference sequence name,
1136 or NULL if @a str could not be parsed.
1137
1138 NOTE: For compatibility with hts_parse_reg only.
1139 Please use hts_parse_region instead.
1140 */
1141 HTSLIB_EXPORT
1142 const char *hts_parse_reg64(const char *str, hts_pos_t *beg, hts_pos_t *end);
1143
1144 /// Parse a "CHR:START-END"-style region string
1145 /** @param str String to be parsed
1146 @param beg Set on return to the 0-based start of the region
1147 @param end Set on return to the 1-based end of the region
1148 @return Pointer to the colon or '\0' after the reference sequence name,
1149 or NULL if @a str could not be parsed.
1150 */
1151 HTSLIB_EXPORT
1152 const char *hts_parse_reg(const char *str, int *beg, int *end);
1153
1154 /// Parse a "CHR:START-END"-style region string
1155 /** @param str String to be parsed
1156 @param tid Set on return (if not NULL) to be reference index (-1 if invalid)
1157 @param beg Set on return to the 0-based start of the region
1158 @param end Set on return to the 1-based end of the region
1159 @param getid Function pointer. Called if not NULL to set tid.
1160 @param hdr Caller data passed to getid.
1161 @param flags Bitwise HTS_PARSE_* flags listed above.
1162 @return Pointer to the byte after the end of the entire region
1163 specifier (including any trailing comma) on success,
1164 or NULL if @a str could not be parsed.
1165
1166 A variant of hts_parse_reg which is reference-id aware. It uses
1167 the iterator name2id callbacks to validate the region tokenisation works.
1168
1169 This is necessary due to GRCh38 HLA additions which have reference names
1170 like "HLA-DRB1*12:17".
1171
1172 To work around ambiguous parsing issues, eg both "chr1" and "chr1:100-200"
1173 are reference names, quote using curly braces.
1174 Thus "{chr1}:100-200" and "{chr1:100-200}" disambiguate the above example.
1175
1176 Flags are used to control how parsing works, and can be one of the below.
1177
1178 HTS_PARSE_THOUSANDS_SEP:
1179 Ignore commas in numbers. For example with this flag 1,234,567
1180 is interpreted as 1234567.
1181
1182 HTS_PARSE_LIST:
1183 If present, the region is assmed to be a comma separated list and
1184 position parsing will not contain commas (this implicitly
1185 clears HTS_PARSE_THOUSANDS_SEP in the call to hts_parse_decimal).
1186 On success the return pointer will be the start of the next region, ie
1187 the character after the comma. (If *ret != '\0' then the caller can
1188 assume another region is present in the list.)
1189
1190 If not set then positions may contain commas. In this case the return
1191 value should point to the end of the string, or NULL on failure.
1192
1193 HTS_PARSE_ONE_COORD:
1194 If present, X:100 is treated as the single base pair region X:100-100.
1195 In this case X:-100 is shorthand for X:1-100 and X:100- is X:100-<end>.
1196 (This is the standard bcftools region convention.)
1197
1198 When not set X:100 is considered to be X:100-<end> where <end> is
1199 the end of chromosome X (set to INT_MAX here). X:100- and X:-100 are
1200 invalid.
1201 (This is the standard samtools region convention.)
1202
1203 Note the supplied string expects 1 based inclusive coordinates, but the
1204 returned coordinates start from 0 and are half open, so pos0 is valid
1205 for use in e.g. "for (pos0 = beg; pos0 < end; pos0++) {...}"
1206
1207 If NULL is returned, the value in tid mat give additional information
1208 about the error:
1209
1210 -2 Failed to parse @p hdr; or out of memory
1211 -1 The reference in @p str has mismatched braces, or does not
1212 exist in @p hdr
1213 >= 0 The specified range in @p str could not be parsed
1214 */
1215 HTSLIB_EXPORT
1216 const char *hts_parse_region(const char *s, int *tid, hts_pos_t *beg,
1217 hts_pos_t *end, hts_name2id_f getid, void *hdr,
1218 int flags);
1219
1220
1221 ///////////////////////////////////////////////////////////
1222 // Generic iterators
1223 //
1224 // These functions provide the low-level infrastructure for iterators.
1225 // Wrappers around these are used to make iterators for specific file types.
1226 // See:
1227 // htslib/sam.h for SAM/BAM/CRAM iterators
1228 // htslib/vcf.h for VCF/BCF iterators
1229 // htslib/tbx.h for files indexed by tabix
1230
1231 /// Create a single-region iterator
1232 /** @param idx Index
1233 @param tid Target ID
1234 @param beg Start of region
1235 @param end End of region
1236 @param readrec Callback to read a record from the input file
1237 @return An iterator on success; NULL on failure
1238
1239 The iterator struct returned by a successful call should be freed
1240 via hts_itr_destroy() when it is no longer needed.
1241 */
1242 HTSLIB_EXPORT
1243 hts_itr_t *hts_itr_query(const hts_idx_t *idx, int tid, hts_pos_t beg, hts_pos_t end, hts_readrec_func *readrec);
1244
1245 /// Free an iterator
1246 /** @param iter Iterator to free
1247 */
1248 HTSLIB_EXPORT
1249 void hts_itr_destroy(hts_itr_t *iter);
1250
1251 typedef hts_itr_t *hts_itr_query_func(const hts_idx_t *idx, int tid, hts_pos_t beg, hts_pos_t end, hts_readrec_func *readrec);
1252
1253 /// Create a single-region iterator from a text region specification
1254 /** @param idx Index
1255 @param reg Region specifier
1256 @param getid Callback function to return the target ID for a name
1257 @param hdr Input file header
1258 @param itr_query Callback function returning an iterator for a numeric tid,
1259 start and end position
1260 @param readrec Callback to read a record from the input file
1261 @return An iterator on success; NULL on error
1262
1263 The iterator struct returned by a successful call should be freed
1264 via hts_itr_destroy() when it is no longer needed.
1265 */
1266 HTSLIB_EXPORT
1267 hts_itr_t *hts_itr_querys(const hts_idx_t *idx, const char *reg, hts_name2id_f getid, void *hdr, hts_itr_query_func *itr_query, hts_readrec_func *readrec);
1268
1269 /// Return the next record from an iterator
1270 /** @param fp Input file handle
1271 @param iter Iterator
1272 @param r Pointer to record placeholder
1273 @param data Data passed to the readrec callback
1274 @return >= 0 on success, -1 when there is no more data, < -1 on error
1275 */
1276 HTSLIB_EXPORT
1277 int hts_itr_next(BGZF *fp, hts_itr_t *iter, void *r, void *data) HTS_RESULT_USED;
1278
1279 /**********************************
1280 * Iterator with multiple regions *
1281 **********************************/
1282
1283 typedef int hts_itr_multi_query_func(const hts_idx_t *idx, hts_itr_t *itr);
1284 HTSLIB_EXPORT
1285 int hts_itr_multi_bam(const hts_idx_t *idx, hts_itr_t *iter);
1286 HTSLIB_EXPORT
1287 int hts_itr_multi_cram(const hts_idx_t *idx, hts_itr_t *iter);
1288
1289 /// Create a multi-region iterator from a region list
1290 /** @param idx Index
1291 @param reglist Region list
1292 @param count Number of items in region list
1293 @param getid Callback to convert names to target IDs
1294 @param hdr Indexed file header (passed to getid)
1295 @param itr_specific Filetype-specific callback function
1296 @param readrec Callback to read an input file record
1297 @param seek Callback to seek in the input file
1298 @param tell Callback to return current input file location
1299 @return An iterator on success; NULL on failure
1300
1301 The iterator struct returned by a successful call should be freed
1302 via hts_itr_destroy() when it is no longer needed.
1303 */
1304 HTSLIB_EXPORT
1305 hts_itr_t *hts_itr_regions(const hts_idx_t *idx, hts_reglist_t *reglist, int count, hts_name2id_f getid, void *hdr, hts_itr_multi_query_func *itr_specific, hts_readrec_func *readrec, hts_seek_func *seek, hts_tell_func *tell);
1306
1307 /// Return the next record from an iterator
1308 /** @param fp Input file handle
1309 @param iter Iterator
1310 @param r Pointer to record placeholder
1311 @return >= 0 on success, -1 when there is no more data, < -1 on error
1312 */
1313 HTSLIB_EXPORT
1314 int hts_itr_multi_next(htsFile *fd, hts_itr_t *iter, void *r);
1315
1316 /// Create a region list from a char array
1317 /** @param argv Char array of target:interval elements, e.g. chr1:2500-3600, chr1:5100, chr2
1318 @param argc Number of items in the array
1319 @param r_count Pointer to the number of items in the resulting region list
1320 @param hdr Header for the sam/bam/cram file
1321 @param getid Callback to convert target names to target ids.
1322 @return A region list on success, NULL on failure
1323
1324 The hts_reglist_t struct returned by a successful call should be freed
1325 via hts_reglist_free() when it is no longer needed.
1326 */
1327 HTSLIB_EXPORT
1328 hts_reglist_t *hts_reglist_create(char **argv, int argc, int *r_count, void *hdr, hts_name2id_f getid);
1329
1330 /// Free a region list
1331 /** @param reglist Region list
1332 @param count Number of items in the list
1333 */
1334 HTSLIB_EXPORT
1335 void hts_reglist_free(hts_reglist_t *reglist, int count);
1336
1337 /// Free a multi-region iterator
1338 /** @param iter Iterator to free
1339 */
1340 #define hts_itr_multi_destroy(iter) hts_itr_destroy(iter)
1341
1342
1343 /**
1344 * hts_file_type() - Convenience function to determine file type
1345 * DEPRECATED: This function has been replaced by hts_detect_format().
1346 * It and these FT_* macros will be removed in a future HTSlib release.
1347 */
1348 #define FT_UNKN 0
1349 #define FT_GZ 1
1350 #define FT_VCF 2
1351 #define FT_VCF_GZ (FT_GZ|FT_VCF)
1352 #define FT_BCF (1<<2)
1353 #define FT_BCF_GZ (FT_GZ|FT_BCF)
1354 #define FT_STDIN (1<<3)
1355 HTSLIB_EXPORT
1356 int hts_file_type(const char *fname);
1357
1358
1359 /***************************
1360 * Revised MAQ error model *
1361 ***************************/
1362
1363 struct errmod_t;
1364 typedef struct errmod_t errmod_t;
1365
1366 HTSLIB_EXPORT
1367 errmod_t *errmod_init(double depcorr);
1368 HTSLIB_EXPORT
1369 void errmod_destroy(errmod_t *em);
1370
1371 /*
1372 n: number of bases
1373 m: maximum base
1374 bases[i]: qual:6, strand:1, base:4
1375 q[i*m+j]: phred-scaled likelihood of (i,j)
1376 */
1377 HTSLIB_EXPORT
1378 int errmod_cal(const errmod_t *em, int n, int m, uint16_t *bases, float *q);
1379
1380
1381 /*****************************************************
1382 * Probabilistic banded glocal alignment *
1383 * See https://doi.org/10.1093/bioinformatics/btr076 *
1384 *****************************************************/
1385
1386 typedef struct probaln_par_t {
1387 float d, e;
1388 int bw;
1389 } probaln_par_t;
1390
1391 /// Perform probabilistic banded glocal alignment
1392 /** @param ref Reference sequence
1393 @param l_ref Length of reference
1394 @param query Query sequence
1395 @param l_query Length of query sequence
1396 @param iqual Query base qualities
1397 @param c Alignment parameters
1398 @param[out] state Output alignment
1399 @param[out] q Phred scaled posterior probability of state[i] being wrong
1400 @return Phred-scaled likelihood score, or INT_MIN on failure.
1401
1402 The reference and query sequences are coded using integers 0,1,2,3,4 for
1403 bases A,C,G,T,N respectively (N here is for any ambiguity code).
1404
1405 On output, state and q are arrays of length l_query. The higher 30
1406 bits give the reference position the query base is matched to and the
1407 lower two bits can be 0 (an alignment match) or 1 (an
1408 insertion). q[i] gives the phred scaled posterior probability of
1409 state[i] being wrong.
1410
1411 On failure, errno will be set to EINVAL if the values of l_ref or l_query
1412 were invalid; or ENOMEM if a memory allocation failed.
1413 */
1414
1415 HTSLIB_EXPORT
1416 int probaln_glocal(const uint8_t *ref, int l_ref, const uint8_t *query, int l_query, const uint8_t *iqual, const probaln_par_t *c, int *state, uint8_t *q);
1417
1418
1419 /**********************
1420 * MD5 implementation *
1421 **********************/
1422
1423 struct hts_md5_context;
1424 typedef struct hts_md5_context hts_md5_context;
1425
1426 /*! @abstract Initialises an MD5 context.
1427 * @discussion
1428 * The expected use is to allocate an hts_md5_context using
1429 * hts_md5_init(). This pointer is then passed into one or more calls
1430 * of hts_md5_update() to compute successive internal portions of the
1431 * MD5 sum, which can then be externalised as a full 16-byte MD5sum
1432 * calculation by calling hts_md5_final(). This can then be turned
1433 * into ASCII via hts_md5_hex().
1434 *
1435 * To dealloate any resources created by hts_md5_init() call the
1436 * hts_md5_destroy() function.
1437 *
1438 * @return hts_md5_context pointer on success, NULL otherwise.
1439 */
1440 HTSLIB_EXPORT
1441 hts_md5_context *hts_md5_init(void);
1442
1443 /*! @abstract Updates the context with the MD5 of the data. */
1444 HTSLIB_EXPORT
1445 void hts_md5_update(hts_md5_context *ctx, const void *data, unsigned long size);
1446
1447 /*! @abstract Computes the final 128-bit MD5 hash from the given context */
1448 HTSLIB_EXPORT
1449 void hts_md5_final(unsigned char *digest, hts_md5_context *ctx);
1450
1451 /*! @abstract Resets an md5_context to the initial state, as returned
1452 * by hts_md5_init().
1453 */
1454 HTSLIB_EXPORT
1455 void hts_md5_reset(hts_md5_context *ctx);
1456
1457 /*! @abstract Converts a 128-bit MD5 hash into a 33-byte nul-termninated
1458 * hex string.
1459 */
1460 HTSLIB_EXPORT
1461 void hts_md5_hex(char *hex, const unsigned char *digest);
1462
1463 /*! @abstract Deallocates any memory allocated by hts_md5_init. */
1464 HTSLIB_EXPORT
1465 void hts_md5_destroy(hts_md5_context *ctx);
1466
hts_reg2bin(hts_pos_t beg,hts_pos_t end,int min_shift,int n_lvls)1467 static inline int hts_reg2bin(hts_pos_t beg, hts_pos_t end, int min_shift, int n_lvls)
1468 {
1469 int l, s = min_shift, t = ((1<<((n_lvls<<1) + n_lvls)) - 1) / 7;
1470 for (--end, l = n_lvls; l > 0; --l, s += 3, t -= 1<<((l<<1)+l))
1471 if (beg>>s == end>>s) return t + (beg>>s);
1472 return 0;
1473 }
1474
1475 /// Compute the level of a bin in a binning index
hts_bin_level(int bin)1476 static inline int hts_bin_level(int bin) {
1477 int l, b;
1478 for (l = 0, b = bin; b; ++l, b = hts_bin_parent(b));
1479 return l;
1480 }
1481
1482 //! Compute the corresponding entry into the linear index of a given bin from
1483 //! a binning index
1484 /*!
1485 * @param bin The bin number
1486 * @param n_lvls The index depth (number of levels - 0 based)
1487 * @return The integer offset into the linear index
1488 *
1489 * Explanation of the return value formula:
1490 * Each bin on level l covers exp(2, (n_lvls - l)*3 + min_shift) base pairs.
1491 * A linear index entry covers exp(2, min_shift) base pairs.
1492 */
hts_bin_bot(int bin,int n_lvls)1493 static inline int hts_bin_bot(int bin, int n_lvls)
1494 {
1495 int l = hts_bin_level(bin);
1496 return (bin - hts_bin_first(l)) << (n_lvls - l) * 3;
1497 }
1498
1499 /**************
1500 * Endianness *
1501 **************/
1502
ed_is_big(void)1503 static inline int ed_is_big(void)
1504 {
1505 long one= 1;
1506 return !(*((char *)(&one)));
1507 }
ed_swap_2(uint16_t v)1508 static inline uint16_t ed_swap_2(uint16_t v)
1509 {
1510 return (uint16_t)(((v & 0x00FF00FFU) << 8) | ((v & 0xFF00FF00U) >> 8));
1511 }
ed_swap_2p(void * x)1512 static inline void *ed_swap_2p(void *x)
1513 {
1514 *(uint16_t*)x = ed_swap_2(*(uint16_t*)x);
1515 return x;
1516 }
ed_swap_4(uint32_t v)1517 static inline uint32_t ed_swap_4(uint32_t v)
1518 {
1519 v = ((v & 0x0000FFFFU) << 16) | (v >> 16);
1520 return ((v & 0x00FF00FFU) << 8) | ((v & 0xFF00FF00U) >> 8);
1521 }
ed_swap_4p(void * x)1522 static inline void *ed_swap_4p(void *x)
1523 {
1524 *(uint32_t*)x = ed_swap_4(*(uint32_t*)x);
1525 return x;
1526 }
ed_swap_8(uint64_t v)1527 static inline uint64_t ed_swap_8(uint64_t v)
1528 {
1529 v = ((v & 0x00000000FFFFFFFFLLU) << 32) | (v >> 32);
1530 v = ((v & 0x0000FFFF0000FFFFLLU) << 16) | ((v & 0xFFFF0000FFFF0000LLU) >> 16);
1531 return ((v & 0x00FF00FF00FF00FFLLU) << 8) | ((v & 0xFF00FF00FF00FF00LLU) >> 8);
1532 }
ed_swap_8p(void * x)1533 static inline void *ed_swap_8p(void *x)
1534 {
1535 *(uint64_t*)x = ed_swap_8(*(uint64_t*)x);
1536 return x;
1537 }
1538
1539 #ifdef __cplusplus
1540 }
1541 #endif
1542
1543 #endif
1544