1from libc.stdint cimport int8_t, int16_t, int32_t, int64_t
2from libc.stdint cimport uint8_t, uint16_t, uint32_t, uint64_t
3from libc.stdlib cimport malloc, calloc, realloc, free
4from libc.string cimport memcpy, memcmp, strncpy, strlen, strdup
5from libc.stdio cimport FILE, printf
6from posix.types cimport off_t
7
8cdef extern from "Python.h":
9   FILE* PyFile_AsFile(object)
10
11
12# cython does not wrap stdarg
13cdef extern from "stdarg.h":
14    ctypedef struct va_list:
15        pass
16
17
18cdef extern from "htslib/kstring.h" nogil:
19    ctypedef struct kstring_t:
20        size_t l, m
21        char *s
22
23    int kputc(int c, kstring_t *s)
24    int kputw(int c, kstring_t *s)
25    int kputl(long c, kstring_t *s)
26    int ksprintf(kstring_t *s, const char *fmt, ...)
27
28
29cdef extern from "htslib_util.h" nogil:
30    int hts_set_verbosity(int verbosity)
31    int hts_get_verbosity()
32
33    ctypedef uint32_t khint32_t
34    ctypedef uint32_t khint_t
35    ctypedef khint_t  khiter_t
36
37    # Used to manage BCF Header info
38    ctypedef struct vdict_t:
39        khint_t n_buckets, size, n_occupied, upper_bound
40        khint32_t *flags
41        const char *keys
42        bcf_idinfo_t *vals
43
44    # Used to manage indexed contigs in Tabix
45    ctypedef struct s2i_t:
46        khint_t n_buckets, size, n_occupied, upper_bound
47        khint32_t *flags
48        const char *keys
49        int64_t *vals
50
51    # Generic khash methods
52    khint_t kh_size(void *d)
53    khint_t kh_begin(void *d)
54    khint_t kh_end(void *d)
55    int kh_exist(void *d, khiter_t i)
56
57    # Specialized khash methods for vdict
58    khint_t kh_get_vdict(vdict_t *d, const char *key)
59    const char *kh_key_vdict "kh_key" (vdict_t *d, khint_t i)
60    bcf_idinfo_t kh_val_vdict "kh_val" (vdict_t *d, khint_t i)
61
62
63cdef extern from "htslib/hfile.h" nogil:
64    ctypedef struct hFILE
65
66    # @abstract  Open the named file or URL as a stream
67    # @return    An hFILE pointer, or NULL (with errno set) if an error occurred.
68    hFILE *hopen(const char *filename, const char *mode, ...)
69
70    # @abstract  Associate a stream with an existing open file descriptor
71    # @return    An hFILE pointer, or NULL (with errno set) if an error occurred.
72    # @notes     For socket descriptors (on Windows), mode should contain 's'.
73    hFILE *hdopen(int fd, const char *mode)
74
75    # @abstract  Report whether the file name or URL denotes remote storage
76    # @return    0 if local, 1 if remote.
77    # @notes     "Remote" means involving e.g. explicit network access, with the
78    #   implication that callers may wish to cache such files' contents locally.
79    int hisremote(const char *filename)
80
81    # @abstract  Flush (for output streams) and close the stream
82    # @return    0 if successful, or EOF (with errno set) if an error occurred.
83    int hclose(hFILE *fp)
84
85    # @abstract  Close the stream, without flushing or propagating errors
86    # @notes     For use while cleaning up after an error only.  Preserves errno.
87    void hclose_abruptly(hFILE *fp)
88
89    # @abstract  Return the stream's error indicator
90    # @return    Non-zero (in fact, an errno value) if an error has occurred.
91    # @notes     This would be called herror() and return true/false to parallel
92    #   ferror(3), but a networking-related herror(3) function already exists.  */
93    int herrno(hFILE *fp)
94
95    # @abstract  Clear the stream's error indicator
96    void hclearerr(hFILE *fp)
97
98    # @abstract  Reposition the read/write stream offset
99    # @return    The resulting offset within the stream (as per lseek(2)),
100    #   or negative if an error occurred.
101    off_t hseek(hFILE *fp, off_t offset, int whence)
102
103    # @abstract  Report the current stream offset
104    # @return    The offset within the stream, starting from zero.
105    off_t htell(hFILE *fp)
106
107    # @abstract  Read one character from the stream
108    # @return    The character read, or EOF on end-of-file or error
109    int hgetc(hFILE *fp)
110
111    # Read from the stream until the delimiter, up to a maximum length
112    #    @param buffer  The buffer into which bytes will be written
113    #    @param size    The size of the buffer
114    #    @param delim   The delimiter (interpreted as an `unsigned char`)
115    #    @param fp      The file stream
116    #    @return  The number of bytes read, or negative on error.
117    #    @since   1.4
118    #
119    # Bytes will be read into the buffer up to and including a delimiter, until
120    # EOF is reached, or _size-1_ bytes have been written, whichever comes first.
121    # The string will then be terminated with a NUL byte (`\0`).
122    ssize_t hgetdelim(char *buffer, size_t size, int delim, hFILE *fp)
123
124    # Read a line from the stream, up to a maximum length
125    #    @param buffer  The buffer into which bytes will be written
126    #    @param size    The size of the buffer
127    #    @param fp      The file stream
128    #    @return  The number of bytes read, or negative on error.
129    #    @since   1.4
130    #
131    # Specialization of hgetdelim() for a `\n` delimiter.
132    ssize_t hgetln(char *buffer, size_t size, hFILE *fp)
133
134    # Read a line from the stream, up to a maximum length
135    #    @param buffer  The buffer into which bytes will be written
136    #    @param size    The size of the buffer (must be > 1 to be useful)
137    #    @param fp      The file stream
138    #    @return  _buffer_ on success, or `NULL` if an error occurred.
139    #    @since   1.4
140    #
141    # This function can be used as a replacement for `fgets(3)`, or together with
142    # kstring's `kgetline()` to read arbitrarily-long lines into a _kstring_t_.
143    char *hgets(char *buffer, int size, hFILE *fp)
144
145    # @abstract  Peek at characters to be read without removing them from buffers
146    # @param fp      The file stream
147    # @param buffer  The buffer to which the peeked bytes will be written
148    # @param nbytes  The number of bytes to peek at; limited by the size of the
149    #   internal buffer, which could be as small as 4K.
150    # @return    The number of bytes peeked, which may be less than nbytes if EOF
151    #   is encountered; or negative, if there was an I/O error.
152    # @notes  The characters peeked at remain in the stream's internal buffer,
153    #   and will be returned by later hread() etc calls.
154    ssize_t hpeek(hFILE *fp, void *buffer, size_t nbytes)
155
156    # @abstract  Read a block of characters from the file
157    # @return    The number of bytes read, or negative if an error occurred.
158    # @notes     The full nbytes requested will be returned, except as limited
159    #   by EOF or I/O errors.
160    ssize_t hread(hFILE *fp, void *buffer, size_t nbytes)
161
162    # @abstract  Write a character to the stream
163    # @return    The character written, or EOF if an error occurred.
164    int hputc(int c, hFILE *fp)
165
166    # @abstract  Write a string to the stream
167    # @return    0 if successful, or EOF if an error occurred.
168    int hputs(const char *text, hFILE *fp)
169
170    # @abstract  Write a block of characters to the file
171    # @return    Either nbytes, or negative if an error occurred.
172    # @notes     In the absence of I/O errors, the full nbytes will be written.
173    ssize_t hwrite(hFILE *fp, const void *buffer, size_t nbytes)
174
175    # @abstract  For writing streams, flush buffered output to the underlying stream
176    # @return    0 if successful, or EOF if an error occurred.
177    int hflush(hFILE *fp)
178
179
180cdef extern from "htslib/bgzf.h" nogil:
181    ctypedef struct bgzf_mtaux_t
182    ctypedef struct bgzidx_t
183    ctypedef struct z_stream
184
185    ctypedef struct BGZF:
186        unsigned           errcode
187        unsigned           is_write
188        int           is_be
189        int           compress_level
190        int           is_compressed
191        int           is_gzip
192        int           cache_size
193        int64_t       block_address
194        int64_t       uncompressed_address
195        void         *uncompressed_block
196        void         *compressed_block
197        void         *cache
198        hFILE        *fp
199        bgzf_mtaux_t *mt
200        bgzidx_t     *idx
201        int           idx_build_otf
202        z_stream     *gz_stream
203
204    #*****************
205    #  Basic routines *
206    # *****************/
207
208    #  Open an existing file descriptor for reading or writing.
209    #
210    #  @param fd    file descriptor
211    #  @param mode  mode matching /[rwag][u0-9]+/: 'r' for reading, 'w' for
212    #               writing, 'a' for appending, 'g' for gzip rather than BGZF
213    #               compression (with 'w' only), and digit specifies the zlib
214    #               compression level.
215    #               Note that there is a distinction between 'u' and '0': the
216    #               first yields plain uncompressed output whereas the latter
217    #               outputs uncompressed data wrapped in the zlib format.
218    #  @return      BGZF file handler; 0 on error
219
220    BGZF* bgzf_dopen(int fd, const char *mode)
221    BGZF* bgzf_fdopen(int fd, const char *mode) # for backward compatibility
222
223    #  Open the specified file for reading or writing.
224    BGZF* bgzf_open(const char* path, const char *mode)
225
226    #  Open an existing hFILE stream for reading or writing.
227    BGZF* bgzf_hopen(hFILE *fp, const char *mode)
228
229    #  Close the BGZF and free all associated resources.
230    #
231    #  @param fp    BGZF file handler
232    #  @return      0 on success and -1 on error
233    int bgzf_close(BGZF *fp)
234
235    #  Read up to _length_ bytes from the file storing into _data_.
236    #
237    #  @param fp     BGZF file handler
238    #  @param data   data array to read into
239    #  @param length size of data to read
240    #  @return       number of bytes actually read; 0 on end-of-file and -1 on error
241    ssize_t bgzf_read(BGZF *fp, void *data, size_t length)
242
243    #  Write _length_ bytes from _data_ to the file.  If no I/O errors occur,
244    #  the complete _length_ bytes will be written (or queued for writing).
245    #
246    #  @param fp     BGZF file handler
247    #  @param data   data array to write
248    #  @param length size of data to write
249    #  @return       number of bytes written (i.e., _length_); negative on error
250    ssize_t bgzf_write(BGZF *fp, const void *data, size_t length)
251
252    #  Read up to _length_ bytes directly from the underlying stream without
253    #  decompressing.  Bypasses BGZF blocking, so must be used with care in
254    #  specialised circumstances only.
255    #
256    #  @param fp     BGZF file handler
257    #  @param data   data array to read into
258    #  @param length number of raw bytes to read
259    #  @return       number of bytes actually read; 0 on end-of-file and -1 on error
260    ssize_t bgzf_raw_read(BGZF *fp, void *data, size_t length)
261
262    #  Write _length_ bytes directly to the underlying stream without
263    #  compressing.  Bypasses BGZF blocking, so must be used with care
264    #  in specialised circumstances only.
265    #
266    #  @param fp     BGZF file handler
267    #  @param data   data array to write
268    #  @param length number of raw bytes to write
269    #  @return       number of bytes actually written; -1 on error
270    ssize_t bgzf_raw_write(BGZF *fp, const void *data, size_t length)
271
272    #  Write the data in the buffer to the file.
273    int bgzf_flush(BGZF *fp)
274
275    int SEEK_SET
276
277    #  Return a virtual file pointer to the current location in the file.
278    #  No interpretation of the value should be made, other than a subsequent
279    #  call to bgzf_seek can be used to position the file at the same point.
280    #  Return value is non-negative on success.
281    int64_t bgzf_tell(BGZF *fp)
282
283    #  Set the file to read from the location specified by _pos_.
284    #
285    #  @param fp     BGZF file handler
286    #  @param pos    virtual file offset returned by bgzf_tell()
287    #  @param whence must be SEEK_SET
288    #  @return       0 on success and -1 on error
289    # /
290    int64_t bgzf_seek(BGZF *fp, int64_t pos, int whence)
291
292    #  Check if the BGZF end-of-file (EOF) marker is present
293    #
294    #  @param fp    BGZF file handler opened for reading
295    #  @return      1 if the EOF marker is present and correct
296    #               2 if it can't be checked, e.g., because fp isn't seekable
297    #               0 if the EOF marker is absent
298    #               -1 (with errno set) on error
299    int bgzf_check_EOF(BGZF *fp)
300
301    #  Check if a file is in the BGZF format
302    #
303    #  @param fn    file name
304    #  @return      1 if _fn_ is BGZF; 0 if not or on I/O error
305    int bgzf_is_bgzf(const char *fn)
306
307    #*********************
308    #  Advanced routines *
309    #*********************
310
311    #  Set the cache size. Only effective when compiled with -DBGZF_CACHE.
312    #
313    #  @param fp    BGZF file handler
314    #  @param size  size of cache in bytes; 0 to disable caching (default)
315    void bgzf_set_cache_size(BGZF *fp, int size)
316
317    #  Flush the file if the remaining buffer size is smaller than _size_
318    #  @return      0 if flushing succeeded or was not needed; negative on error
319    int bgzf_flush_try(BGZF *fp, ssize_t size)
320
321    #  Read one byte from a BGZF file. It is faster than bgzf_read()
322    #  @param fp     BGZF file handler
323    #  @return       byte read; -1 on end-of-file or error
324    int bgzf_getc(BGZF *fp)
325
326    #  Read one line from a BGZF file. It is faster than bgzf_getc()
327    #
328    #  @param fp     BGZF file handler
329    #  @param delim  delimiter
330    #  @param str    string to write to; must be initialized
331    #  @return       length of the string; 0 on end-of-file; negative on error
332    int bgzf_getline(BGZF *fp, int delim, kstring_t *str)
333
334    #  Read the next BGZF block.
335    int bgzf_read_block(BGZF *fp)
336
337    #  Enable multi-threading (only effective on writing and when the
338    #  library was compiled with -DBGZF_MT)
339    #
340    #  @param fp          BGZF file handler; must be opened for writing
341    #  @param n_threads   #threads used for writing
342    #  @param n_sub_blks  #blocks processed by each thread; a value 64-256 is recommended
343    int bgzf_mt(BGZF *fp, int n_threads, int n_sub_blks)
344
345
346    # Compress a single BGZF block.
347    #
348    # @param dst    output buffer (must have size >= BGZF_MAX_BLOCK_SIZE)
349    # @param dlen   size of output buffer; updated on return to the number
350    #               of bytes actually written to dst
351    # @param src    buffer to be compressed
352    # @param slen   size of data to compress (must be <= BGZF_BLOCK_SIZE)
353    # @param level  compression level
354    # @return       0 on success and negative on error
355    #
356    int bgzf_compress(void *dst, size_t *dlen, const void *src, size_t slen, int level)
357
358    #*******************
359    #  bgzidx routines *
360    #   BGZF at the uncompressed offset
361    #
362    #   @param fp           BGZF file handler; must be opened for reading
363    #   @param uoffset      file offset in the uncompressed data
364    #   @param where        SEEK_SET supported atm
365    #
366    #   Returns 0 on success and -1 on error.
367    int bgzf_useek(BGZF *fp, long uoffset, int where)
368
369    #   Position in uncompressed BGZF
370    #
371    #   @param fp           BGZF file handler; must be opened for reading
372    #
373    #   Returns the current offset on success and -1 on error.
374    long bgzf_utell(BGZF *fp)
375
376    #  Tell BGZF to build index while compressing.
377    #
378    #  @param fp          BGZF file handler; can be opened for reading or writing.
379    #
380    #  Returns 0 on success and -1 on error.
381    int bgzf_index_build_init(BGZF *fp)
382
383    #  Load BGZF index
384    #
385    #  @param fp          BGZF file handler
386    #  @param bname       base name
387    #  @param suffix      suffix to add to bname (can be NULL)
388    #
389    #  Returns 0 on success and -1 on error.
390    int bgzf_index_load(BGZF *fp, const char *bname, const char *suffix)
391
392    #  Save BGZF index
393    #
394    #  @param fp          BGZF file handler
395    #  @param bname       base name
396    #  @param suffix      suffix to add to bname (can be NULL)
397    #
398    #  Returns 0 on success and -1 on error.
399    int bgzf_index_dump(BGZF *fp, const char *bname, const char *suffix)
400
401
402cdef extern from "htslib/hts.h" nogil:
403    uint32_t kroundup32(uint32_t x)
404
405    ctypedef struct cram_fd
406
407    union FilePointerUnion:
408        BGZF    *bgzf
409        cram_fd *cram
410        hFILE   *hfile
411        void    *voidp
412
413    enum htsFormatCategory:
414        unknown_category
415        sequence_data    # Sequence data -- SAM, BAM, CRAM, etc
416        variant_data     # Variant calling data -- VCF, BCF, etc
417        index_file       # Index file associated with some data file
418        region_list      # Coordinate intervals or regions -- BED, etc
419        category_maximum
420
421    enum htsExactFormat:
422        unknown_format
423        binary_format
424        text_format
425        sam, bam, bai, cram, crai, vcf, bcf, csi, gzi, tbi, bed
426        format_maximum
427
428    enum htsCompression:
429        no_compression, gzip, bgzf, custom
430        compression_maximum
431
432    cdef enum hts_fmt_option:
433        CRAM_OPT_DECODE_MD,
434        CRAM_OPT_PREFIX,
435        CRAM_OPT_VERBOSITY,
436        CRAM_OPT_SEQS_PER_SLICE,
437        CRAM_OPT_SLICES_PER_CONTAINER,
438        CRAM_OPT_RANGE,
439        CRAM_OPT_VERSION,
440        CRAM_OPT_EMBED_REF,
441        CRAM_OPT_IGNORE_MD5,
442        CRAM_OPT_REFERENCE,
443        CRAM_OPT_MULTI_SEQ_PER_SLICE,
444        CRAM_OPT_NO_REF,
445        CRAM_OPT_USE_BZIP2,
446        CRAM_OPT_SHARED_REF,
447        CRAM_OPT_NTHREADS,
448        CRAM_OPT_THREAD_POOL,
449        CRAM_OPT_USE_LZMA,
450        CRAM_OPT_USE_RANS,
451        CRAM_OPT_REQUIRED_FIELDS,
452        HTS_OPT_COMPRESSION_LEVEL,
453        HTS_OPT_NTHREADS,
454
455    ctypedef struct htsVersion:
456        short major, minor
457
458    ctypedef struct htsFormat:
459        htsFormatCategory category
460        htsExactFormat    format
461        htsVersion        version
462        htsCompression    compression
463        short             compression_level
464        void              *specific
465
466    ctypedef struct htsFile:
467        uint8_t  is_bin
468        uint8_t  is_write
469        uint8_t  is_be
470        uint8_t  is_cram
471        int64_t lineno
472        kstring_t line
473        char *fn
474        char *fn_aux
475        FilePointerUnion fp
476        htsFormat format
477
478    int hts_verbose
479
480    cdef union hts_opt_val_union:
481        int i
482        char *s
483
484    ctypedef struct hts_opt:
485        char *arg
486        hts_fmt_option opt
487        hts_opt_val_union val
488        void *next
489
490    # @abstract Parses arg and appends it to the option list.
491    # @return   0 on success and -1 on failure
492    int hts_opt_add(hts_opt **opts, const char *c_arg)
493
494    # @abstract Applies an hts_opt option list to a given htsFile.
495    # @return   0 on success and -1 on failure
496    int hts_opt_apply(htsFile *fp, hts_opt *opts)
497
498    # @abstract Frees an hts_opt list.
499    void hts_opt_free(hts_opt *opts)
500
501    # @abstract Table for converting a nucleotide character to 4-bit encoding.
502    # The input character may be either an IUPAC ambiguity code, '=' for 0, or
503    # '0'/'1'/'2'/'3' for a result of 1/2/4/8.  The result is encoded as 1/2/4/8
504    # for A/C/G/T or combinations of these bits for ambiguous bases.
505    const unsigned char *seq_nt16_table
506
507    # @abstract Table for converting a 4-bit encoded nucleotide to an IUPAC
508    # ambiguity code letter (or '=' when given 0).
509    const char *seq_nt16_str
510
511    # @abstract Table for converting a 4-bit encoded nucleotide to about 2 bits.
512    # Returns 0/1/2/3 for 1/2/4/8 (i.e., A/C/G/T), or 4 otherwise (0 or ambiguous).
513    const int *seq_nt16_int
514
515    # @abstract  Get the htslib version number
516    # @return    For released versions, a string like "N.N[.N]"; or git describe
517    # output if using a library built within a Git repository.
518    const char *hts_version()
519
520    # @abstract    Determine format by peeking at the start of a file
521    # @param fp    File opened for reading, positioned at the beginning
522    # @param fmt   Format structure that will be filled out on return
523    # @return      0 for success, or negative if an error occurred.
524    int hts_detect_format(hFILE *fp, htsFormat *fmt)
525
526    # @abstract    Get a human-readable description of the file format
527    # @return      Description string, to be freed by the caller after use.
528    char *hts_format_description(const htsFormat *format)
529
530    # @abstract       Open a SAM/BAM/CRAM/VCF/BCF/etc file
531    # @param fn       The file name or "-" for stdin/stdout
532    # @param mode     Mode matching / [rwa][bceguxz0-9]* /
533    # @discussion
534    #     With 'r' opens for reading; any further format mode letters are ignored
535    #     as the format is detected by checking the first few bytes or BGZF blocks
536    #     of the file.  With 'w' or 'a' opens for writing or appending, with format
537    #     specifier letters:
538    #       b  binary format (BAM, BCF, etc) rather than text (SAM, VCF, etc)
539    #       c  CRAM format
540    #       g  gzip compressed
541    #       u  uncompressed
542    #       z  bgzf compressed
543    #       [0-9]  zlib compression level
544    #     and with non-format option letters (for any of 'r'/'w'/'a'):
545    #       e  close the file on exec(2) (opens with O_CLOEXEC, where supported)
546    #       x  create the file exclusively (opens with O_EXCL, where supported)
547    #     Note that there is a distinction between 'u' and '0': the first yields
548    #     plain uncompressed output whereas the latter outputs uncompressed data
549    #     wrapped in the zlib format.
550    # @example
551    #     [rw]b  .. compressed BCF, BAM, FAI
552    #     [rw]bu .. uncompressed BCF
553    #     [rw]z  .. compressed VCF
554    #     [rw]   .. uncompressed VCF
555    htsFile *hts_open(const char *fn, const char *mode)
556
557    # @abstract       Open a SAM/BAM/CRAM/VCF/BCF/etc file
558    # @param fn       The file name or "-" for stdin/stdout
559    # @param mode     Open mode, as per hts_open()
560    # @param fmt      Optional format specific parameters
561    # @discussion
562    #     See hts_open() for description of fn and mode.
563    #     // TODO Update documentation for s/opts/fmt/
564    #     Opts contains a format string (sam, bam, cram, vcf, bcf) which will,
565    #     if defined, override mode.  Opts also contains a linked list of hts_opt
566    #     structures to apply to the open file handle.  These can contain things
567    #     like pointers to the reference or information on compression levels,
568    #     block sizes, etc.
569    htsFile *hts_open_format(const char *fn, const char *mode, const htsFormat *fmt)
570
571    # @abstract       Open an existing stream as a SAM/BAM/CRAM/VCF/BCF/etc file
572    # @param fp       The already-open file handle
573    # @param fn       The file name or "-" for stdin/stdout
574    # @param mode     Open mode, as per hts_open()
575    htsFile *hts_hopen(hFILE *fp, const char *fn, const char *mode)
576
577    # @abstract  Close a file handle, flushing buffered data for output streams
578    # @param fp  The file handle to be closed
579    # @return    0 for success, or negative if an error occurred.
580    int hts_close(htsFile *fp)
581
582    # @abstract  Returns the file's format information
583    # @param fp  The file handle
584    # @return    Read-only pointer to the file's htsFormat.
585    const htsFormat *hts_get_format(htsFile *fp)
586
587    # @ abstract      Returns a string containing the file format extension.
588    # @ param format  Format structure containing the file type.
589    # @ return        A string ("sam", "bam", etc) or "?" for unknown formats.
590    const char *hts_format_file_extension(const htsFormat *format)
591
592    # @abstract  Sets a specified CRAM option on the open file handle.
593    # @param fp  The file handle open the open file.
594    # @param opt The CRAM_OPT_* option.
595    # @param ... Optional arguments, dependent on the option used.
596    # @return    0 for success, or negative if an error occurred.
597    int hts_set_opt(htsFile *fp, hts_fmt_option opt, ...)
598
599    int hts_getline(htsFile *fp, int delimiter, kstring_t *str)
600    char **hts_readlines(const char *fn, int *_n)
601
602    #   @abstract       Parse comma-separated list or read list from a file
603    #   @param list     File name or comma-separated list
604    #   @param is_file
605    #   @param _n       Size of the output array (number of items read)
606    #   @return         NULL on failure or pointer to newly allocated array of
607    #                   strings
608    char **hts_readlist(const char *fn, int is_file, int *_n)
609
610    # @abstract  Create extra threads to aid compress/decompression for this file
611    # @param fp  The file handle
612    # @param n   The number of worker threads to create
613    # @return    0 for success, or negative if an error occurred.
614    # @notes     THIS THREADING API IS LIKELY TO CHANGE IN FUTURE.
615    int hts_set_threads(htsFile *fp, int n)
616
617    # @abstract  Set .fai filename for a file opened for reading
618    # @return    0 for success, negative on failure
619    # @discussion
620    #     Called before *_hdr_read(), this provides the name of a .fai file
621    #     used to provide a reference list if the htsFile contains no @SQ headers.
622    int hts_set_fai_filename(htsFile *fp, const char *fn_aux)
623
624    int8_t HTS_IDX_NOCOOR
625    int8_t HTS_IDX_START
626    int8_t HTS_IDX_REST
627    int8_t HTS_IDX_NONE
628
629    int8_t HTS_FMT_CSI
630    int8_t HTS_FMT_BAI
631    int8_t HTS_FMT_TBI
632    int8_t HTS_FMT_CRAI
633
634    BGZF *hts_get_bgzfp(htsFile *fp)
635
636    ctypedef struct hts_idx_t
637
638    ctypedef struct hts_pair64_t:
639        uint64_t u, v
640
641    ctypedef int hts_readrec_func(BGZF *fp, void *data, void *r, int *tid, int *beg, int *end)
642
643    ctypedef struct hts_bins_t:
644        int n, m
645        int *a
646
647    ctypedef struct hts_itr_t:
648        uint32_t read_rest
649        uint32_t finished
650        int tid, bed, end, n_off, i
651        int curr_tid, curr_beg, curr_end
652        uint64_t curr_off
653        hts_pair64_t *off
654        hts_readrec_func *readfunc
655        hts_bins_t bins
656
657    hts_idx_t *hts_idx_init(int n, int fmt, uint64_t offset0, int min_shift, int n_lvls)
658    void hts_idx_destroy(hts_idx_t *idx)
659    int hts_idx_push(hts_idx_t *idx, int tid, int beg, int end, uint64_t offset, int is_mapped)
660    void hts_idx_finish(hts_idx_t *idx, uint64_t final_offset)
661
662    #### Save an index to a file
663    #    @param idx  Index to be written
664    #    @param fn   Input BAM/BCF/etc filename, to which .bai/.csi/etc will be added
665    #    @param fmt  One of the HTS_FMT_* index formats
666    #    @return  0 if successful, or negative if an error occurred.
667    int hts_idx_save(const hts_idx_t *idx, const char *fn, int fmt)
668
669    #### Save an index to a specific file
670    #    @param idx    Index to be written
671    #    @param fn     Input BAM/BCF/etc filename
672    #    @param fnidx  Output filename, or NULL to add .bai/.csi/etc to @a fn
673    #    @param fmt    One of the HTS_FMT_* index formats
674    #    @return  0 if successful, or negative if an error occurred.
675    int hts_idx_save_as(const hts_idx_t *idx, const char *fn, const char *fnidx, int fmt)
676
677    #### Load an index file
678    #    @param fn   BAM/BCF/etc filename, to which .bai/.csi/etc will be added or
679    #                the extension substituted, to search for an existing index file
680    #    @param fmt  One of the HTS_FMT_* index formats
681    #    @return  The index, or NULL if an error occurred.
682    hts_idx_t *hts_idx_load(const char *fn, int fmt)
683
684    #### Load a specific index file
685    #    @param fn     Input BAM/BCF/etc filename
686    #    @param fnidx  The input index filename
687    #    @return  The index, or NULL if an error occurred.
688    hts_idx_t *hts_idx_load2(const char *fn, const char *fnidx)
689
690    uint8_t *hts_idx_get_meta(hts_idx_t *idx, uint32_t *l_meta)
691    void hts_idx_set_meta(hts_idx_t *idx, int l_meta, uint8_t *meta, int is_copy)
692
693    int hts_idx_get_stat(const hts_idx_t* idx, int tid,
694                         uint64_t* mapped, uint64_t* unmapped)
695
696    uint64_t hts_idx_get_n_no_coor(const hts_idx_t* idx)
697
698    int HTS_PARSE_THOUSANDS_SEP  # Ignore ',' separators within numbers
699
700    # Parse a numeric string
701    #    The number may be expressed in scientific notation, and optionally may
702    #    contain commas in the integer part (before any decimal point or E notation).
703    #    @param str     String to be parsed
704    #    @param strend  If non-NULL, set on return to point to the first character
705    #                   in @a str after those forming the parsed number
706    #    @param flags   Or'ed-together combination of HTS_PARSE_* flags
707    #    @return  Converted value of the parsed number.
708    #
709    #    When @a strend is NULL, a warning will be printed (if hts_verbose is 2
710    #    or more) if there are any trailing characters after the number.
711    long long hts_parse_decimal(const char *str, char **strend, int flags)
712
713    # Parse a "CHR:START-END"-style region string
714    #    @param str  String to be parsed
715    #    @param beg  Set on return to the 0-based start of the region
716    #    @param end  Set on return to the 1-based end of the region
717    #    @return  Pointer to the colon or '\0' after the reference sequence name,
718    #             or NULL if @a str could not be parsed.
719    const char *hts_parse_reg(const char *str, int *beg, int *end)
720
721    hts_itr_t *hts_itr_query(const hts_idx_t *idx, int tid, int beg, int end, hts_readrec_func *readrec)
722    void hts_itr_destroy(hts_itr_t *iter)
723
724    ctypedef int (*hts_name2id_f)(void*, const char*)
725    ctypedef const char *(*hts_id2name_f)(void*, int)
726    ctypedef hts_itr_t *hts_itr_query_func(
727        const hts_idx_t *idx,
728        int tid,
729        int beg,
730        int end,
731        hts_readrec_func *readrec)
732
733    hts_itr_t *hts_itr_querys(
734        const hts_idx_t *idx,
735        const char *reg,
736        hts_name2id_f getid,
737        void *hdr,
738        hts_itr_query_func *itr_query,
739        hts_readrec_func *readrec)
740
741    int hts_itr_next(BGZF *fp, hts_itr_t *iter, void *r, void *data)
742    const char **hts_idx_seqnames(const hts_idx_t *idx, int *n, hts_id2name_f getid, void *hdr)  # free only the array, not the values
743
744    # hts_file_type() - Convenience function to determine file type
745    # @fname: the file name
746    #
747    # Returns one of the FT_* defines.
748    #
749    # DEPRECATED:  This function has been replaced by hts_detect_format().
750    # It and these FT_* macros will be removed in a future HTSlib release.
751    int FT_UNKN
752    int FT_GZ
753    int FT_VCF
754    int FT_VCF_GZ
755    int FT_BCF
756    int FT_BCF_GZ
757    int FT_STDIN
758
759    int hts_file_type(const char *fname)
760
761    # /***************************
762    #  * Revised MAQ error model *
763    #  ***************************/
764
765    ctypedef struct errmod_t
766
767    errmod_t *errmod_init(double depcorr)
768    void errmod_destroy(errmod_t *em)
769
770    # /*
771    #     n: number of bases
772    #     m: maximum base
773    #     bases[i]: qual:6, strand:1, base:4
774    #     q[i*m+j]: phred-scaled likelihood of (i,j)
775    #  */
776    int errmod_cal(const errmod_t *em, int n, int m, uint16_t *bases, float *Probabilistic)
777
778    # /*****************************************
779    #  * q banded glocal alignment *
780    #  *****************************************/
781
782    ctypedef struct probaln_par_t:
783        float d, e
784        int bw
785
786    int probaln_glocal(const uint8_t *ref,
787                       int l_ref,
788                       const uint8_t *query,
789                       int l_query, const uint8_t *iqual,
790                       const probaln_par_t *c,
791                       int *state, uint8_t *q)
792
793    # /**********************
794    #  * MD5 implementation *
795    #  **********************/
796
797    ctypedef struct hts_md5_context
798
799    # /*! @abstract   Initialises an MD5 context.
800    #  *  @discussion
801    #  *    The expected use is to allocate an hts_md5_context using
802    #  *    hts_md5_init().  This pointer is then passed into one or more calls
803    #  *    of hts_md5_update() to compute successive internal portions of the
804    #  *    MD5 sum, which can then be externalised as a full 16-byte MD5sum
805    #  *    calculation by calling hts_md5_final().  This can then be turned
806    #  *    into ASCII via hts_md5_hex().
807    #  *
808    #  *    To dealloate any resources created by hts_md5_init() call the
809    #  *    hts_md5_destroy() function.
810    #  *
811    #  *  @return     hts_md5_context pointer on success, NULL otherwise.
812    #  */
813    hts_md5_context *hts_md5_init()
814
815    # /*! @abstract Updates the context with the MD5 of the data. */
816    void hts_md5_update(hts_md5_context *ctx, const void *data, unsigned long size)
817
818    # /*! @abstract Computes the final 128-bit MD5 hash from the given context */
819    void hts_md5_final(unsigned char *digest, hts_md5_context *ctx)
820
821    # /*! @abstract Resets an md5_context to the initial state, as returned
822    #  *            by hts_md5_init().
823    #  */
824    void hts_md5_reset(hts_md5_context *ctx)
825
826    # /*! @abstract Converts a 128-bit MD5 hash into a 33-byte nul-termninated
827    #  *            hex string.
828    #  */
829    void hts_md5_hex(char *hex, const unsigned char *digest)
830
831    # /*! @abstract Deallocates any memory allocated by hts_md5_init. */
832    void hts_md5_destroy(hts_md5_context *ctx)
833
834    int hts_reg2bin(int64_t beg, int64_t end, int min_shift, int n_lvls)
835    int hts_bin_bot(int bin, int n_lvls)
836
837    # * Endianness *
838    int ed_is_big()
839    uint16_t ed_swap_2(uint16_t v)
840    void *ed_swap_2p(void *x)
841    uint32_t ed_swap_4(uint32_t v)
842    void *ed_swap_4p(void *x)
843    uint64_t ed_swap_8(uint64_t v)
844    void *ed_swap_8p(void *x)
845
846
847cdef extern from "htslib/sam.h" nogil:
848    #**********************
849    #*** SAM/BAM header ***
850    #**********************
851
852    # @abstract Structure for the alignment header.
853    # @field n_targets   number of reference sequences
854    # @field l_text      length of the plain text in the header
855    # @field target_len  lengths of the reference sequences
856    # @field target_name names of the reference sequences
857    # @field text        plain text
858    # @field sdict       header dictionary
859
860    ctypedef struct bam_hdr_t:
861         int32_t n_targets, ignore_sam_err
862         uint32_t l_text
863         uint32_t *target_len
864         uint8_t *cigar_tab
865         char **target_name
866         char *text
867         void *sdict
868
869    #****************************
870    #*** CIGAR related macros ***
871    #****************************
872
873    int BAM_CMATCH
874    int BAM_CINS
875    int BAM_CDEL
876    int BAM_CREF_SKIP
877    int BAM_CSOFT_CLIP
878    int BAM_CHARD_CLIP
879    int BAM_CPAD
880    int BAM_CEQUAL
881    int BAM_CDIFF
882    int BAM_CBACK
883
884    char    *BAM_CIGAR_STR
885    int      BAM_CIGAR_SHIFT
886    uint32_t BAM_CIGAR_MASK
887    uint32_t BAM_CIGAR_TYPE
888
889    char bam_cigar_op(uint32_t c)
890    uint32_t bam_cigar_oplen(uint32_t c)
891    char bam_cigar_opchr(uint32_t)
892    uint32_t bam_cigar_gen(char, uint32_t)
893    int bam_cigar_type(char o)
894
895    # @abstract the read is paired in sequencing, no matter whether it is mapped in a pair
896    int BAM_FPAIRED
897    # @abstract the read is mapped in a proper pair
898    int BAM_FPROPER_PAIR
899    # @abstract the read itself is unmapped; conflictive with BAM_FPROPER_PAIR
900    int BAM_FUNMAP
901    # @abstract the mate is unmapped
902    int BAM_FMUNMAP
903    # @abstract the read is mapped to the reverse strand
904    int BAM_FREVERSE
905    # @abstract the mate is mapped to the reverse strand
906    int BAM_FMREVERSE
907    # @abstract this is read1
908    int BAM_FREAD1
909    # @abstract this is read2
910    int BAM_FREAD2
911    # @abstract not primary alignment
912    int BAM_FSECONDARY
913    # @abstract QC failure
914    int BAM_FQCFAIL
915    # @abstract optical or PCR duplicate
916    int BAM_FDUP
917    # @abstract supplementary alignment
918    int BAM_FSUPPLEMENTARY
919
920    #*************************
921    #*** Alignment records ***
922    #*************************
923
924    # @abstract Structure for core alignment information.
925    # @field  tid     chromosome ID, defined by bam_hdr_t
926    # @field  pos     0-based leftmost coordinate
927    # @field  bin     bin calculated by bam_reg2bin()
928    # @field  qual    mapping quality
929    # @field  l_qname length of the query name
930    # @field  flag    bitwise flag
931    # @field  n_cigar number of CIGAR operations
932    # @field  l_qseq  length of the query sequence (read)
933    # @field  mtid    chromosome ID of next read in template, defined by bam_hdr_t
934    # @field  mpos    0-based leftmost coordinate of next read in template
935
936    ctypedef struct bam1_core_t:
937        int32_t tid
938        int32_t pos
939        uint16_t bin
940        uint8_t qual
941        uint8_t l_qname
942        uint16_t flag
943        uint8_t unused1
944        uint8_t l_extranul
945        uint32_t n_cigar
946        int32_t l_qseq
947        int32_t mtid
948        int32_t mpos
949        int32_t isize
950
951    # @abstract Structure for one alignment.
952    # @field  core       core information about the alignment
953    # @field  l_data     current length of bam1_t::data
954    # @field  m_data     maximum length of bam1_t::data
955    # @field  data       all variable-length data, concatenated; structure: qname-cigar-seq-qual-aux
956    #
957    # @discussion Notes:
958    #
959    # 1. qname is zero tailing and core.l_qname includes the tailing '\0'.
960    # 2. l_qseq is calculated from the total length of an alignment block
961    # on reading or from CIGAR.
962    # 3. cigar data is encoded 4 bytes per CIGAR operation.
963    # 4. seq is nybble-encoded according to seq_nt16_table.
964    ctypedef struct bam1_t:
965        bam1_core_t core
966        int l_data
967        uint32_t m_data
968        uint8_t *data
969        uint64_t id
970
971    # @abstract  Get whether the query is on the reverse strand
972    # @param  b  pointer to an alignment
973    # @return    boolean true if query is on the reverse strand
974    int bam_is_rev(bam1_t *b)
975
976    # @abstract  Get whether the query's mate is on the reverse strand
977    # @param  b  pointer to an alignment
978    # @return    boolean true if query's mate on the reverse strand
979    int bam_is_mrev(bam1_t *b)
980
981    # @abstract  Get the name of the query
982    # @param  b  pointer to an alignment
983    # @return    pointer to the name string, null terminated
984    char *bam_get_qname(bam1_t *b)
985
986    # @abstract  Get the CIGAR array
987    # @param  b  pointer to an alignment
988    # @return    pointer to the CIGAR array
989    #
990    # @discussion In the CIGAR array, each element is a 32-bit integer. The
991    # lower 4 bits gives a CIGAR operation and the higher 28 bits keep the
992    # length of a CIGAR.
993    uint32_t *bam_get_cigar(bam1_t *b)
994
995    # @abstract  Get query sequence
996    # @param  b  pointer to an alignment
997    # @return    pointer to sequence
998    #
999    # @discussion Each base is encoded in 4 bits: 1 for A, 2 for C, 4 for G,
1000    # 8 for T and 15 for N. Two bases are packed in one byte with the base
1001    # at the higher 4 bits having smaller coordinate on the read. It is
1002    # recommended to use bam_seqi() macro to get the base.
1003    char *bam_get_seq(bam1_t *b)
1004
1005    # @abstract  Get query quality
1006    # @param  b  pointer to an alignment
1007    # @return    pointer to quality string
1008    uint8_t *bam_get_qual(bam1_t *b)
1009
1010    # @abstract  Get auxiliary data
1011    # @param  b  pointer to an alignment
1012    # @return    pointer to the concatenated auxiliary data
1013    uint8_t *bam_get_aux(bam1_t *b)
1014
1015    # @abstract  Get length of auxiliary data
1016    # @param  b  pointer to an alignment
1017    # @return    length of the concatenated auxiliary data
1018    int bam_get_l_aux(bam1_t *b)
1019
1020    # @abstract  Get a base on read
1021    # @param  s  Query sequence returned by bam1_seq()
1022    # @param  i  The i-th position, 0-based
1023    # @return    4-bit integer representing the base.
1024    char bam_seqi(char *s, int i)
1025
1026    #**************************
1027    #*** Exported functions ***
1028    #**************************
1029
1030    #***************
1031    #*** BAM I/O ***
1032    #***************
1033
1034    bam_hdr_t *bam_hdr_init()
1035    bam_hdr_t *bam_hdr_read(BGZF *fp)
1036    int bam_hdr_write(BGZF *fp, const bam_hdr_t *h)
1037    void bam_hdr_destroy(bam_hdr_t *h)
1038    int bam_name2id(bam_hdr_t *h, const char *ref)
1039    bam_hdr_t* bam_hdr_dup(const bam_hdr_t *h0)
1040
1041    bam1_t *bam_init1()
1042    void bam_destroy1(bam1_t *b)
1043    int bam_read1(BGZF *fp, bam1_t *b)
1044    int bam_write1(BGZF *fp, const bam1_t *b)
1045    bam1_t *bam_copy1(bam1_t *bdst, const bam1_t *bsrc)
1046    bam1_t *bam_dup1(const bam1_t *bsrc)
1047
1048    int bam_cigar2qlen(int n_cigar, const uint32_t *cigar)
1049    int bam_cigar2rlen(int n_cigar, const uint32_t *cigar)
1050
1051    # @abstract Calculate the rightmost base position of an alignment on the
1052    # reference genome.
1053
1054    # @param  b  pointer to an alignment
1055    # @return    the coordinate of the first base after the alignment, 0-based
1056
1057    # @discussion For a mapped read, this is just b->core.pos + bam_cigar2rlen.
1058    # For an unmapped read (either according to its flags or if it has no cigar
1059    # string), we return b->core.pos + 1 by convention.
1060    int32_t bam_endpos(const bam1_t *b)
1061
1062    int   bam_str2flag(const char *str)  # returns negative value on error
1063    char *bam_flag2str(int flag)         # The string must be freed by the user
1064
1065    #*************************
1066    #*** BAM/CRAM indexing ***
1067    #*************************
1068
1069    # These BAM iterator functions work only on BAM files.  To work with either
1070    # BAM or CRAM files use the sam_index_load() & sam_itr_*() functions.
1071    void bam_itr_destroy(hts_itr_t *iter)
1072    hts_itr_t *bam_itr_queryi(const hts_idx_t *idx, int tid, int beg, int end)
1073    hts_itr_t *bam_itr_querys(const hts_idx_t *idx, bam_hdr_t *hdr, const char *region)
1074    int bam_itr_next(htsFile *htsfp, hts_itr_t *itr, void *r)
1075
1076    # Load/build .csi or .bai BAM index file.  Does not work with CRAM.
1077    # It is recommended to use the sam_index_* functions below instead.
1078    hts_idx_t *bam_index_load(const char *fn)
1079    int bam_index_build(const char *fn, int min_shift)
1080
1081    # Load a BAM (.csi or .bai) or CRAM (.crai) index file
1082    # @param fp  File handle of the data file whose index is being opened
1083    # @param fn  BAM/CRAM/etc filename to search alongside for the index file
1084    # @return  The index, or NULL if an error occurred.
1085    hts_idx_t *sam_index_load(htsFile *fp, const char *fn)
1086
1087    # Load a specific BAM (.csi or .bai) or CRAM (.crai) index file
1088    # @param fp     File handle of the data file whose index is being opened
1089    # @param fn     BAM/CRAM/etc data file filename
1090    # @param fnidx  Index filename, or NULL to search alongside @a fn
1091    # @return  The index, or NULL if an error occurred.
1092    hts_idx_t *sam_index_load2(htsFile *fp, const char *fn, const char *fnidx)
1093
1094    # Generate and save an index file
1095    # @param fn        Input BAM/etc filename, to which .csi/etc will be added
1096    # @param min_shift Positive to generate CSI, or 0 to generate BAI
1097    # @return  0 if successful, or negative if an error occurred (usually -1; or
1098    #         -2: opening fn failed; -3: format not indexable)
1099    int sam_index_build(const char *fn, int min_shift)
1100
1101    # Generate and save an index to a specific file
1102    # @param fn        Input BAM/CRAM/etc filename
1103    # @param fnidx     Output filename, or NULL to add .bai/.csi/etc to @a fn
1104    # @param min_shift Positive to generate CSI, or 0 to generate BAI
1105    # @return  0 if successful, or negative if an error occurred.
1106    int sam_index_build2(const char *fn, const char *fnidx, int min_shift)
1107
1108    void sam_itr_destroy(hts_itr_t *iter)
1109    hts_itr_t *sam_itr_queryi(const hts_idx_t *idx, int tid, int beg, int end)
1110    hts_itr_t *sam_itr_querys(const hts_idx_t *idx, bam_hdr_t *hdr, const char *region)
1111    int sam_itr_next(htsFile *htsfp, hts_itr_t *itr, void *r)
1112
1113    #***************
1114    #*** SAM I/O ***
1115    #***************
1116
1117    htsFile *sam_open(const char *fn, const char *mode)
1118    htsFile *sam_open_format(const char *fn, const char *mode, const htsFormat *fmt)
1119    int sam_close(htsFile *fp)
1120
1121    int sam_open_mode(char *mode, const char *fn, const char *format)
1122
1123    # A version of sam_open_mode that can handle ,key=value options.
1124    # The format string is allocated and returned, to be freed by the caller.
1125    # Prefix should be "r" or "w",
1126    char *sam_open_mode_opts(const char *fn, const char *mode, const char *format)
1127
1128    bam_hdr_t *sam_hdr_parse(int l_text, const char *text)
1129    bam_hdr_t *sam_hdr_read(htsFile *fp)
1130    int sam_hdr_write(htsFile *fp, const bam_hdr_t *h)
1131
1132    int sam_parse1(kstring_t *s, bam_hdr_t *h, bam1_t *b)
1133    int sam_format1(const bam_hdr_t *h, const bam1_t *b, kstring_t *str)
1134    int sam_read1(htsFile *fp, bam_hdr_t *h, bam1_t *b)
1135    int sam_write1(htsFile *fp, const bam_hdr_t *h, const bam1_t *b)
1136
1137    #*************************************
1138    #*** Manipulating auxiliary fields ***
1139    #*************************************
1140
1141    uint8_t *bam_aux_get(const bam1_t *b, const char *tag)
1142    int64_t  bam_aux2i(const uint8_t *s)
1143    double   bam_aux2f(const uint8_t *s)
1144    char     bam_aux2A(const uint8_t *s)
1145    char    *bam_aux2Z(const uint8_t *s)
1146
1147    void bam_aux_append(bam1_t *b, const char *tag, char type, int len, uint8_t *data)
1148    int bam_aux_del(bam1_t *b, uint8_t *s)
1149
1150    #**************************
1151    #*** Pileup and Mpileup ***
1152    #**************************
1153
1154    #  @abstract Generic pileup 'client data'.
1155    #  @discussion The pileup iterator allows setting a constructor and
1156    #  destructor function, which will be called every time a sequence is
1157    #  fetched and discarded.  This permits caching of per-sequence data in
1158    #  a tidy manner during the pileup process.  This union is the cached
1159    #  data to be manipulated by the "client" (the caller of pileup).
1160    #
1161    union bam_pileup_cd:
1162        void *p
1163        int64_t i
1164        double f
1165
1166    # @abstract Structure for one alignment covering the pileup position.
1167    # @field  b          pointer to the alignment
1168    # @field  qpos       position of the read base at the pileup site, 0-based
1169    # @field  indel      indel length; 0 for no indel, positive for ins and negative for del
1170    # @field  level      the level of the read in the "viewer" mode
1171    # @field  is_del     1 iff the base on the padded read is a deletion
1172    # @field  is_head    ???
1173    # @field  is_tail    ???
1174    # @field  is_refskip ???
1175    # @field  aux        ???
1176    #
1177    # @discussion See also bam_plbuf_push() and bam_lplbuf_push(). The
1178    # difference between the two functions is that the former does not
1179    # set bam_pileup1_t::level, while the later does. Level helps the
1180    # implementation of alignment viewers, but calculating this has some
1181    # overhead.
1182    #
1183    # is_del, is_head, etc are a bit field, declaring as below should
1184    # work as expected, see
1185    # https://groups.google.com/forum/#!msg/cython-users/24tD1kwRY7A/pmoPuSmanM0J
1186
1187    ctypedef struct bam_pileup1_t:
1188        bam1_t *b
1189        int32_t qpos
1190        int indel, level
1191        uint32_t is_del
1192        uint32_t is_head
1193        uint32_t is_tail
1194        uint32_t is_refskip
1195        uint32_t aux
1196        bam_pileup_cd cd
1197
1198    ctypedef int (*bam_plp_auto_f)(void *data, bam1_t *b)
1199    ctypedef int (*bam_test_f)()
1200
1201    ctypedef struct __bam_plp_t
1202    ctypedef __bam_plp_t *bam_plp_t
1203
1204    ctypedef struct __bam_mplp_t
1205    ctypedef __bam_mplp_t *bam_mplp_t
1206
1207    # bam_plp_init() - sets an iterator over multiple
1208    # @func:      see mplp_func in bam_plcmd.c in samtools for an example. Expected return
1209    #             status: 0 on success, -1 on end, < -1 on non-recoverable errors
1210    # @data:      user data to pass to @func
1211    bam_plp_t bam_plp_init(bam_plp_auto_f func, void *data)
1212    void bam_plp_destroy(bam_plp_t iter)
1213    int bam_plp_push(bam_plp_t iter, const bam1_t *b)
1214    const bam_pileup1_t *bam_plp_next(bam_plp_t iter, int *_tid, int *_pos, int *_n_plp)
1215    const bam_pileup1_t *bam_plp_auto(bam_plp_t iter, int *_tid, int *_pos, int *_n_plp)
1216    void bam_plp_set_maxcnt(bam_plp_t iter, int maxcnt)
1217    void bam_plp_reset(bam_plp_t iter)
1218
1219    bam_mplp_t bam_mplp_init(int n, bam_plp_auto_f func, void **data)
1220
1221    # bam_mplp_init_overlaps() - if called, mpileup will detect overlapping
1222    # read pairs and for each base pair set the base quality of the
1223    # lower-quality base to zero, thus effectively discarding it from
1224    # calling. If the two bases are identical, the quality of the other base
1225    # is increased to the sum of their qualities (capped at 200), otherwise
1226    # it is multiplied by 0.8.
1227    void bam_mplp_init_overlaps(bam_mplp_t iter)
1228    void bam_mplp_destroy(bam_mplp_t iter)
1229    void bam_mplp_set_maxcnt(bam_mplp_t iter, int maxcnt)
1230    int bam_mplp_auto(bam_mplp_t iter, int *_tid, int *_pos, int *n_plp, const bam_pileup1_t **plp)
1231    void bam_mplp_reset(bam_mplp_t iter)
1232    void bam_mplp_constructor(bam_mplp_t iter,
1233          		      int (*func)(void *data, const bam1_t *b, bam_pileup_cd *cd))
1234    void bam_mplp_destructor(bam_mplp_t iter,
1235			     int (*func)(void *data, const bam1_t *b, bam_pileup_cd *cd))
1236
1237    # Added by AH
1238    # ctypedef bam_pileup1_t * const_bam_pileup1_t_ptr "const bam_pileup1_t *"
1239
1240    # ***********************************
1241    # * BAQ calculation and realignment *
1242    # ***********************************/
1243    int sam_cap_mapq(bam1_t *b, const char *ref, int ref_len, int thres)
1244    int sam_prob_realn(bam1_t *b, const char *ref, int ref_len, int flag)
1245
1246
1247cdef extern from "htslib/faidx.h" nogil:
1248
1249    ctypedef struct faidx_t:
1250       pass
1251
1252    # /// Build index for a FASTA or bgzip-compressed FASTA file.
1253    # /**  @param  fn  FASTA file name
1254    # @param  fnfai Name of .fai file to build.
1255    # @param  fngzi Name of .gzi file to build (if fn is bgzip-compressed).
1256    # @return     0 on success; or -1 on failure
1257
1258    # If fnfai is NULL, ".fai" will be appended to fn to make the FAI file name.
1259    # If fngzi is NULL, ".gzi" will be appended to fn for the GZI file.  The GZI
1260    # file will only be built if fn is bgzip-compressed.
1261    # */
1262    int fai_build3(const char *fn,
1263                   const char *fnfai,
1264                   const char *fngzi)
1265
1266    # /// Build index for a FASTA or bgzip-compressed FASTA file.
1267    # /** @param  fn  FASTA file name
1268    # @return     0 on success; or -1 on failure
1269    #
1270    # File "fn.fai" will be generated.  This function is equivalent to
1271    # fai_build3(fn, NULL, NULL);
1272    # */
1273    int fai_build(char *fn)
1274
1275    # /// Destroy a faidx_t struct
1276    void fai_destroy(faidx_t *fai)
1277
1278    # /// Load FASTA indexes.
1279    # /** @param  fn  File name of the FASTA file (can be compressed with bgzip).
1280    #     @param  fnfai File name of the FASTA index.
1281    #     @param  fngzi File name of the bgzip index.
1282    #     @param  flags Option flags to control index file caching and creation.
1283    #     @return Pointer to a faidx_t struct on success, NULL on failure.
1284
1285    # If fnfai is NULL, ".fai" will be appended to fn to make the FAI file name.
1286    # If fngzi is NULL, ".gzi" will be appended to fn for the bgzip index name.
1287    # The bgzip index is only needed if fn is compressed.
1288
1289    # If (flags & FAI_CREATE) is true, the index files will be built using
1290    # fai_build3() if they are not already present.
1291    # */
1292    faidx_t *fai_load3(const char *fn,
1293                       const char *fnfai,
1294                       const char *fngzi,
1295                       int flags)
1296
1297    # /// Load index from "fn.fai".
1298    # /** @param  fn  File name of the FASTA file
1299    #     @return Pointer to a faidx_t struct on success, NULL on failure.
1300    # This function is equivalent to fai_load3(fn, NULL, NULL, FAI_CREATE|FAI_CACHE);
1301    # */
1302    faidx_t *fai_load(char *fn)
1303
1304    # /// Fetch the sequence in a region
1305    # /** @param  fai  Pointer to the faidx_t struct
1306    #     @param  reg  Region in the format "chr2:20,000-30,000"
1307    #     @param  len  Length of the region; -2 if seq not present, -1 general error
1308    #     @return      Pointer to the sequence; `NULL` on failure
1309    # The returned sequence is allocated by `malloc()` family and should be destroyed
1310    # by end users by calling `free()` on it.
1311    # */
1312    char *fai_fetch(faidx_t *fai,
1313                    char *reg,
1314                    int *len)
1315
1316    # /// Fetch the sequence in a region
1317    # /** @param  fai  Pointer to the faidx_t struct
1318    #     @param  c_name Region name
1319    #     @param  p_beg_i  Beginning position number (zero-based)
1320    #     @param  p_end_i  End position number (zero-based)
1321    #     @param  len  Length of the region; -2 if c_name not present, -1 general error
1322    #     @return      Pointer to the sequence; null on failure
1323    # The returned sequence is allocated by `malloc()` family and should be destroyed
1324    # by end users by calling `free()` on it.
1325    # */
1326    char *faidx_fetch_seq(faidx_t *fai,
1327                         char *c_name,
1328                         int p_beg_i,
1329                         int p_end_i,
1330                         int *len)
1331
1332    # /// Query if sequence is present
1333    # /**   @param  fai  Pointer to the faidx_t struct
1334    #   @param  seq  Sequence name
1335    #   @return      1 if present or 0 if absent
1336    #   */
1337    int faidx_has_seq(faidx_t *fai, const char *seq)
1338
1339    # /// Fetch the number of sequences
1340    # /** @param  fai  Pointer to the faidx_t struct
1341    # @return      The number of sequences
1342    # */
1343    int faidx_nseq(const faidx_t *fai)
1344
1345    # /// Return name of i-th sequence
1346    const char *faidx_iseq(const faidx_t *fai, int i)
1347
1348    # /// Return sequence length, -1 if not present
1349    int faidx_seq_len(faidx_t *fai, const char *seq)
1350
1351# tabix support
1352cdef extern from "htslib/tbx.h" nogil:
1353
1354    # tbx.h definitions
1355    int8_t TBX_MAX_SHIFT
1356    int32_t TBX_GENERIC
1357    int32_t TBX_SAM
1358    int32_t TBX_VCF
1359    int32_t TBX_UCSC
1360
1361    ctypedef struct tbx_conf_t:
1362        int32_t preset
1363        int32_t sc, bc, ec   # seq col., beg col. and end col.
1364        int32_t meta_char, line_skip
1365
1366    ctypedef struct tbx_t:
1367        tbx_conf_t conf
1368        hts_idx_t *idx
1369        void * dict
1370
1371    tbx_conf_t tbx_conf_gff
1372    tbx_conf_t tbx_conf_bed
1373    tbx_conf_t tbx_conf_psltbl
1374    tbx_conf_t tbx_conf_sam
1375    tbx_conf_t tbx_conf_vcf
1376
1377    void tbx_itr_destroy(hts_itr_t * iter)
1378    hts_itr_t * tbx_itr_queryi(tbx_t * t, int tid, int bed, int end)
1379    hts_itr_t * tbx_itr_querys(tbx_t * t, char * s)
1380    int tbx_itr_next(htsFile * fp, tbx_t * t, hts_itr_t * iter, void * data)
1381
1382    int tbx_name2id(tbx_t *tbx, char *ss)
1383
1384    int tbx_index_build(char *fn, int min_shift, tbx_conf_t *conf)
1385    int tbx_index_build2(const char *fn, const char *fnidx, int min_shift, const tbx_conf_t *conf)
1386
1387    tbx_t * tbx_index_load(char *fn)
1388    tbx_t *tbx_index_load2(const char *fn, const char *fnidx)
1389
1390    # free the array but not the values
1391    char **tbx_seqnames(tbx_t *tbx, int *n)
1392
1393    void tbx_destroy(tbx_t *tbx)
1394
1395
1396# VCF/BCF API
1397cdef extern from "htslib/vcf.h" nogil:
1398
1399    # Header struct
1400
1401    uint8_t BCF_HL_FLT   # header line
1402    uint8_t BCF_HL_INFO
1403    uint8_t BCF_HL_FMT
1404    uint8_t BCF_HL_CTG
1405    uint8_t BCF_HL_STR   # structured header line TAG=<A=..,B=..>
1406    uint8_t BCF_HL_GEN   # generic header line
1407
1408    uint8_t BCF_HT_FLAG  # header type
1409    uint8_t BCF_HT_INT
1410    uint8_t BCF_HT_REAL
1411    uint8_t BCF_HT_STR
1412
1413    uint8_t BCF_VL_FIXED # variable length
1414    uint8_t BCF_VL_VAR
1415    uint8_t BCF_VL_A
1416    uint8_t BCF_VL_G
1417    uint8_t BCF_VL_R
1418
1419    # === Dictionary ===
1420    #
1421    # The header keeps three dictionaries. The first keeps IDs in the
1422    # "FILTER/INFO/FORMAT" lines, the second keeps the sequence names and lengths
1423    # in the "contig" lines and the last keeps the sample names. bcf_hdr_t::dict[]
1424    # is the actual hash table, which is opaque to the end users. In the hash
1425    # table, the key is the ID or sample name as a C string and the value is a
1426    # bcf_idinfo_t struct. bcf_hdr_t::id[] points to key-value pairs in the hash
1427    # table in the order that they appear in the VCF header. bcf_hdr_t::n[] is the
1428    # size of the hash table or, equivalently, the length of the id[] arrays.
1429
1430    uint8_t BCF_DT_ID       # dictionary type
1431    uint8_t BCF_DT_CTG
1432    uint8_t BCF_DT_SAMPLE
1433
1434    # Complete textual representation of a header line
1435    ctypedef struct bcf_hrec_t:
1436        int type            # One of the BCF_HL_* type
1437        char *key           # The part before '=', i.e. FILTER/INFO/FORMAT/contig/fileformat etc.
1438        char *value         # Set only for generic lines, NULL for FILTER/INFO, etc.
1439        int nkeys           # Number of structured fields
1440        char **keys         # The key=value pairs
1441        char **vals
1442
1443    ctypedef struct bcf_idinfo_t:
1444        uint32_t info[3]     # stores Number:20, var:4, Type:4, ColType:4 in info[0..2]
1445        bcf_hrec_t *hrec[3]  # for BCF_HL_FLT,INFO,FMT and contig length in info[0] for BCF_HL_CTG
1446        int id
1447
1448    ctypedef struct bcf_idpair_t:
1449        const char *key
1450        const bcf_idinfo_t *val
1451
1452    ctypedef struct bcf_hdr_t:
1453        int32_t n[3]                # n:the size of the dictionary block in use, (allocated size, m, is below to preserve ABI)
1454        bcf_idpair_t *id[3]
1455        void *dict[3]               # ID dictionary, contig dict and sample dict
1456        char **samples
1457        bcf_hrec_t **hrec
1458        int nhrec, dirty
1459        int ntransl
1460        int *transl[2]              # for bcf_translate()
1461        int nsamples_ori            # for bcf_hdr_set_samples()
1462        uint8_t *keep_samples
1463        kstring_t mem
1464        int32_t m[3]                # m: allocated size of the dictionary block in use (see n above)
1465
1466    uint8_t bcf_type_shift[]
1467
1468    # * VCF record *
1469
1470    uint8_t BCF_BT_NULL
1471    uint8_t BCF_BT_INT8
1472    uint8_t BCF_BT_INT16
1473    uint8_t BCF_BT_INT32
1474    uint8_t BCF_BT_FLOAT
1475    uint8_t BCF_BT_CHAR
1476
1477    uint8_t VCF_REF
1478    uint8_t VCF_SNP
1479    uint8_t VCF_MNP
1480    uint8_t VCF_INDEL
1481    uint8_t VCF_OTHER
1482
1483    ctypedef struct variant_t:
1484        int type, n     # variant type and the number of bases affected, negative for deletions
1485
1486    ctypedef struct bcf_fmt_t:
1487        int id             # id: numeric tag id, the corresponding string is bcf_hdr_t::id[BCF_DT_ID][$id].key
1488        int n, size, type  # n: number of values per-sample; size: number of bytes per-sample; type: one of BCF_BT_* types
1489        uint8_t *p         # same as vptr and vptr_* in bcf_info_t below
1490        uint32_t p_len
1491        uint32_t p_off
1492        uint8_t p_free
1493
1494    union bcf_info_union_t:
1495        int32_t i      # integer value
1496        float f        # float value
1497
1498    ctypedef struct bcf_info_t:
1499        int key        # key: numeric tag id, the corresponding string is bcf_hdr_t::id[BCF_DT_ID][$key].key
1500        int type, len  # type: one of BCF_BT_* types; len: vector length, 1 for scalars
1501
1502        # v1 union only set if $len==1; for easier access
1503        bcf_info_union_t v1
1504        uint8_t *vptr           # pointer to data array in bcf1_t->shared.s, excluding the size+type and tag id bytes
1505        uint32_t vptr_len       # length of the vptr block or, when set, of the vptr_mod block, excluding offset
1506        uint32_t vptr_off       # vptr offset, i.e., the size of the INFO key plus size+type bytes
1507        uint8_t  vptr_free      # indicates that vptr-vptr_off must be freed; set only when modified and the new
1508                                # data block is bigger than the original
1509
1510    uint8_t BCF1_DIRTY_ID
1511    uint8_t BCF1_DIRTY_ALS
1512    uint8_t BCF1_DIRTY_FLT
1513    uint8_t BCF1_DIRTY_INF
1514
1515    ctypedef struct bcf_dec_t:
1516        int m_fmt, m_info, m_id, m_als, m_allele, m_flt  # allocated size (high-water mark); do not change
1517        int n_flt           # Number of FILTER fields
1518        int *flt            # FILTER keys in the dictionary
1519        char *id            # ID
1520        char *als           # REF+ALT block (\0-seperated)
1521        char **allele       # allele[0] is the REF (allele[] pointers to the als block); all null terminated
1522        bcf_info_t *info    # INFO
1523        bcf_fmt_t *fmt      # FORMAT and individual sample
1524        variant_t *var      # $var and $var_type set only when set_variant_types called
1525        int n_var, var_type
1526        int shared_dirty    # if set, shared.s must be recreated on BCF output
1527        int indiv_dirty     # if set, indiv.s must be recreated on BCF output
1528
1529    uint8_t BCF_ERR_CTG_UNDEF
1530    uint8_t BCF_ERR_TAG_UNDEF
1531    uint8_t BCF_ERR_NCOLS
1532    uint8_t BCF_ERR_LIMITS
1533    uint8_t BCF_ERR_CHAR
1534    uint8_t BCF_ERR_CTG_INVALID
1535    uint8_t BCF_ERR_TAG_INVALID
1536
1537    # The bcf1_t structure corresponds to one VCF/BCF line. Reading from VCF file
1538    # is slower because the string is first to be parsed, packed into BCF line
1539    # (done in vcf_parse), then unpacked into internal bcf1_t structure. If it
1540    # is known in advance that some of the fields will not be required (notably
1541    # the sample columns), parsing of these can be skipped by setting max_unpack
1542    # appropriately.
1543    # Similarly, it is fast to output a BCF line because the columns (kept in
1544    # shared.s, indiv.s, etc.) are written directly by bcf_write, whereas a VCF
1545    # line must be formatted in vcf_format.
1546
1547    ctypedef struct bcf1_t:
1548        int32_t rid               # CHROM
1549        int32_t pos               # POS
1550        int32_t rlen              # length of REF
1551        float qual                # QUAL
1552        uint32_t n_info, n_allele
1553        uint32_t n_fmt, n_sample
1554        kstring_t shared, indiv
1555        bcf_dec_t d               # lazy evaluation: $d is not generated by bcf_read(), but by explicitly calling bcf_unpack()
1556        int max_unpack            # Set to BCF_UN_STR, BCF_UN_FLT, or BCF_UN_INFO to boost performance of vcf_parse when some of the fields won't be needed
1557        int unpacked              # remember what has been unpacked to allow calling bcf_unpack() repeatedly without redoing the work
1558        int unpack_size[3]        # the original block size of ID, REF+ALT and FILTER
1559        int errcode               # one of BCF_ERR_* codes
1560
1561    ####### API #######
1562
1563    # BCF and VCF I/O
1564    #
1565    # A note about naming conventions: htslib internally represents VCF
1566    # records as bcf1_t data structures, therefore most functions are
1567    # prefixed with bcf_. There are a few exceptions where the functions must
1568    # be aware of both BCF and VCF worlds, such as bcf_parse vs vcf_parse. In
1569    # these cases, functions prefixed with bcf_ are more general and work
1570    # with both BCF and VCF.
1571
1572    # bcf_hdr_init() - create an empty BCF header.
1573    # @param mode    "r" or "w"
1574    #
1575    # When opened for writing, the mandatory fileFormat and
1576    # FILTER=PASS lines are added automatically.
1577    bcf_hdr_t *bcf_hdr_init(const char *mode)
1578
1579    # Destroy a BCF header struct
1580    void bcf_hdr_destroy(bcf_hdr_t *h)
1581
1582    # Initialize a bcf1_t object; equivalent to calloc(1, sizeof(bcf1_t))
1583    bcf1_t *bcf_init()
1584
1585    # Deallocate a bcf1_t object
1586    void bcf_destroy(bcf1_t *v)
1587
1588    # Same as bcf_destroy() but frees only the memory allocated by bcf1_t,
1589    # not the bcf1_t object itself.
1590    void bcf_empty(bcf1_t *v)
1591
1592    # Make the bcf1_t object ready for next read. Intended mostly for
1593    # internal use, the user should rarely need to call this function
1594    # directly.
1595    void bcf_clear(bcf1_t *v)
1596
1597    # Reads VCF or BCF header
1598    bcf_hdr_t *bcf_hdr_read(htsFile *fp)
1599
1600    # bcf_hdr_set_samples() - for more efficient VCF parsing when only one/few samples are needed
1601    # @samples: samples to include or exclude from file or as a comma-separated string.
1602    #             LIST|FILE   .. select samples in list/file
1603    #             ^LIST|FILE  .. exclude samples from list/file
1604    #             -           .. include all samples
1605    #             NULL        .. exclude all samples
1606    # @is_file: @samples is a file (1) or a comma-separated list (0)
1607    #
1608    # The bottleneck of VCF reading is parsing of genotype fields. If the
1609    # reader knows in advance that only subset of samples is needed (possibly
1610    # no samples at all), the performance of bcf_read() can be significantly
1611    # improved by calling bcf_hdr_set_samples after bcf_hdr_read().
1612    # The function bcf_read() will subset the VCF/BCF records automatically
1613    # with the notable exception when reading records via bcf_itr_next().
1614    # In this case, bcf_subset_format() must be called explicitly, because
1615    # bcf_readrec() does not see the header.
1616    #
1617    # Returns 0 on success, -1 on error or a positive integer if the list
1618    # contains samples not present in the VCF header. In such a case, the
1619    # return value is the index of the offending sample.
1620    #
1621    int bcf_hdr_set_samples(bcf_hdr_t *hdr, const char *samples, int is_file)
1622    int bcf_subset_format(const bcf_hdr_t *hdr, bcf1_t *rec)
1623
1624    # Writes VCF or BCF header
1625    int bcf_hdr_write(htsFile *fp, bcf_hdr_t *h)
1626
1627    # Parse VCF line contained in kstring and populate the bcf1_t struct
1628    int vcf_parse(kstring_t *s, const bcf_hdr_t *h, bcf1_t *v)
1629
1630    # The opposite of vcf_parse. It should rarely be called directly, see vcf_write
1631    int vcf_format(const bcf_hdr_t *h, const bcf1_t *v, kstring_t *s)
1632
1633    # bcf_read() - read next VCF or BCF record
1634    #
1635    # Returns -1 on critical errors, 0 otherwise. On errors which are not
1636    # critical for reading, such as missing header definitions, v->errcode is
1637    # set to one of BCF_ERR* code and must be checked before calling
1638    # vcf_write().
1639    int bcf_read(htsFile *fp, const bcf_hdr_t *h, bcf1_t *v)
1640
1641    # bcf_unpack() - unpack/decode a BCF record (fills the bcf1_t::d field)
1642    #
1643    # Note that bcf_unpack() must be called even when reading VCF. It is safe
1644    # to call the function repeatedly, it will not unpack the same field
1645    # twice.
1646    uint8_t BCF_UN_STR        # up to ALT inclusive
1647    uint8_t BCF_UN_FLT        # up to FILTER
1648    uint8_t BCF_UN_INFO       # up to INFO
1649    uint8_t BCF_UN_SHR        # all shared information
1650    uint8_t BCF_UN_FMT        # unpack format and each sample
1651    uint8_t BCF_UN_IND        # a synonymo of BCF_UN_FMT
1652    uint8_t BCF_UN_ALL        # everything
1653
1654    int bcf_unpack(bcf1_t *b, int which)
1655
1656    # bcf_dup() - create a copy of BCF record.
1657    #
1658    # Note that bcf_unpack() must be called on the returned copy as if it was
1659    # obtained from bcf_read(). Also note that bcf_dup() calls bcf_sync1(src)
1660    # internally to reflect any changes made by bcf_update_* functions.
1661    bcf1_t *bcf_dup(bcf1_t *src)
1662    bcf1_t *bcf_copy(bcf1_t *dst, bcf1_t *src)
1663
1664    # bcf_write() - write one VCF or BCF record. The type is determined at the open() call.
1665    int bcf_write(htsFile *fp, bcf_hdr_t *h, bcf1_t *v)
1666
1667    # The following functions work only with VCFs and should rarely be called
1668    # directly. Usually one wants to use their bcf_* alternatives, which work
1669    # transparently with both VCFs and BCFs.
1670    bcf_hdr_t *vcf_hdr_read(htsFile *fp)
1671    int vcf_hdr_write(htsFile *fp, const bcf_hdr_t *h)
1672    int vcf_read(htsFile *fp, const bcf_hdr_t *h, bcf1_t *v)
1673    int vcf_write(htsFile *fp, const bcf_hdr_t *h, bcf1_t *v)
1674
1675    #************************************************************************
1676    # Header querying and manipulation routines
1677    #************************************************************************
1678
1679    # Create a new header using the supplied template
1680    bcf_hdr_t *bcf_hdr_dup(const bcf_hdr_t *hdr)
1681
1682    # Copy header lines from src to dst if not already present in dst. See also bcf_translate().
1683    # Returns 0 on success or sets a bit on error:
1684    #     1 .. conflicting definitions of tag length
1685    #     # todo
1686    int bcf_hdr_combine(bcf_hdr_t *dst, const bcf_hdr_t *src)
1687
1688    # bcf_hdr_merge() - copy header lines from src to dst, see also bcf_translate()
1689    # @param dst: the destination header to be merged into, NULL on the first pass
1690    # @param src: the source header
1691    #
1692    # Notes:
1693    #     - use as:
1694    #         bcf_hdr_t *dst = NULL;
1695    #         for (i=0; i<nsrc; i++) dst = bcf_hdr_merge(dst,src[i]);
1696    #
1697    #     - bcf_hdr_merge() replaces bcf_hdr_combine() which had a problem when
1698    #     combining multiple BCF headers. The current bcf_hdr_combine()
1699    #     does not have this problem, but became slow when used for many files.
1700    bcf_hdr_t *bcf_hdr_merge(bcf_hdr_t *dst, const bcf_hdr_t *src)
1701
1702    # bcf_hdr_add_sample() - add a new sample.
1703    # @param sample:  sample name to be added
1704    int bcf_hdr_add_sample(bcf_hdr_t *hdr, const char *sample)
1705
1706    # Read VCF header from a file and update the header
1707    int bcf_hdr_set(bcf_hdr_t *hdr, const char *fname)
1708
1709    # Appends formatted header text to _str_.
1710    # If _is_bcf_ is zero, `IDX` fields are discarded.
1711    #  @return 0 if successful, or negative if an error occurred
1712    #  @since 1.4
1713    int bcf_hdr_format(const bcf_hdr_t *hdr, int is_bcf, kstring_t *str);
1714
1715    # Returns formatted header (newly allocated string) and its length,
1716    # excluding the terminating \0. If is_bcf parameter is unset, IDX
1717    # fields are discarded.
1718    char *bcf_hdr_fmt_text(const bcf_hdr_t *hdr, int is_bcf, int *len)
1719
1720    # Append new VCF header line, returns 0 on success
1721    int bcf_hdr_append(bcf_hdr_t *h, const char *line)
1722    int bcf_hdr_printf(bcf_hdr_t *h, const char *format, ...)
1723
1724    # VCF version, e.g. VCFv4.2
1725    const char *bcf_hdr_get_version(const bcf_hdr_t *hdr)
1726    void bcf_hdr_set_version(bcf_hdr_t *hdr, const char *version)
1727
1728    # bcf_hdr_remove() - remove VCF header tag
1729    # @param type:      one of BCF_HL_*
1730    # @param key:       tag name or NULL to remove all tags of the given type
1731    void bcf_hdr_remove(bcf_hdr_t *h, int type, const char *key)
1732
1733    # bcf_hdr_subset() - creates a new copy of the header removing unwanted samples
1734    # @param n:        number of samples to keep
1735    # @param samples:  names of the samples to keep
1736    # @param imap:     mapping from index in @samples to the sample index in the original file
1737    #
1738    # Sample names not present in h0 are ignored. The number of unmatched samples can be checked
1739    # by comparing n and bcf_hdr_nsamples(out_hdr).
1740    # This function can be used to reorder samples.
1741    # See also bcf_subset() which subsets individual records.
1742    #
1743    bcf_hdr_t *bcf_hdr_subset(const bcf_hdr_t *h0, int n, char *const* samples, int *imap)
1744
1745    # Creates a list of sequence names. It is up to the caller to free the list (but not the sequence names)
1746    const char **bcf_hdr_seqnames(const bcf_hdr_t *h, int *nseqs)
1747
1748    # Get number of samples
1749    int32_t bcf_hdr_nsamples(const bcf_hdr_t *h)
1750
1751    # The following functions are for internal use and should rarely be called directly
1752    int bcf_hdr_parse(bcf_hdr_t *hdr, char *htxt)
1753    int bcf_hdr_sync(bcf_hdr_t *h)
1754    bcf_hrec_t *bcf_hdr_parse_line(const bcf_hdr_t *h, const char *line, int *len)
1755    void bcf_hrec_format(const bcf_hrec_t *hrec, kstring_t *str)
1756    int bcf_hdr_add_hrec(bcf_hdr_t *hdr, bcf_hrec_t *hrec)
1757
1758    # bcf_hdr_get_hrec() - get header line info
1759    # @param type:  one of the BCF_HL_* types: FLT,INFO,FMT,CTG,STR,GEN
1760    # @param key:   the header key for generic lines (e.g. "fileformat"), any field
1761    #                 for structured lines, typically "ID".
1762    # @param value: the value which pairs with key. Can be be NULL for BCF_HL_GEN
1763    # @param str_class: the class of BCF_HL_STR line (e.g. "ALT" or "SAMPLE"), otherwise NULL
1764    #
1765    bcf_hrec_t *bcf_hdr_get_hrec(const bcf_hdr_t *hdr, int type, const char *key, const char *value, const char *str_class)
1766    bcf_hrec_t *bcf_hrec_dup(bcf_hrec_t *hrec)
1767    void bcf_hrec_add_key(bcf_hrec_t *hrec, const char *str, int len)
1768    void bcf_hrec_set_val(bcf_hrec_t *hrec, int i, const char *str, int len, int is_quoted)
1769    int bcf_hrec_find_key(bcf_hrec_t *hrec, const char *key)
1770    void hrec_add_idx(bcf_hrec_t *hrec, int idx)
1771    void bcf_hrec_destroy(bcf_hrec_t *hrec)
1772
1773    #************************************************************************
1774    # Individual record querying and manipulation routines
1775    #************************************************************************
1776
1777    # See the description of bcf_hdr_subset()
1778    int bcf_subset(const bcf_hdr_t *h, bcf1_t *v, int n, int *imap)
1779
1780    # bcf_translate() - translate tags ids to be consistent with different header. This function
1781    #                   is useful when lines from multiple VCF need to be combined.
1782    # @dst_hdr:   the destination header, to be used in bcf_write(), see also bcf_hdr_combine()
1783    # @src_hdr:   the source header, used in bcf_read()
1784    # @src_line:  line obtained by bcf_read()
1785    int bcf_translate(const bcf_hdr_t *dst_hdr, bcf_hdr_t *src_hdr, bcf1_t *src_line)
1786
1787    # bcf_get_variant_type[s]()  - returns one of VCF_REF, VCF_SNP, etc
1788    int bcf_get_variant_types(bcf1_t *rec)
1789    int bcf_get_variant_type(bcf1_t *rec, int ith_allele)
1790    int bcf_is_snp(bcf1_t *v)
1791
1792    # bcf_update_filter() - sets the FILTER column
1793    # @flt_ids:  The filter IDs to set, numeric IDs returned by bcf_hdr_id2int(hdr, BCF_DT_ID, "PASS")
1794    # @n:        Number of filters. If n==0, all filters are removed
1795    int bcf_update_filter(const bcf_hdr_t *hdr, bcf1_t *line, int *flt_ids, int n)
1796
1797    # bcf_add_filter() - adds to the FILTER column
1798    # @flt_id:   The filter IDs to add, numeric IDs returned by bcf_hdr_id2int(hdr, BCF_DT_ID, "PASS")
1799    #
1800    # If flt_id is PASS, all existing filters are removed first. If other than PASS, existing PASS is removed.
1801    int bcf_add_filter(const bcf_hdr_t *hdr, bcf1_t *line, int flt_id)
1802
1803    # bcf_remove_filter() - removes from the FILTER column
1804    # @flt_id:   filter ID to remove, numeric ID returned by bcf_hdr_id2int(hdr, BCF_DT_ID, "PASS")
1805    # @pass:     when set to 1 and no filters are present, set to PASS
1806    int bcf_remove_filter(const bcf_hdr_t *hdr, bcf1_t *line, int flt_id, int set_pass)
1807
1808    # Returns 1 if present, 0 if absent, or -1 if filter does not exist. "PASS" and "." can be used interchangeably.
1809    int bcf_has_filter(const bcf_hdr_t *hdr, bcf1_t *line, char *filter)
1810
1811    # bcf_update_alleles() and bcf_update_alleles_str() - update REF and ALT column
1812    # @alleles:           Array of alleles
1813    # @nals:              Number of alleles
1814    # @alleles_string:    Comma-separated alleles, starting with the REF allele
1815    int bcf_update_alleles(const bcf_hdr_t *hdr, bcf1_t *line, const char **alleles, int nals)
1816    int bcf_update_alleles_str(const bcf_hdr_t *hdr, bcf1_t *line, const char *alleles_string)
1817
1818    # bcf_update_id() - sets new ID string
1819    # bcf_add_id() - adds to the ID string checking for duplicates
1820    int bcf_update_id(const bcf_hdr_t *hdr, bcf1_t *line, const char *id)
1821    int bcf_add_id(const bcf_hdr_t *hdr, bcf1_t *line, const char *id)
1822
1823    # bcf_update_info_*() - functions for updating INFO fields
1824    # @hdr:       the BCF header
1825    # @line:      VCF line to be edited
1826    # @key:       the INFO tag to be updated
1827    # @values:    pointer to the array of values. Pass NULL to remove the tag.
1828    # @n:         number of values in the array. When set to 0, the INFO tag is removed
1829    #
1830    # The @string in bcf_update_info_flag() is optional, @n indicates whether
1831    # the flag is set or removed.
1832    #
1833    # Returns 0 on success or negative value on error.
1834    #
1835    int bcf_update_info_int32(const bcf_hdr_t *hdr, bcf1_t *line, const char *key, const int32_t *values, int n)
1836    int bcf_update_info_float(const bcf_hdr_t *hdr, bcf1_t *line, const char *key, const float *values, int n)
1837    int bcf_update_info_flag(const bcf_hdr_t *hdr, bcf1_t *line, const char *key, const char *values, int n)
1838    int bcf_update_info_string(const bcf_hdr_t *hdr, bcf1_t *line, const char *key, const char *values, int n)
1839    int bcf_update_info(const bcf_hdr_t *hdr, bcf1_t *line, const char *key, const void *values, int n, int type)
1840
1841    # bcf_update_format_*() - functions for updating FORMAT fields
1842    # @values:    pointer to the array of values, the same number of elements
1843    #             is expected for each sample. Missing values must be padded
1844    #             with bcf_*_missing or bcf_*_vector_end values.
1845    # @n:         number of values in the array. If n==0, existing tag is removed.
1846    #
1847    # The function bcf_update_format_string() is a higher-level (slower) variant of
1848    # bcf_update_format_char(). The former accepts array of \0-terminated strings
1849    # whereas the latter requires that the strings are collapsed into a single array
1850    # of fixed-length strings. In case of strings with variable length, shorter strings
1851    # can be \0-padded. Note that the collapsed strings passed to bcf_update_format_char()
1852    # are not \0-terminated.
1853    #
1854    # Returns 0 on success or negative value on error.
1855    #
1856    int bcf_update_format_int32(const bcf_hdr_t *hdr, bcf1_t *line, const char *key, const int32_t *values, int n)
1857    int bcf_update_format_float(const bcf_hdr_t *hdr, bcf1_t *line, const char *key, const float *values, int n)
1858    int bcf_update_format_char(const bcf_hdr_t *hdr, bcf1_t *line, const char *key, const char *values, int n)
1859    int bcf_update_genotypes(const bcf_hdr_t *hdr, bcf1_t *line, const int32_t *values, int n)
1860    int bcf_update_format_string(const bcf_hdr_t *hdr, bcf1_t *line, const char *key, const char **values, int n)
1861    int bcf_update_format(const bcf_hdr_t *hdr, bcf1_t *line, const char *key, const void *values, int n, int type)
1862
1863    # Macros for setting genotypes correctly, for use with bcf_update_genotypes only; idx corresponds
1864    # to VCF's GT (1-based index to ALT or 0 for the reference allele) and val is the opposite, obtained
1865    # from bcf_get_genotypes() below.
1866    uint32_t bcf_gt_phased(uint32_t idx)
1867    uint32_t bcf_gt_unphased(uint32_t idx)
1868    uint32_t bcf_gt_missing
1869    uint32_t bcf_gt_is_missing(uint32_t val)
1870    uint32_t bcf_gt_is_phased(uint32_t idx)
1871    uint32_t bcf_gt_allele(uint32_t val)
1872
1873    # Conversion between alleles indexes to Number=G genotype index (assuming diploid, all 0-based)
1874    uint32_t bcf_alleles2gt(uint32_t a, uint32_t b)
1875    void bcf_gt2alleles(int igt, int *a, int *b)
1876
1877    # bcf_get_fmt() - returns pointer to FORMAT's field data
1878    # @header: for access to BCF_DT_ID dictionary
1879    # @line:   VCF line obtained from vcf_parse1
1880    # @fmt:    one of GT,PL,...
1881    #
1882    # Returns bcf_fmt_t* if the call succeeded, or returns NULL when the field
1883    # is not available.
1884    #
1885    bcf_fmt_t *bcf_get_fmt(const bcf_hdr_t *hdr, bcf1_t *line, const char *key)
1886    bcf_info_t *bcf_get_info(const bcf_hdr_t *hdr, bcf1_t *line, const char *key)
1887
1888    # bcf_get_*_id() - returns pointer to FORMAT/INFO field data given the header index instead of the string ID
1889    # @line: VCF line obtained from vcf_parse1
1890    # @id:  The header index for the tag, obtained from bcf_hdr_id2int()
1891    #
1892    # Returns bcf_fmt_t* / bcf_info_t*. These functions do not check if the index is valid
1893    # as their goal is to avoid the header lookup.
1894    #
1895    bcf_fmt_t *bcf_get_fmt_id(bcf1_t *line, const int id)
1896    bcf_info_t *bcf_get_info_id(bcf1_t *line, const int id)
1897
1898    # bcf_get_info_*() - get INFO values, integers or floats
1899    # @hdr:       BCF header
1900    # @line:      BCF record
1901    # @tag:       INFO tag to retrieve
1902    # @dst:       *dst is pointer to a memory location, can point to NULL
1903    # @ndst:      pointer to the size of allocated memory
1904    #
1905    # Returns negative value on error or the number of written values on
1906    # success. bcf_get_info_string() returns on success the number of
1907    # characters written excluding the null-terminating byte. bcf_get_info_flag()
1908    # returns 1 when flag is set or 0 if not.
1909    #
1910    # List of return codes:
1911    #     -1 .. no such INFO tag defined in the header
1912    #     -2 .. clash between types defined in the header and encountered in the VCF record
1913    #     -3 .. tag is not present in the VCF record
1914    #
1915    int bcf_get_info_int32(const bcf_hdr_t *hdr, bcf1_t *line, const char *tag, int32_t **dst, int *ndst)
1916    int bcf_get_info_float(const bcf_hdr_t *hdr, bcf1_t *line, const char *tag, float **dst, int *ndst)
1917    int bcf_get_info_string(const bcf_hdr_t *hdr, bcf1_t *line, const char *tag, char **dst, int *ndst)
1918    int bcf_get_info_flag(const bcf_hdr_t *hdr, bcf1_t *line, const char *tag, int **dst, int *ndst)
1919    int bcf_get_info_values(const bcf_hdr_t *hdr, bcf1_t *line, const char *tag, void **dst, int *ndst, int type)
1920
1921    # bcf_get_format_*() - same as bcf_get_info*() above
1922    #
1923    # The function bcf_get_format_string() is a higher-level (slower) variant of bcf_get_format_char().
1924    # see the description of bcf_update_format_string() and bcf_update_format_char() above.
1925    # Unlike other bcf_get_format__*() functions, bcf_get_format_string() allocates two arrays:
1926    # a single block of \0-terminated strings collapsed into a single array and an array of pointers
1927    # to these strings. Both arrays must be cleaned by the user.
1928    #
1929    # Returns negative value on error or the number of written values on success.
1930    #
1931    # Example:
1932    #     int ndst = 0; char **dst = NULL
1933    #     if ( bcf_get_format_string(hdr, line, "XX", &dst, &ndst) > 0 )
1934    #         for (i=0; i<bcf_hdr_nsamples(hdr); i++) printf("%s\n", dst[i])
1935    #     free(dst[0]); free(dst)
1936    #
1937    # Example:
1938    #     int ngt, *gt_arr = NULL, ngt_arr = 0
1939    #     ngt = bcf_get_genotypes(hdr, line, &gt_arr, &ngt_arr)
1940    #
1941    int bcf_get_format_int32(const bcf_hdr_t *hdr, bcf1_t *line, const char *tag, int32_t **dst, int *ndst)
1942    int bcf_get_format_float(const bcf_hdr_t *hdr, bcf1_t *line, const char *tag, float **dst, int *ndst)
1943    int bcf_get_format_char(const bcf_hdr_t *hdr, bcf1_t *line, const char *tag, char **dst, int *ndst)
1944    int bcf_get_genotypes(const bcf_hdr_t *hdr, bcf1_t *line, int32_t **dst, int *ndst)
1945    int bcf_get_format_string(const bcf_hdr_t *hdr, bcf1_t *line, const char *tag, char ***dst, int *ndst)
1946    int bcf_get_format_values(const bcf_hdr_t *hdr, bcf1_t *line, const char *tag, void **dst, int *ndst, int type)
1947
1948    #************************************************************************
1949    # Helper functions
1950    #************************************************************************
1951
1952    #
1953    # bcf_hdr_id2int() - Translates string into numeric ID
1954    # bcf_hdr_int2id() - Translates numeric ID into string
1955    # @type:     one of BCF_DT_ID, BCF_DT_CTG, BCF_DT_SAMPLE
1956    # @id:       tag name, such as: PL, DP, GT, etc.
1957    #
1958    # Returns -1 if string is not in dictionary, otherwise numeric ID which identifies
1959    # fields in BCF records.
1960    #
1961    int bcf_hdr_id2int(const bcf_hdr_t *hdr, int type, const char *id)
1962    const char *bcf_hdr_int2id(const bcf_hdr_t *hdr, int type, int int_id)
1963
1964    # bcf_hdr_name2id() - Translates sequence names (chromosomes) into numeric ID
1965    # bcf_hdr_id2name() - Translates numeric ID to sequence name
1966    #
1967    int bcf_hdr_name2id(const bcf_hdr_t *hdr, const char *id)
1968    const char *bcf_hdr_id2name(const bcf_hdr_t *hdr, int rid)
1969    const char *bcf_seqname(const bcf_hdr_t *hdr, bcf1_t *rec)
1970
1971    #
1972    # bcf_hdr_id2*() - Macros for accessing bcf_idinfo_t
1973    # @type:      one of BCF_HL_FLT, BCF_HL_INFO, BCF_HL_FMT
1974    # @int_id:    return value of bcf_hdr_id2int, must be >=0
1975    #
1976    # The returned values are:
1977    #    bcf_hdr_id2length   ..  whether the number of values is fixed or variable, one of BCF_VL_*
1978    #    bcf_hdr_id2number   ..  the number of values, 0xfffff for variable length fields
1979    #    bcf_hdr_id2type     ..  the field type, one of BCF_HT_*
1980    #    bcf_hdr_id2coltype  ..  the column type, one of BCF_HL_*
1981    #
1982    # Notes: Prior to using the macros, the presence of the info should be
1983    # tested with bcf_hdr_idinfo_exists().
1984    #
1985    int bcf_hdr_id2length(const bcf_hdr_t *hdr, int type, int int_id)
1986    int bcf_hdr_id2number(const bcf_hdr_t *hdr, int type, int int_id)
1987    int bcf_hdr_id2type(const bcf_hdr_t *hdr, int type, int int_id)
1988    int bcf_hdr_id2coltype(const bcf_hdr_t *hdr, int type, int int_id)
1989    int bcf_hdr_idinfo_exists(const bcf_hdr_t *hdr, int type, int int_id)
1990    bcf_hrec_t *bcf_hdr_id2hrec(const bcf_hdr_t *hdr, int type, int col_type, int int_id)
1991
1992    void bcf_fmt_array(kstring_t *s, int n, int type, void *data)
1993    uint8_t *bcf_fmt_sized_array(kstring_t *s, uint8_t *ptr)
1994
1995    void bcf_enc_vchar(kstring_t *s, int l, const char *a)
1996    void bcf_enc_vint(kstring_t *s, int n, int32_t *a, int wsize)
1997    void bcf_enc_vfloat(kstring_t *s, int n, float *a)
1998
1999    #************************************************************************
2000    # BCF index
2001    #
2002    # Note that these functions work with BCFs only. See synced_bcf_reader.h
2003    # which provides (amongst other things) an API to work transparently with
2004    # both indexed BCFs and VCFs.
2005    #************************************************************************
2006
2007    hts_idx_t *bcf_index_load2(const char *fn, const char *fnidx)
2008    int bcf_index_build(const char *fn, int min_shift)
2009    int bcf_index_build2(const char *fn, const char *fnidx, int min_shift)
2010
2011    #*******************
2012    # Typed value I/O *
2013    #******************
2014
2015    # Note that in contrast with BCFv2.1 specification, HTSlib implementation
2016    # allows missing values in vectors. For integer types, the values 0x80,
2017    # 0x8000, 0x80000000 are interpreted as missing values and 0x81, 0x8001,
2018    # 0x80000001 as end-of-vector indicators.  Similarly for floats, the value of
2019    # 0x7F800001 is interpreted as a missing value and 0x7F800002 as an
2020    # end-of-vector indicator.
2021    # Note that the end-of-vector byte is not part of the vector.
2022
2023    # This trial BCF version (v2.2) is compatible with the VCF specification and
2024    # enables to handle correctly vectors with different ploidy in presence of
2025    # missing values.
2026
2027    int32_t bcf_int8_vector_end
2028    int32_t bcf_int16_vector_end
2029    int32_t bcf_int32_vector_end
2030    int32_t bcf_str_vector_end
2031    int32_t bcf_int8_missing
2032    int32_t bcf_int16_missing
2033    int32_t bcf_int32_missing
2034    int32_t bcf_str_missing
2035
2036    uint32_t bcf_float_vector_end
2037    uint32_t bcf_float_missing
2038
2039    void bcf_float_set(float *ptr, uint32_t value)
2040    void bcf_float_set_vector_end(float *x)
2041    void bcf_float_set_missing(float *x)
2042
2043    int bcf_float_is_missing(float f)
2044    int bcf_float_is_vector_end(float f)
2045    void bcf_format_gt(bcf_fmt_t *fmt, int isample, kstring_t *str)
2046    void bcf_enc_size(kstring_t *s, int size, int type)
2047    int bcf_enc_inttype(long x)
2048    void bcf_enc_int1(kstring_t *s, int32_t x)
2049    int32_t bcf_dec_int1(const uint8_t *p, int type, uint8_t **q)
2050    int32_t bcf_dec_typed_int1(const uint8_t *p, uint8_t **q)
2051    int32_t bcf_dec_size(const uint8_t *p, uint8_t **q, int *type)
2052
2053    # These trivial wrappers are defined only for consistency with other parts of htslib
2054    bcf1_t *bcf_init1()
2055    int bcf_read1(htsFile *fp, const bcf_hdr_t *h, bcf1_t *v)
2056    int vcf_read1(htsFile *fp, const bcf_hdr_t *h, bcf1_t *v)
2057    int bcf_write1(htsFile *fp, const bcf_hdr_t *h, bcf1_t *v)
2058    int vcf_write1(htsFile *fp, const bcf_hdr_t *h, bcf1_t *v)
2059    void bcf_destroy1(bcf1_t *v)
2060    void bcf_empty1(bcf1_t *v)
2061    int vcf_parse1(kstring_t *s, const bcf_hdr_t *h, bcf1_t *v)
2062    void bcf_clear1(bcf1_t *v)
2063    int vcf_format1(const bcf_hdr_t *h, const bcf1_t *v, kstring_t *s)
2064
2065    # Other nice wrappers
2066    void bcf_itr_destroy(hts_itr_t *iter)
2067    hts_itr_t *bcf_itr_queryi(const hts_idx_t *idx, int tid, int beg, int end)
2068    hts_itr_t *bcf_itr_querys(const hts_idx_t *idx, const bcf_hdr_t *hdr, char *s)
2069    int bcf_itr_next(htsFile *fp, hts_itr_t *iter, void *r)
2070    hts_idx_t *bcf_index_load(const char *fn)
2071    const char **bcf_index_seqnames(const hts_idx_t *idx, const bcf_hdr_t *hdr, int *nptr)
2072
2073
2074# VCF/BCF utility functions
2075cdef extern from "htslib/vcfutils.h" nogil:
2076    struct kbitset_t
2077
2078    # bcf_trim_alleles() - remove ALT alleles unused in genotype fields
2079    # @header:  for access to BCF_DT_ID dictionary
2080    # @line:    VCF line obtain from vcf_parse1
2081    #
2082    # Returns the number of removed alleles on success or negative
2083    # on error:
2084    #     -1 .. some allele index is out of bounds
2085    int bcf_trim_alleles(const bcf_hdr_t *header, bcf1_t *line)
2086
2087    # bcf_remove_alleles() - remove ALT alleles according to bitmask @mask
2088    # @header:  for access to BCF_DT_ID dictionary
2089    # @line:    VCF line obtained from vcf_parse1
2090    # @mask:    alleles to remove
2091    #
2092    # If you have more than 31 alleles, then the integer bit mask will
2093    # overflow, so use bcf_remove_allele_set instead
2094    void bcf_remove_alleles(const bcf_hdr_t *header, bcf1_t *line, int mask)
2095
2096    # bcf_remove_allele_set() - remove ALT alleles according to bitset @rm_set
2097    # @header:  for access to BCF_DT_ID dictionary
2098    # @line:    VCF line obtained from vcf_parse1
2099    # @rm_set:  pointer to kbitset_t object with bits set for allele
2100    #           indexes to remove
2101    #
2102    # Number=A,R,G INFO and FORMAT fields will be updated accordingly.
2103    void bcf_remove_allele_set(const bcf_hdr_t *header, bcf1_t *line, kbitset_t *rm_set)
2104
2105    # bcf_calc_ac() - calculate the number of REF and ALT alleles
2106    # @header:  for access to BCF_DT_ID dictionary
2107    # @line:    VCF line obtained from vcf_parse1
2108    # @ac:      array of length line->n_allele
2109    # @which:   determine if INFO/AN,AC and indv fields be used
2110    #
2111    # Returns 1 if the call succeeded, or 0 if the value could not
2112    # be determined.
2113    #
2114    # The value of @which determines if existing INFO/AC,AN can be
2115    # used (BCF_UN_INFO) and and if indv fields can be split (BCF_UN_FMT).
2116    int bcf_calc_ac(const bcf_hdr_t *header, bcf1_t *line, int *ac, int which)
2117
2118    # bcf_gt_type() - determines type of the genotype
2119    # @fmt_ptr:  the GT format field as set for example by set_fmt_ptr
2120    # @isample:  sample index (starting from 0)
2121    # @ial:      index of the 1st non-reference allele (starting from 1)
2122    # @jal:      index of the 2nd non-reference allele (starting from 1)
2123    #
2124    # Returns the type of the genotype (one of GT_HOM_RR, GT_HET_RA,
2125    # GT_HOM_AA, GT_HET_AA, GT_HAPL_R, GT_HAPL_A or GT_UNKN). If $ial
2126    # is not NULL and the genotype has one or more non-reference
2127    # alleles, $ial will be set. In case of GT_HET_AA, $ial is the
2128    # position of the allele which appeared first in ALT. If $jal is
2129    # not null and the genotype is GT_HET_AA, $jal will be set and is
2130    # the position of the second allele in ALT.
2131    uint8_t GT_HOM_RR    # note: the actual value of GT_* matters, used in dosage r2 calculation
2132    uint8_t GT_HOM_AA
2133    uint8_t GT_HET_RA
2134    uint8_t GT_HET_AA
2135    uint8_t GT_HAPL_R
2136    uint8_t GT_HAPL_A
2137    uint8_t GT_UNKN
2138    int bcf_gt_type(bcf_fmt_t *fmt_ptr, int isample, int *ial, int *jal)
2139
2140    int bcf_acgt2int(char c)
2141    char bcf_int2acgt(int i)
2142
2143    # bcf_ij2G() - common task: allele indexes to Number=G index (diploid)
2144    # @i,j:  allele indexes, 0-based, i<=j
2145    # Returns index to the Number=G diploid array
2146    uint32_t bcf_ij2G(uint32_t i, uint32_t j)
2147
2148
2149cdef extern from "htslib/cram.h" nogil:
2150
2151    enum cram_block_method:
2152        ERROR
2153        RAW
2154        GZIP
2155        BZIP2
2156        LZMA
2157        RANS
2158        RANS0
2159        RANS1
2160        GZIP_RLE
2161
2162    enum cram_content_type:
2163        CT_ERROR
2164        FILE_HEADER
2165        COMPRESSION_HEADER
2166        MAPPED_SLICE
2167        UNMAPPED_SLICE
2168        EXTERNAL
2169        CORE
2170
2171    # Opaque data types, see cram_structs for the fully fledged versions.
2172    ctypedef struct SAM_hdr
2173    ctypedef struct cram_file_def
2174    ctypedef struct cram_fd
2175    ctypedef struct cram_container
2176    ctypedef struct cram_block
2177    ctypedef struct cram_slice
2178    ctypedef struct cram_metrics
2179    ctypedef struct cram_block_slice_hdr
2180    ctypedef struct cram_block_compression_hdr
2181    ctypedef struct refs_t
2182
2183    # Accessor functions
2184
2185    #
2186    #-----------------------------------------------------------------------------
2187    # cram_fd
2188    #
2189    SAM_hdr *cram_fd_get_header(cram_fd *fd)
2190    void cram_fd_set_header(cram_fd *fd, SAM_hdr *hdr)
2191
2192    int cram_fd_get_version(cram_fd *fd)
2193    void cram_fd_set_version(cram_fd *fd, int vers)
2194
2195    int cram_major_vers(cram_fd *fd)
2196    int cram_minor_vers(cram_fd *fd)
2197
2198    hFILE *cram_fd_get_fp(cram_fd *fd)
2199    void cram_fd_set_fp(cram_fd *fd, hFILE *fp)
2200
2201    #
2202    #-----------------------------------------------------------------------------
2203    # cram_container
2204    #
2205    int32_t cram_container_get_length(cram_container *c)
2206    void cram_container_set_length(cram_container *c, int32_t length)
2207    int32_t cram_container_get_num_blocks(cram_container *c)
2208    void cram_container_set_num_blocks(cram_container *c, int32_t num_blocks)
2209    int32_t *cram_container_get_landmarks(cram_container *c, int32_t *num_landmarks)
2210    void cram_container_set_landmarks(cram_container *c, int32_t num_landmarks,
2211				      int32_t *landmarks)
2212
2213    # Returns true if the container is empty (EOF marker) */
2214    int cram_container_is_empty(cram_fd *fd)
2215
2216
2217    #
2218    #-----------------------------------------------------------------------------
2219    # cram_block
2220    #
2221    int32_t cram_block_get_content_id(cram_block *b)
2222    int32_t cram_block_get_comp_size(cram_block *b)
2223    int32_t cram_block_get_uncomp_size(cram_block *b)
2224    int32_t cram_block_get_crc32(cram_block *b)
2225    void *  cram_block_get_data(cram_block *b)
2226
2227    cram_content_type cram_block_get_content_type(cram_block *b)
2228
2229    void cram_block_set_content_id(cram_block *b, int32_t id)
2230    void cram_block_set_comp_size(cram_block *b, int32_t size)
2231    void cram_block_set_uncomp_size(cram_block *b, int32_t size)
2232    void cram_block_set_crc32(cram_block *b, int32_t crc)
2233    void cram_block_set_data(cram_block *b, void *data)
2234
2235    int cram_block_append(cram_block *b, void *data, int size)
2236    void cram_block_update_size(cram_block *b)
2237
2238    # Offset is known as "size" internally, but it can be confusing.
2239    size_t cram_block_get_offset(cram_block *b)
2240    void cram_block_set_offset(cram_block *b, size_t offset)
2241
2242    #
2243    # Computes the size of a cram block, including the block
2244    # header itself.
2245    #
2246    uint32_t cram_block_size(cram_block *b)
2247
2248    #
2249    # Renumbers RG numbers in a cram compression header.
2250    #
2251    # CRAM stores RG as the Nth number in the header, rather than a
2252    # string holding the ID: tag.  This is smaller in space, but means
2253    # "samtools cat" to join files together that contain single but
2254    # different RG lines needs a way of renumbering them.
2255    #
2256    # The file descriptor is expected to be immediately after the
2257    # cram_container structure (ie before the cram compression header).
2258    # Due to the nature of the CRAM format, this needs to read and write
2259    # the blocks itself.  Note that there may be multiple slices within
2260    # the container, meaning multiple compression headers to manipulate.
2261    # Changing RG may change the size of the compression header and
2262    # therefore the length field in the container.  Hence we rewrite all
2263    # blocks just in case and also emit the adjusted container.
2264    #
2265    # The current implementation can only cope with renumbering a single
2266    # RG (and only then if it is using HUFFMAN or BETA codecs).  In
2267    # theory it *may* be possible to renumber multiple RGs if they use
2268    # HUFFMAN to the CORE block or use an external block unshared by any
2269    # other data series.  So we have an API that can be upgraded to
2270    # support this, but do not implement it for now.  An example
2271    # implementation of RG as an EXTERNAL block would be to find that
2272    # block and rewrite it, returning the number of blocks consumed.
2273    #
2274    # Returns 0 on success;
2275    #        -1 if unable to edit;
2276    #        -2 on other errors (eg I/O).
2277    #
2278    int cram_transcode_rg(cram_fd *input, cram_fd *output,
2279    			  cram_container *c,
2280			  int nrg, int *in_rg, int *out_rg)
2281
2282    #
2283    # Copies the blocks representing the next num_slice slices from a
2284    # container from 'in' to 'out'.  It is expected that the file pointer
2285    # is just after the read of the cram_container and cram compression
2286    # header.
2287    #
2288    # Returns 0 on success
2289    #        -1 on failure
2290    #
2291    int cram_copy_slice(cram_fd *input, cram_fd *output, int32_t num_slice)
2292
2293    #
2294    #-----------------------------------------------------------------------------
2295    # SAM_hdr
2296    #
2297
2298    # Tokenises a SAM header into a hash table.
2299    #
2300    # Also extracts a few bits on specific data types, such as @RG lines.
2301    #
2302    # @return
2303    # Returns a SAM_hdr struct on success (free with sam_hdr_free())
2304    #         NULL on failure
2305    #
2306    SAM_hdr *sam_hdr_parse_(const char *hdr, int len)
2307
2308
2309    #
2310    #-----------------------------------------------------------------------------
2311    # cram_io basics
2312    #
2313
2314    # CRAM blocks - the dynamically growable data block. We have code to
2315    # create, update, (un)compress and read/write.
2316    #
2317    # These are derived from the deflate_interlaced.c blocks, but with the
2318    # CRAM extension of content types and IDs.
2319    #
2320
2321    # Allocates a new cram_block structure with a specified content_type and
2322    # id.
2323    #
2324    # @return
2325    # Returns block pointer on success;
2326    #         NULL on failure
2327    #
2328    cram_block *cram_new_block(cram_content_type content_type,
2329			       int content_id)
2330
2331    # Reads a block from a cram file.
2332    #
2333    # @return
2334    # Returns cram_block pointer on success;
2335    #         NULL on failure
2336    #
2337    cram_block *cram_read_block(cram_fd *fd)
2338
2339    # Writes a CRAM block.
2340    #
2341    # @return
2342    # Returns 0 on success;
2343    #        -1 on failure
2344    #
2345    int cram_write_block(cram_fd *fd, cram_block *b)
2346
2347    # Frees a CRAM block, deallocating internal data too.
2348    #
2349    void cram_free_block(cram_block *b)
2350
2351    # Uncompresses a CRAM block, if compressed.
2352    #
2353    # @return
2354    # Returns 0 on success;
2355    #        -1 on failure
2356    #
2357    int cram_uncompress_block(cram_block *b)
2358
2359    # Compresses a block.
2360    #
2361    # Compresses a block using one of two different zlib strategies. If we only
2362    # want one choice set strat2 to be -1.
2363    #
2364    # The logic here is that sometimes Z_RLE does a better job than Z_FILTERED
2365    # or Z_DEFAULT_STRATEGY on quality data. If so, we'd rather use it as it is
2366    # significantly faster.
2367    #
2368    # @return
2369    # Returns 0 on success;
2370    #        -1 on failure
2371    #
2372    int cram_compress_block(cram_fd *fd, cram_block *b, cram_metrics *metrics,
2373			    int method, int level)
2374
2375    # Containers
2376    #
2377
2378    # Creates a new container, specifying the maximum number of slices
2379    # and records permitted.
2380    #
2381    # @return
2382    # Returns cram_container ptr on success;
2383    #         NULL on failure
2384    #
2385    cram_container *cram_new_container(int nrec, int nslice)
2386    void cram_free_container(cram_container *c)
2387
2388    # Reads a container header.
2389    #
2390    # @return
2391    # Returns cram_container on success;
2392    #         NULL on failure or no container left (fd->err == 0).
2393    #
2394    cram_container *cram_read_container(cram_fd *fd)
2395
2396    # Writes a container structure.
2397    #
2398    # @return
2399    # Returns 0 on success;
2400    #        -1 on failure
2401    #
2402    int cram_write_container(cram_fd *fd, cram_container *h)
2403
2404    #
2405    # Stores the container structure in dat and returns *size as the
2406    # number of bytes written to dat[].  The input size of dat is also
2407    # held in *size and should be initialised to cram_container_size(c).
2408    #
2409    # Returns 0 on success;
2410    #        -1 on failure
2411    #
2412    int cram_store_container(cram_fd *fd, cram_container *c, char *dat, int *size)
2413
2414    int cram_container_size(cram_container *c)
2415
2416    # The top-level cram opening, closing and option handling
2417    #
2418
2419    # Opens a CRAM file for read (mode "rb") or write ("wb").
2420    #
2421    # The filename may be "-" to indicate stdin or stdout.
2422    #
2423    # @return
2424    # Returns file handle on success;
2425    #         NULL on failure.
2426    #
2427    cram_fd *cram_open(const char *filename, const char *mode)
2428
2429    # Opens an existing stream for reading or writing.
2430    #
2431    # @return
2432    # Returns file handle on success;
2433    #         NULL on failure.
2434    #
2435    cram_fd *cram_dopen(hFILE *fp, const char *filename, const char *mode)
2436
2437    # Closes a CRAM file.
2438    #
2439    # @return
2440    # Returns 0 on success;
2441    #        -1 on failure
2442    #
2443    int cram_close(cram_fd *fd)
2444
2445    #
2446    # Seek within a CRAM file.
2447    #
2448    # Returns 0 on success
2449    #        -1 on failure
2450    #
2451    int cram_seek(cram_fd *fd, off_t offset, int whence)
2452
2453    #
2454    # Flushes a CRAM file.
2455    # Useful for when writing to stdout without wishing to close the stream.
2456    #
2457    # Returns 0 on success
2458    #        -1 on failure
2459    #
2460    int cram_flush(cram_fd *fd)
2461
2462    # Checks for end of file on a cram_fd stream.
2463    #
2464    # @return
2465    # Returns 0 if not at end of file
2466    #         1 if we hit an expected EOF (end of range or EOF block)
2467    #         2 for other EOF (end of stream without EOF block)
2468    #
2469    int cram_eof(cram_fd *fd)
2470
2471    # Sets options on the cram_fd.
2472    #
2473    # See CRAM_OPT_* definitions in hts.h.
2474    # Use this immediately after opening.
2475    #
2476    # @return
2477    # Returns 0 on success;
2478    #        -1 on failure
2479    #
2480    int cram_set_option(cram_fd *fd, hts_fmt_option opt, ...)
2481
2482    # Sets options on the cram_fd.
2483    #
2484    # See CRAM_OPT_* definitions in hts.h.
2485    # Use this immediately after opening.
2486    #
2487    # @return
2488    # Returns 0 on success;
2489    #        -1 on failure
2490    #
2491    int cram_set_voption(cram_fd *fd, hts_fmt_option opt, va_list args)
2492
2493    #
2494    # Attaches a header to a cram_fd.
2495    #
2496    # This should be used when creating a new cram_fd for writing where
2497    # we have an SAM_hdr already constructed (eg from a file we've read
2498    # in).
2499    #
2500    # @return
2501    # Returns 0 on success;
2502    #        -1 on failure
2503    #
2504    int cram_set_header(cram_fd *fd, SAM_hdr *hdr)
2505
2506    # Check if this file has a proper EOF block
2507    #
2508    # @return
2509    # Returns 3 if the file is a version of CRAM that does not contain EOF blocks
2510    #         2 if the file is a stream and thus unseekable
2511    #         1 if the file contains an EOF block
2512    #         0 if the file does not contain an EOF block
2513    #        -1 if an error occurred whilst reading the file or we could not seek back to where we were
2514    #
2515    #
2516    int cram_check_EOF(cram_fd *fd)
2517
2518    # As int32_decoded/encode, but from/to blocks instead of cram_fd */
2519    int int32_put_blk(cram_block *b, int32_t val)
2520
2521    # Deallocates all storage used by a SAM_hdr struct.
2522    #
2523    # This also decrements the header reference count. If after decrementing
2524    # it is still non-zero then the header is assumed to be in use by another
2525    # caller and the free is not done.
2526    #
2527    # This is a synonym for sam_hdr_dec_ref().
2528    #
2529    void sam_hdr_free(SAM_hdr *hdr)
2530
2531    # Returns the current length of the SAM_hdr in text form.
2532    #
2533    # Call sam_hdr_rebuild() first if editing has taken place.
2534    #
2535    int sam_hdr_length(SAM_hdr *hdr)
2536
2537    # Returns the string form of the SAM_hdr.
2538    #
2539    # Call sam_hdr_rebuild() first if editing has taken place.
2540    #
2541    char *sam_hdr_str(SAM_hdr *hdr)
2542
2543    # Appends a formatted line to an existing SAM header.
2544    #
2545    # Line is a full SAM header record, eg "@SQ\tSN:foo\tLN:100", with
2546    # optional new-line. If it contains more than 1 line then multiple lines
2547    # will be added in order.
2548    #
2549    # Len is the length of the text data, or 0 if unknown (in which case
2550    # it should be null terminated).
2551    #
2552    # @return
2553    # Returns 0 on success;
2554    #        -1 on failure
2555    #
2556
2557    # Add an @PG line.
2558    #
2559    # If we wish complete control over this use sam_hdr_add() directly. This
2560    # function uses that, but attempts to do a lot of tedious house work for
2561    # you too.
2562    #
2563    # - It will generate a suitable ID if the supplied one clashes.
2564    # - It will generate multiple @PG records if we have multiple PG chains.
2565    #
2566    # Call it as per sam_hdr_add() with a series of key,value pairs ending
2567    # in NULL.
2568    #
2569    # @return
2570    # Returns 0 on success;
2571    #        -1 on failure
2572    #
2573    int sam_hdr_add_PG(SAM_hdr *sh, const char *name, ...)
2574
2575    #
2576    # A function to help with construction of CL tags in @PG records.
2577    # Takes an argc, argv pair and returns a single space-separated string.
2578    # This string should be deallocated by the calling function.
2579    #
2580    # @return
2581    # Returns malloced char * on success;
2582    #         NULL on failure
2583    #
2584    char *stringify_argv(int argc, char *argv[])
2585
2586    #
2587    # Returns the refs_t structure used by a cram file handle.
2588    #
2589    # This may be used in conjunction with option CRAM_OPT_SHARED_REF to
2590    # share reference memory between multiple file handles.
2591    #
2592    # @return
2593    # Returns NULL if none exists or the file handle is not a CRAM file.
2594    #
2595    refs_t *cram_get_refs(htsFile *fd)
2596
2597
2598cdef class HTSFile(object):
2599    cdef          htsFile *htsfile       # pointer to htsFile structure
2600    cdef          int64_t start_offset   # BGZF offset of first record
2601
2602    cdef readonly object  filename       # filename as supplied by user
2603    cdef readonly object  mode           # file opening mode
2604    cdef readonly object  threads        # number of threads to use
2605    cdef readonly object  index_filename # filename of index, if supplied by user
2606
2607    cdef readonly bint    is_stream      # Is htsfile a non-seekable stream
2608    cdef readonly bint    is_remote      # Is htsfile a remote stream
2609    cdef readonly bint	  duplicate_filehandle   # Duplicate filehandle when opening via fh
2610
2611    cdef htsFile *_open_htsfile(self) except? NULL
2612