1from libc.stdint cimport int8_t, int16_t, int32_t, int64_t 2from libc.stdint cimport uint8_t, uint16_t, uint32_t, uint64_t 3from libc.stdlib cimport malloc, calloc, realloc, free 4from libc.string cimport memcpy, memcmp, strncpy, strlen, strdup 5from libc.stdio cimport FILE, printf 6from posix.types cimport off_t 7 8cdef extern from "Python.h": 9 FILE* PyFile_AsFile(object) 10 11 12# cython does not wrap stdarg 13cdef extern from "stdarg.h": 14 ctypedef struct va_list: 15 pass 16 17 18cdef extern from "htslib/kstring.h" nogil: 19 ctypedef struct kstring_t: 20 size_t l, m 21 char *s 22 23 int kputc(int c, kstring_t *s) 24 int kputw(int c, kstring_t *s) 25 int kputl(long c, kstring_t *s) 26 int ksprintf(kstring_t *s, const char *fmt, ...) 27 28 29cdef extern from "htslib_util.h" nogil: 30 int hts_set_verbosity(int verbosity) 31 int hts_get_verbosity() 32 33 ctypedef uint32_t khint32_t 34 ctypedef uint32_t khint_t 35 ctypedef khint_t khiter_t 36 37 # Used to manage BCF Header info 38 ctypedef struct vdict_t: 39 khint_t n_buckets, size, n_occupied, upper_bound 40 khint32_t *flags 41 const char *keys 42 bcf_idinfo_t *vals 43 44 # Used to manage indexed contigs in Tabix 45 ctypedef struct s2i_t: 46 khint_t n_buckets, size, n_occupied, upper_bound 47 khint32_t *flags 48 const char *keys 49 int64_t *vals 50 51 # Generic khash methods 52 khint_t kh_size(void *d) 53 khint_t kh_begin(void *d) 54 khint_t kh_end(void *d) 55 int kh_exist(void *d, khiter_t i) 56 57 # Specialized khash methods for vdict 58 khint_t kh_get_vdict(vdict_t *d, const char *key) 59 const char *kh_key_vdict "kh_key" (vdict_t *d, khint_t i) 60 bcf_idinfo_t kh_val_vdict "kh_val" (vdict_t *d, khint_t i) 61 62 63cdef extern from "htslib/hfile.h" nogil: 64 ctypedef struct hFILE 65 66 # @abstract Open the named file or URL as a stream 67 # @return An hFILE pointer, or NULL (with errno set) if an error occurred. 68 hFILE *hopen(const char *filename, const char *mode, ...) 69 70 # @abstract Associate a stream with an existing open file descriptor 71 # @return An hFILE pointer, or NULL (with errno set) if an error occurred. 72 # @notes For socket descriptors (on Windows), mode should contain 's'. 73 hFILE *hdopen(int fd, const char *mode) 74 75 # @abstract Report whether the file name or URL denotes remote storage 76 # @return 0 if local, 1 if remote. 77 # @notes "Remote" means involving e.g. explicit network access, with the 78 # implication that callers may wish to cache such files' contents locally. 79 int hisremote(const char *filename) 80 81 # @abstract Flush (for output streams) and close the stream 82 # @return 0 if successful, or EOF (with errno set) if an error occurred. 83 int hclose(hFILE *fp) 84 85 # @abstract Close the stream, without flushing or propagating errors 86 # @notes For use while cleaning up after an error only. Preserves errno. 87 void hclose_abruptly(hFILE *fp) 88 89 # @abstract Return the stream's error indicator 90 # @return Non-zero (in fact, an errno value) if an error has occurred. 91 # @notes This would be called herror() and return true/false to parallel 92 # ferror(3), but a networking-related herror(3) function already exists. */ 93 int herrno(hFILE *fp) 94 95 # @abstract Clear the stream's error indicator 96 void hclearerr(hFILE *fp) 97 98 # @abstract Reposition the read/write stream offset 99 # @return The resulting offset within the stream (as per lseek(2)), 100 # or negative if an error occurred. 101 off_t hseek(hFILE *fp, off_t offset, int whence) 102 103 # @abstract Report the current stream offset 104 # @return The offset within the stream, starting from zero. 105 off_t htell(hFILE *fp) 106 107 # @abstract Read one character from the stream 108 # @return The character read, or EOF on end-of-file or error 109 int hgetc(hFILE *fp) 110 111 # Read from the stream until the delimiter, up to a maximum length 112 # @param buffer The buffer into which bytes will be written 113 # @param size The size of the buffer 114 # @param delim The delimiter (interpreted as an `unsigned char`) 115 # @param fp The file stream 116 # @return The number of bytes read, or negative on error. 117 # @since 1.4 118 # 119 # Bytes will be read into the buffer up to and including a delimiter, until 120 # EOF is reached, or _size-1_ bytes have been written, whichever comes first. 121 # The string will then be terminated with a NUL byte (`\0`). 122 ssize_t hgetdelim(char *buffer, size_t size, int delim, hFILE *fp) 123 124 # Read a line from the stream, up to a maximum length 125 # @param buffer The buffer into which bytes will be written 126 # @param size The size of the buffer 127 # @param fp The file stream 128 # @return The number of bytes read, or negative on error. 129 # @since 1.4 130 # 131 # Specialization of hgetdelim() for a `\n` delimiter. 132 ssize_t hgetln(char *buffer, size_t size, hFILE *fp) 133 134 # Read a line from the stream, up to a maximum length 135 # @param buffer The buffer into which bytes will be written 136 # @param size The size of the buffer (must be > 1 to be useful) 137 # @param fp The file stream 138 # @return _buffer_ on success, or `NULL` if an error occurred. 139 # @since 1.4 140 # 141 # This function can be used as a replacement for `fgets(3)`, or together with 142 # kstring's `kgetline()` to read arbitrarily-long lines into a _kstring_t_. 143 char *hgets(char *buffer, int size, hFILE *fp) 144 145 # @abstract Peek at characters to be read without removing them from buffers 146 # @param fp The file stream 147 # @param buffer The buffer to which the peeked bytes will be written 148 # @param nbytes The number of bytes to peek at; limited by the size of the 149 # internal buffer, which could be as small as 4K. 150 # @return The number of bytes peeked, which may be less than nbytes if EOF 151 # is encountered; or negative, if there was an I/O error. 152 # @notes The characters peeked at remain in the stream's internal buffer, 153 # and will be returned by later hread() etc calls. 154 ssize_t hpeek(hFILE *fp, void *buffer, size_t nbytes) 155 156 # @abstract Read a block of characters from the file 157 # @return The number of bytes read, or negative if an error occurred. 158 # @notes The full nbytes requested will be returned, except as limited 159 # by EOF or I/O errors. 160 ssize_t hread(hFILE *fp, void *buffer, size_t nbytes) 161 162 # @abstract Write a character to the stream 163 # @return The character written, or EOF if an error occurred. 164 int hputc(int c, hFILE *fp) 165 166 # @abstract Write a string to the stream 167 # @return 0 if successful, or EOF if an error occurred. 168 int hputs(const char *text, hFILE *fp) 169 170 # @abstract Write a block of characters to the file 171 # @return Either nbytes, or negative if an error occurred. 172 # @notes In the absence of I/O errors, the full nbytes will be written. 173 ssize_t hwrite(hFILE *fp, const void *buffer, size_t nbytes) 174 175 # @abstract For writing streams, flush buffered output to the underlying stream 176 # @return 0 if successful, or EOF if an error occurred. 177 int hflush(hFILE *fp) 178 179 180cdef extern from "htslib/bgzf.h" nogil: 181 ctypedef struct bgzf_mtaux_t 182 ctypedef struct bgzidx_t 183 ctypedef struct z_stream 184 185 ctypedef struct BGZF: 186 unsigned errcode 187 unsigned is_write 188 int is_be 189 int compress_level 190 int is_compressed 191 int is_gzip 192 int cache_size 193 int64_t block_address 194 int64_t uncompressed_address 195 void *uncompressed_block 196 void *compressed_block 197 void *cache 198 hFILE *fp 199 bgzf_mtaux_t *mt 200 bgzidx_t *idx 201 int idx_build_otf 202 z_stream *gz_stream 203 204 #***************** 205 # Basic routines * 206 # *****************/ 207 208 # Open an existing file descriptor for reading or writing. 209 # 210 # @param fd file descriptor 211 # @param mode mode matching /[rwag][u0-9]+/: 'r' for reading, 'w' for 212 # writing, 'a' for appending, 'g' for gzip rather than BGZF 213 # compression (with 'w' only), and digit specifies the zlib 214 # compression level. 215 # Note that there is a distinction between 'u' and '0': the 216 # first yields plain uncompressed output whereas the latter 217 # outputs uncompressed data wrapped in the zlib format. 218 # @return BGZF file handler; 0 on error 219 220 BGZF* bgzf_dopen(int fd, const char *mode) 221 BGZF* bgzf_fdopen(int fd, const char *mode) # for backward compatibility 222 223 # Open the specified file for reading or writing. 224 BGZF* bgzf_open(const char* path, const char *mode) 225 226 # Open an existing hFILE stream for reading or writing. 227 BGZF* bgzf_hopen(hFILE *fp, const char *mode) 228 229 # Close the BGZF and free all associated resources. 230 # 231 # @param fp BGZF file handler 232 # @return 0 on success and -1 on error 233 int bgzf_close(BGZF *fp) 234 235 # Read up to _length_ bytes from the file storing into _data_. 236 # 237 # @param fp BGZF file handler 238 # @param data data array to read into 239 # @param length size of data to read 240 # @return number of bytes actually read; 0 on end-of-file and -1 on error 241 ssize_t bgzf_read(BGZF *fp, void *data, size_t length) 242 243 # Write _length_ bytes from _data_ to the file. If no I/O errors occur, 244 # the complete _length_ bytes will be written (or queued for writing). 245 # 246 # @param fp BGZF file handler 247 # @param data data array to write 248 # @param length size of data to write 249 # @return number of bytes written (i.e., _length_); negative on error 250 ssize_t bgzf_write(BGZF *fp, const void *data, size_t length) 251 252 # Read up to _length_ bytes directly from the underlying stream without 253 # decompressing. Bypasses BGZF blocking, so must be used with care in 254 # specialised circumstances only. 255 # 256 # @param fp BGZF file handler 257 # @param data data array to read into 258 # @param length number of raw bytes to read 259 # @return number of bytes actually read; 0 on end-of-file and -1 on error 260 ssize_t bgzf_raw_read(BGZF *fp, void *data, size_t length) 261 262 # Write _length_ bytes directly to the underlying stream without 263 # compressing. Bypasses BGZF blocking, so must be used with care 264 # in specialised circumstances only. 265 # 266 # @param fp BGZF file handler 267 # @param data data array to write 268 # @param length number of raw bytes to write 269 # @return number of bytes actually written; -1 on error 270 ssize_t bgzf_raw_write(BGZF *fp, const void *data, size_t length) 271 272 # Write the data in the buffer to the file. 273 int bgzf_flush(BGZF *fp) 274 275 int SEEK_SET 276 277 # Return a virtual file pointer to the current location in the file. 278 # No interpretation of the value should be made, other than a subsequent 279 # call to bgzf_seek can be used to position the file at the same point. 280 # Return value is non-negative on success. 281 int64_t bgzf_tell(BGZF *fp) 282 283 # Set the file to read from the location specified by _pos_. 284 # 285 # @param fp BGZF file handler 286 # @param pos virtual file offset returned by bgzf_tell() 287 # @param whence must be SEEK_SET 288 # @return 0 on success and -1 on error 289 # / 290 int64_t bgzf_seek(BGZF *fp, int64_t pos, int whence) 291 292 # Check if the BGZF end-of-file (EOF) marker is present 293 # 294 # @param fp BGZF file handler opened for reading 295 # @return 1 if the EOF marker is present and correct 296 # 2 if it can't be checked, e.g., because fp isn't seekable 297 # 0 if the EOF marker is absent 298 # -1 (with errno set) on error 299 int bgzf_check_EOF(BGZF *fp) 300 301 # Check if a file is in the BGZF format 302 # 303 # @param fn file name 304 # @return 1 if _fn_ is BGZF; 0 if not or on I/O error 305 int bgzf_is_bgzf(const char *fn) 306 307 #********************* 308 # Advanced routines * 309 #********************* 310 311 # Set the cache size. Only effective when compiled with -DBGZF_CACHE. 312 # 313 # @param fp BGZF file handler 314 # @param size size of cache in bytes; 0 to disable caching (default) 315 void bgzf_set_cache_size(BGZF *fp, int size) 316 317 # Flush the file if the remaining buffer size is smaller than _size_ 318 # @return 0 if flushing succeeded or was not needed; negative on error 319 int bgzf_flush_try(BGZF *fp, ssize_t size) 320 321 # Read one byte from a BGZF file. It is faster than bgzf_read() 322 # @param fp BGZF file handler 323 # @return byte read; -1 on end-of-file or error 324 int bgzf_getc(BGZF *fp) 325 326 # Read one line from a BGZF file. It is faster than bgzf_getc() 327 # 328 # @param fp BGZF file handler 329 # @param delim delimiter 330 # @param str string to write to; must be initialized 331 # @return length of the string; 0 on end-of-file; negative on error 332 int bgzf_getline(BGZF *fp, int delim, kstring_t *str) 333 334 # Read the next BGZF block. 335 int bgzf_read_block(BGZF *fp) 336 337 # Enable multi-threading (only effective on writing and when the 338 # library was compiled with -DBGZF_MT) 339 # 340 # @param fp BGZF file handler; must be opened for writing 341 # @param n_threads #threads used for writing 342 # @param n_sub_blks #blocks processed by each thread; a value 64-256 is recommended 343 int bgzf_mt(BGZF *fp, int n_threads, int n_sub_blks) 344 345 346 # Compress a single BGZF block. 347 # 348 # @param dst output buffer (must have size >= BGZF_MAX_BLOCK_SIZE) 349 # @param dlen size of output buffer; updated on return to the number 350 # of bytes actually written to dst 351 # @param src buffer to be compressed 352 # @param slen size of data to compress (must be <= BGZF_BLOCK_SIZE) 353 # @param level compression level 354 # @return 0 on success and negative on error 355 # 356 int bgzf_compress(void *dst, size_t *dlen, const void *src, size_t slen, int level) 357 358 #******************* 359 # bgzidx routines * 360 # BGZF at the uncompressed offset 361 # 362 # @param fp BGZF file handler; must be opened for reading 363 # @param uoffset file offset in the uncompressed data 364 # @param where SEEK_SET supported atm 365 # 366 # Returns 0 on success and -1 on error. 367 int bgzf_useek(BGZF *fp, long uoffset, int where) 368 369 # Position in uncompressed BGZF 370 # 371 # @param fp BGZF file handler; must be opened for reading 372 # 373 # Returns the current offset on success and -1 on error. 374 long bgzf_utell(BGZF *fp) 375 376 # Tell BGZF to build index while compressing. 377 # 378 # @param fp BGZF file handler; can be opened for reading or writing. 379 # 380 # Returns 0 on success and -1 on error. 381 int bgzf_index_build_init(BGZF *fp) 382 383 # Load BGZF index 384 # 385 # @param fp BGZF file handler 386 # @param bname base name 387 # @param suffix suffix to add to bname (can be NULL) 388 # 389 # Returns 0 on success and -1 on error. 390 int bgzf_index_load(BGZF *fp, const char *bname, const char *suffix) 391 392 # Save BGZF index 393 # 394 # @param fp BGZF file handler 395 # @param bname base name 396 # @param suffix suffix to add to bname (can be NULL) 397 # 398 # Returns 0 on success and -1 on error. 399 int bgzf_index_dump(BGZF *fp, const char *bname, const char *suffix) 400 401 402cdef extern from "htslib/hts.h" nogil: 403 uint32_t kroundup32(uint32_t x) 404 405 ctypedef struct cram_fd 406 407 union FilePointerUnion: 408 BGZF *bgzf 409 cram_fd *cram 410 hFILE *hfile 411 void *voidp 412 413 enum htsFormatCategory: 414 unknown_category 415 sequence_data # Sequence data -- SAM, BAM, CRAM, etc 416 variant_data # Variant calling data -- VCF, BCF, etc 417 index_file # Index file associated with some data file 418 region_list # Coordinate intervals or regions -- BED, etc 419 category_maximum 420 421 enum htsExactFormat: 422 unknown_format 423 binary_format 424 text_format 425 sam, bam, bai, cram, crai, vcf, bcf, csi, gzi, tbi, bed 426 format_maximum 427 428 enum htsCompression: 429 no_compression, gzip, bgzf, custom 430 compression_maximum 431 432 cdef enum hts_fmt_option: 433 CRAM_OPT_DECODE_MD, 434 CRAM_OPT_PREFIX, 435 CRAM_OPT_VERBOSITY, 436 CRAM_OPT_SEQS_PER_SLICE, 437 CRAM_OPT_SLICES_PER_CONTAINER, 438 CRAM_OPT_RANGE, 439 CRAM_OPT_VERSION, 440 CRAM_OPT_EMBED_REF, 441 CRAM_OPT_IGNORE_MD5, 442 CRAM_OPT_REFERENCE, 443 CRAM_OPT_MULTI_SEQ_PER_SLICE, 444 CRAM_OPT_NO_REF, 445 CRAM_OPT_USE_BZIP2, 446 CRAM_OPT_SHARED_REF, 447 CRAM_OPT_NTHREADS, 448 CRAM_OPT_THREAD_POOL, 449 CRAM_OPT_USE_LZMA, 450 CRAM_OPT_USE_RANS, 451 CRAM_OPT_REQUIRED_FIELDS, 452 HTS_OPT_COMPRESSION_LEVEL, 453 HTS_OPT_NTHREADS, 454 455 ctypedef struct htsVersion: 456 short major, minor 457 458 ctypedef struct htsFormat: 459 htsFormatCategory category 460 htsExactFormat format 461 htsVersion version 462 htsCompression compression 463 short compression_level 464 void *specific 465 466 ctypedef struct htsFile: 467 uint8_t is_bin 468 uint8_t is_write 469 uint8_t is_be 470 uint8_t is_cram 471 int64_t lineno 472 kstring_t line 473 char *fn 474 char *fn_aux 475 FilePointerUnion fp 476 htsFormat format 477 478 int hts_verbose 479 480 cdef union hts_opt_val_union: 481 int i 482 char *s 483 484 ctypedef struct hts_opt: 485 char *arg 486 hts_fmt_option opt 487 hts_opt_val_union val 488 void *next 489 490 # @abstract Parses arg and appends it to the option list. 491 # @return 0 on success and -1 on failure 492 int hts_opt_add(hts_opt **opts, const char *c_arg) 493 494 # @abstract Applies an hts_opt option list to a given htsFile. 495 # @return 0 on success and -1 on failure 496 int hts_opt_apply(htsFile *fp, hts_opt *opts) 497 498 # @abstract Frees an hts_opt list. 499 void hts_opt_free(hts_opt *opts) 500 501 # @abstract Table for converting a nucleotide character to 4-bit encoding. 502 # The input character may be either an IUPAC ambiguity code, '=' for 0, or 503 # '0'/'1'/'2'/'3' for a result of 1/2/4/8. The result is encoded as 1/2/4/8 504 # for A/C/G/T or combinations of these bits for ambiguous bases. 505 const unsigned char *seq_nt16_table 506 507 # @abstract Table for converting a 4-bit encoded nucleotide to an IUPAC 508 # ambiguity code letter (or '=' when given 0). 509 const char *seq_nt16_str 510 511 # @abstract Table for converting a 4-bit encoded nucleotide to about 2 bits. 512 # Returns 0/1/2/3 for 1/2/4/8 (i.e., A/C/G/T), or 4 otherwise (0 or ambiguous). 513 const int *seq_nt16_int 514 515 # @abstract Get the htslib version number 516 # @return For released versions, a string like "N.N[.N]"; or git describe 517 # output if using a library built within a Git repository. 518 const char *hts_version() 519 520 # @abstract Determine format by peeking at the start of a file 521 # @param fp File opened for reading, positioned at the beginning 522 # @param fmt Format structure that will be filled out on return 523 # @return 0 for success, or negative if an error occurred. 524 int hts_detect_format(hFILE *fp, htsFormat *fmt) 525 526 # @abstract Get a human-readable description of the file format 527 # @return Description string, to be freed by the caller after use. 528 char *hts_format_description(const htsFormat *format) 529 530 # @abstract Open a SAM/BAM/CRAM/VCF/BCF/etc file 531 # @param fn The file name or "-" for stdin/stdout 532 # @param mode Mode matching / [rwa][bceguxz0-9]* / 533 # @discussion 534 # With 'r' opens for reading; any further format mode letters are ignored 535 # as the format is detected by checking the first few bytes or BGZF blocks 536 # of the file. With 'w' or 'a' opens for writing or appending, with format 537 # specifier letters: 538 # b binary format (BAM, BCF, etc) rather than text (SAM, VCF, etc) 539 # c CRAM format 540 # g gzip compressed 541 # u uncompressed 542 # z bgzf compressed 543 # [0-9] zlib compression level 544 # and with non-format option letters (for any of 'r'/'w'/'a'): 545 # e close the file on exec(2) (opens with O_CLOEXEC, where supported) 546 # x create the file exclusively (opens with O_EXCL, where supported) 547 # Note that there is a distinction between 'u' and '0': the first yields 548 # plain uncompressed output whereas the latter outputs uncompressed data 549 # wrapped in the zlib format. 550 # @example 551 # [rw]b .. compressed BCF, BAM, FAI 552 # [rw]bu .. uncompressed BCF 553 # [rw]z .. compressed VCF 554 # [rw] .. uncompressed VCF 555 htsFile *hts_open(const char *fn, const char *mode) 556 557 # @abstract Open a SAM/BAM/CRAM/VCF/BCF/etc file 558 # @param fn The file name or "-" for stdin/stdout 559 # @param mode Open mode, as per hts_open() 560 # @param fmt Optional format specific parameters 561 # @discussion 562 # See hts_open() for description of fn and mode. 563 # // TODO Update documentation for s/opts/fmt/ 564 # Opts contains a format string (sam, bam, cram, vcf, bcf) which will, 565 # if defined, override mode. Opts also contains a linked list of hts_opt 566 # structures to apply to the open file handle. These can contain things 567 # like pointers to the reference or information on compression levels, 568 # block sizes, etc. 569 htsFile *hts_open_format(const char *fn, const char *mode, const htsFormat *fmt) 570 571 # @abstract Open an existing stream as a SAM/BAM/CRAM/VCF/BCF/etc file 572 # @param fp The already-open file handle 573 # @param fn The file name or "-" for stdin/stdout 574 # @param mode Open mode, as per hts_open() 575 htsFile *hts_hopen(hFILE *fp, const char *fn, const char *mode) 576 577 # @abstract Close a file handle, flushing buffered data for output streams 578 # @param fp The file handle to be closed 579 # @return 0 for success, or negative if an error occurred. 580 int hts_close(htsFile *fp) 581 582 # @abstract Returns the file's format information 583 # @param fp The file handle 584 # @return Read-only pointer to the file's htsFormat. 585 const htsFormat *hts_get_format(htsFile *fp) 586 587 # @ abstract Returns a string containing the file format extension. 588 # @ param format Format structure containing the file type. 589 # @ return A string ("sam", "bam", etc) or "?" for unknown formats. 590 const char *hts_format_file_extension(const htsFormat *format) 591 592 # @abstract Sets a specified CRAM option on the open file handle. 593 # @param fp The file handle open the open file. 594 # @param opt The CRAM_OPT_* option. 595 # @param ... Optional arguments, dependent on the option used. 596 # @return 0 for success, or negative if an error occurred. 597 int hts_set_opt(htsFile *fp, hts_fmt_option opt, ...) 598 599 int hts_getline(htsFile *fp, int delimiter, kstring_t *str) 600 char **hts_readlines(const char *fn, int *_n) 601 602 # @abstract Parse comma-separated list or read list from a file 603 # @param list File name or comma-separated list 604 # @param is_file 605 # @param _n Size of the output array (number of items read) 606 # @return NULL on failure or pointer to newly allocated array of 607 # strings 608 char **hts_readlist(const char *fn, int is_file, int *_n) 609 610 # @abstract Create extra threads to aid compress/decompression for this file 611 # @param fp The file handle 612 # @param n The number of worker threads to create 613 # @return 0 for success, or negative if an error occurred. 614 # @notes THIS THREADING API IS LIKELY TO CHANGE IN FUTURE. 615 int hts_set_threads(htsFile *fp, int n) 616 617 # @abstract Set .fai filename for a file opened for reading 618 # @return 0 for success, negative on failure 619 # @discussion 620 # Called before *_hdr_read(), this provides the name of a .fai file 621 # used to provide a reference list if the htsFile contains no @SQ headers. 622 int hts_set_fai_filename(htsFile *fp, const char *fn_aux) 623 624 int8_t HTS_IDX_NOCOOR 625 int8_t HTS_IDX_START 626 int8_t HTS_IDX_REST 627 int8_t HTS_IDX_NONE 628 629 int8_t HTS_FMT_CSI 630 int8_t HTS_FMT_BAI 631 int8_t HTS_FMT_TBI 632 int8_t HTS_FMT_CRAI 633 634 BGZF *hts_get_bgzfp(htsFile *fp) 635 636 ctypedef struct hts_idx_t 637 638 ctypedef struct hts_pair64_t: 639 uint64_t u, v 640 641 ctypedef int hts_readrec_func(BGZF *fp, void *data, void *r, int *tid, int *beg, int *end) 642 643 ctypedef struct hts_bins_t: 644 int n, m 645 int *a 646 647 ctypedef struct hts_itr_t: 648 uint32_t read_rest 649 uint32_t finished 650 int tid, bed, end, n_off, i 651 int curr_tid, curr_beg, curr_end 652 uint64_t curr_off 653 hts_pair64_t *off 654 hts_readrec_func *readfunc 655 hts_bins_t bins 656 657 hts_idx_t *hts_idx_init(int n, int fmt, uint64_t offset0, int min_shift, int n_lvls) 658 void hts_idx_destroy(hts_idx_t *idx) 659 int hts_idx_push(hts_idx_t *idx, int tid, int beg, int end, uint64_t offset, int is_mapped) 660 void hts_idx_finish(hts_idx_t *idx, uint64_t final_offset) 661 662 #### Save an index to a file 663 # @param idx Index to be written 664 # @param fn Input BAM/BCF/etc filename, to which .bai/.csi/etc will be added 665 # @param fmt One of the HTS_FMT_* index formats 666 # @return 0 if successful, or negative if an error occurred. 667 int hts_idx_save(const hts_idx_t *idx, const char *fn, int fmt) 668 669 #### Save an index to a specific file 670 # @param idx Index to be written 671 # @param fn Input BAM/BCF/etc filename 672 # @param fnidx Output filename, or NULL to add .bai/.csi/etc to @a fn 673 # @param fmt One of the HTS_FMT_* index formats 674 # @return 0 if successful, or negative if an error occurred. 675 int hts_idx_save_as(const hts_idx_t *idx, const char *fn, const char *fnidx, int fmt) 676 677 #### Load an index file 678 # @param fn BAM/BCF/etc filename, to which .bai/.csi/etc will be added or 679 # the extension substituted, to search for an existing index file 680 # @param fmt One of the HTS_FMT_* index formats 681 # @return The index, or NULL if an error occurred. 682 hts_idx_t *hts_idx_load(const char *fn, int fmt) 683 684 #### Load a specific index file 685 # @param fn Input BAM/BCF/etc filename 686 # @param fnidx The input index filename 687 # @return The index, or NULL if an error occurred. 688 hts_idx_t *hts_idx_load2(const char *fn, const char *fnidx) 689 690 uint8_t *hts_idx_get_meta(hts_idx_t *idx, uint32_t *l_meta) 691 void hts_idx_set_meta(hts_idx_t *idx, int l_meta, uint8_t *meta, int is_copy) 692 693 int hts_idx_get_stat(const hts_idx_t* idx, int tid, 694 uint64_t* mapped, uint64_t* unmapped) 695 696 uint64_t hts_idx_get_n_no_coor(const hts_idx_t* idx) 697 698 int HTS_PARSE_THOUSANDS_SEP # Ignore ',' separators within numbers 699 700 # Parse a numeric string 701 # The number may be expressed in scientific notation, and optionally may 702 # contain commas in the integer part (before any decimal point or E notation). 703 # @param str String to be parsed 704 # @param strend If non-NULL, set on return to point to the first character 705 # in @a str after those forming the parsed number 706 # @param flags Or'ed-together combination of HTS_PARSE_* flags 707 # @return Converted value of the parsed number. 708 # 709 # When @a strend is NULL, a warning will be printed (if hts_verbose is 2 710 # or more) if there are any trailing characters after the number. 711 long long hts_parse_decimal(const char *str, char **strend, int flags) 712 713 # Parse a "CHR:START-END"-style region string 714 # @param str String to be parsed 715 # @param beg Set on return to the 0-based start of the region 716 # @param end Set on return to the 1-based end of the region 717 # @return Pointer to the colon or '\0' after the reference sequence name, 718 # or NULL if @a str could not be parsed. 719 const char *hts_parse_reg(const char *str, int *beg, int *end) 720 721 hts_itr_t *hts_itr_query(const hts_idx_t *idx, int tid, int beg, int end, hts_readrec_func *readrec) 722 void hts_itr_destroy(hts_itr_t *iter) 723 724 ctypedef int (*hts_name2id_f)(void*, const char*) 725 ctypedef const char *(*hts_id2name_f)(void*, int) 726 ctypedef hts_itr_t *hts_itr_query_func( 727 const hts_idx_t *idx, 728 int tid, 729 int beg, 730 int end, 731 hts_readrec_func *readrec) 732 733 hts_itr_t *hts_itr_querys( 734 const hts_idx_t *idx, 735 const char *reg, 736 hts_name2id_f getid, 737 void *hdr, 738 hts_itr_query_func *itr_query, 739 hts_readrec_func *readrec) 740 741 int hts_itr_next(BGZF *fp, hts_itr_t *iter, void *r, void *data) 742 const char **hts_idx_seqnames(const hts_idx_t *idx, int *n, hts_id2name_f getid, void *hdr) # free only the array, not the values 743 744 # hts_file_type() - Convenience function to determine file type 745 # @fname: the file name 746 # 747 # Returns one of the FT_* defines. 748 # 749 # DEPRECATED: This function has been replaced by hts_detect_format(). 750 # It and these FT_* macros will be removed in a future HTSlib release. 751 int FT_UNKN 752 int FT_GZ 753 int FT_VCF 754 int FT_VCF_GZ 755 int FT_BCF 756 int FT_BCF_GZ 757 int FT_STDIN 758 759 int hts_file_type(const char *fname) 760 761 # /*************************** 762 # * Revised MAQ error model * 763 # ***************************/ 764 765 ctypedef struct errmod_t 766 767 errmod_t *errmod_init(double depcorr) 768 void errmod_destroy(errmod_t *em) 769 770 # /* 771 # n: number of bases 772 # m: maximum base 773 # bases[i]: qual:6, strand:1, base:4 774 # q[i*m+j]: phred-scaled likelihood of (i,j) 775 # */ 776 int errmod_cal(const errmod_t *em, int n, int m, uint16_t *bases, float *Probabilistic) 777 778 # /***************************************** 779 # * q banded glocal alignment * 780 # *****************************************/ 781 782 ctypedef struct probaln_par_t: 783 float d, e 784 int bw 785 786 int probaln_glocal(const uint8_t *ref, 787 int l_ref, 788 const uint8_t *query, 789 int l_query, const uint8_t *iqual, 790 const probaln_par_t *c, 791 int *state, uint8_t *q) 792 793 # /********************** 794 # * MD5 implementation * 795 # **********************/ 796 797 ctypedef struct hts_md5_context 798 799 # /*! @abstract Initialises an MD5 context. 800 # * @discussion 801 # * The expected use is to allocate an hts_md5_context using 802 # * hts_md5_init(). This pointer is then passed into one or more calls 803 # * of hts_md5_update() to compute successive internal portions of the 804 # * MD5 sum, which can then be externalised as a full 16-byte MD5sum 805 # * calculation by calling hts_md5_final(). This can then be turned 806 # * into ASCII via hts_md5_hex(). 807 # * 808 # * To dealloate any resources created by hts_md5_init() call the 809 # * hts_md5_destroy() function. 810 # * 811 # * @return hts_md5_context pointer on success, NULL otherwise. 812 # */ 813 hts_md5_context *hts_md5_init() 814 815 # /*! @abstract Updates the context with the MD5 of the data. */ 816 void hts_md5_update(hts_md5_context *ctx, const void *data, unsigned long size) 817 818 # /*! @abstract Computes the final 128-bit MD5 hash from the given context */ 819 void hts_md5_final(unsigned char *digest, hts_md5_context *ctx) 820 821 # /*! @abstract Resets an md5_context to the initial state, as returned 822 # * by hts_md5_init(). 823 # */ 824 void hts_md5_reset(hts_md5_context *ctx) 825 826 # /*! @abstract Converts a 128-bit MD5 hash into a 33-byte nul-termninated 827 # * hex string. 828 # */ 829 void hts_md5_hex(char *hex, const unsigned char *digest) 830 831 # /*! @abstract Deallocates any memory allocated by hts_md5_init. */ 832 void hts_md5_destroy(hts_md5_context *ctx) 833 834 int hts_reg2bin(int64_t beg, int64_t end, int min_shift, int n_lvls) 835 int hts_bin_bot(int bin, int n_lvls) 836 837 # * Endianness * 838 int ed_is_big() 839 uint16_t ed_swap_2(uint16_t v) 840 void *ed_swap_2p(void *x) 841 uint32_t ed_swap_4(uint32_t v) 842 void *ed_swap_4p(void *x) 843 uint64_t ed_swap_8(uint64_t v) 844 void *ed_swap_8p(void *x) 845 846 847cdef extern from "htslib/sam.h" nogil: 848 #********************** 849 #*** SAM/BAM header *** 850 #********************** 851 852 # @abstract Structure for the alignment header. 853 # @field n_targets number of reference sequences 854 # @field l_text length of the plain text in the header 855 # @field target_len lengths of the reference sequences 856 # @field target_name names of the reference sequences 857 # @field text plain text 858 # @field sdict header dictionary 859 860 ctypedef struct bam_hdr_t: 861 int32_t n_targets, ignore_sam_err 862 uint32_t l_text 863 uint32_t *target_len 864 uint8_t *cigar_tab 865 char **target_name 866 char *text 867 void *sdict 868 869 #**************************** 870 #*** CIGAR related macros *** 871 #**************************** 872 873 int BAM_CMATCH 874 int BAM_CINS 875 int BAM_CDEL 876 int BAM_CREF_SKIP 877 int BAM_CSOFT_CLIP 878 int BAM_CHARD_CLIP 879 int BAM_CPAD 880 int BAM_CEQUAL 881 int BAM_CDIFF 882 int BAM_CBACK 883 884 char *BAM_CIGAR_STR 885 int BAM_CIGAR_SHIFT 886 uint32_t BAM_CIGAR_MASK 887 uint32_t BAM_CIGAR_TYPE 888 889 char bam_cigar_op(uint32_t c) 890 uint32_t bam_cigar_oplen(uint32_t c) 891 char bam_cigar_opchr(uint32_t) 892 uint32_t bam_cigar_gen(char, uint32_t) 893 int bam_cigar_type(char o) 894 895 # @abstract the read is paired in sequencing, no matter whether it is mapped in a pair 896 int BAM_FPAIRED 897 # @abstract the read is mapped in a proper pair 898 int BAM_FPROPER_PAIR 899 # @abstract the read itself is unmapped; conflictive with BAM_FPROPER_PAIR 900 int BAM_FUNMAP 901 # @abstract the mate is unmapped 902 int BAM_FMUNMAP 903 # @abstract the read is mapped to the reverse strand 904 int BAM_FREVERSE 905 # @abstract the mate is mapped to the reverse strand 906 int BAM_FMREVERSE 907 # @abstract this is read1 908 int BAM_FREAD1 909 # @abstract this is read2 910 int BAM_FREAD2 911 # @abstract not primary alignment 912 int BAM_FSECONDARY 913 # @abstract QC failure 914 int BAM_FQCFAIL 915 # @abstract optical or PCR duplicate 916 int BAM_FDUP 917 # @abstract supplementary alignment 918 int BAM_FSUPPLEMENTARY 919 920 #************************* 921 #*** Alignment records *** 922 #************************* 923 924 # @abstract Structure for core alignment information. 925 # @field tid chromosome ID, defined by bam_hdr_t 926 # @field pos 0-based leftmost coordinate 927 # @field bin bin calculated by bam_reg2bin() 928 # @field qual mapping quality 929 # @field l_qname length of the query name 930 # @field flag bitwise flag 931 # @field n_cigar number of CIGAR operations 932 # @field l_qseq length of the query sequence (read) 933 # @field mtid chromosome ID of next read in template, defined by bam_hdr_t 934 # @field mpos 0-based leftmost coordinate of next read in template 935 936 ctypedef struct bam1_core_t: 937 int32_t tid 938 int32_t pos 939 uint16_t bin 940 uint8_t qual 941 uint8_t l_qname 942 uint16_t flag 943 uint8_t unused1 944 uint8_t l_extranul 945 uint32_t n_cigar 946 int32_t l_qseq 947 int32_t mtid 948 int32_t mpos 949 int32_t isize 950 951 # @abstract Structure for one alignment. 952 # @field core core information about the alignment 953 # @field l_data current length of bam1_t::data 954 # @field m_data maximum length of bam1_t::data 955 # @field data all variable-length data, concatenated; structure: qname-cigar-seq-qual-aux 956 # 957 # @discussion Notes: 958 # 959 # 1. qname is zero tailing and core.l_qname includes the tailing '\0'. 960 # 2. l_qseq is calculated from the total length of an alignment block 961 # on reading or from CIGAR. 962 # 3. cigar data is encoded 4 bytes per CIGAR operation. 963 # 4. seq is nybble-encoded according to seq_nt16_table. 964 ctypedef struct bam1_t: 965 bam1_core_t core 966 int l_data 967 uint32_t m_data 968 uint8_t *data 969 uint64_t id 970 971 # @abstract Get whether the query is on the reverse strand 972 # @param b pointer to an alignment 973 # @return boolean true if query is on the reverse strand 974 int bam_is_rev(bam1_t *b) 975 976 # @abstract Get whether the query's mate is on the reverse strand 977 # @param b pointer to an alignment 978 # @return boolean true if query's mate on the reverse strand 979 int bam_is_mrev(bam1_t *b) 980 981 # @abstract Get the name of the query 982 # @param b pointer to an alignment 983 # @return pointer to the name string, null terminated 984 char *bam_get_qname(bam1_t *b) 985 986 # @abstract Get the CIGAR array 987 # @param b pointer to an alignment 988 # @return pointer to the CIGAR array 989 # 990 # @discussion In the CIGAR array, each element is a 32-bit integer. The 991 # lower 4 bits gives a CIGAR operation and the higher 28 bits keep the 992 # length of a CIGAR. 993 uint32_t *bam_get_cigar(bam1_t *b) 994 995 # @abstract Get query sequence 996 # @param b pointer to an alignment 997 # @return pointer to sequence 998 # 999 # @discussion Each base is encoded in 4 bits: 1 for A, 2 for C, 4 for G, 1000 # 8 for T and 15 for N. Two bases are packed in one byte with the base 1001 # at the higher 4 bits having smaller coordinate on the read. It is 1002 # recommended to use bam_seqi() macro to get the base. 1003 char *bam_get_seq(bam1_t *b) 1004 1005 # @abstract Get query quality 1006 # @param b pointer to an alignment 1007 # @return pointer to quality string 1008 uint8_t *bam_get_qual(bam1_t *b) 1009 1010 # @abstract Get auxiliary data 1011 # @param b pointer to an alignment 1012 # @return pointer to the concatenated auxiliary data 1013 uint8_t *bam_get_aux(bam1_t *b) 1014 1015 # @abstract Get length of auxiliary data 1016 # @param b pointer to an alignment 1017 # @return length of the concatenated auxiliary data 1018 int bam_get_l_aux(bam1_t *b) 1019 1020 # @abstract Get a base on read 1021 # @param s Query sequence returned by bam1_seq() 1022 # @param i The i-th position, 0-based 1023 # @return 4-bit integer representing the base. 1024 char bam_seqi(char *s, int i) 1025 1026 #************************** 1027 #*** Exported functions *** 1028 #************************** 1029 1030 #*************** 1031 #*** BAM I/O *** 1032 #*************** 1033 1034 bam_hdr_t *bam_hdr_init() 1035 bam_hdr_t *bam_hdr_read(BGZF *fp) 1036 int bam_hdr_write(BGZF *fp, const bam_hdr_t *h) 1037 void bam_hdr_destroy(bam_hdr_t *h) 1038 int bam_name2id(bam_hdr_t *h, const char *ref) 1039 bam_hdr_t* bam_hdr_dup(const bam_hdr_t *h0) 1040 1041 bam1_t *bam_init1() 1042 void bam_destroy1(bam1_t *b) 1043 int bam_read1(BGZF *fp, bam1_t *b) 1044 int bam_write1(BGZF *fp, const bam1_t *b) 1045 bam1_t *bam_copy1(bam1_t *bdst, const bam1_t *bsrc) 1046 bam1_t *bam_dup1(const bam1_t *bsrc) 1047 1048 int bam_cigar2qlen(int n_cigar, const uint32_t *cigar) 1049 int bam_cigar2rlen(int n_cigar, const uint32_t *cigar) 1050 1051 # @abstract Calculate the rightmost base position of an alignment on the 1052 # reference genome. 1053 1054 # @param b pointer to an alignment 1055 # @return the coordinate of the first base after the alignment, 0-based 1056 1057 # @discussion For a mapped read, this is just b->core.pos + bam_cigar2rlen. 1058 # For an unmapped read (either according to its flags or if it has no cigar 1059 # string), we return b->core.pos + 1 by convention. 1060 int32_t bam_endpos(const bam1_t *b) 1061 1062 int bam_str2flag(const char *str) # returns negative value on error 1063 char *bam_flag2str(int flag) # The string must be freed by the user 1064 1065 #************************* 1066 #*** BAM/CRAM indexing *** 1067 #************************* 1068 1069 # These BAM iterator functions work only on BAM files. To work with either 1070 # BAM or CRAM files use the sam_index_load() & sam_itr_*() functions. 1071 void bam_itr_destroy(hts_itr_t *iter) 1072 hts_itr_t *bam_itr_queryi(const hts_idx_t *idx, int tid, int beg, int end) 1073 hts_itr_t *bam_itr_querys(const hts_idx_t *idx, bam_hdr_t *hdr, const char *region) 1074 int bam_itr_next(htsFile *htsfp, hts_itr_t *itr, void *r) 1075 1076 # Load/build .csi or .bai BAM index file. Does not work with CRAM. 1077 # It is recommended to use the sam_index_* functions below instead. 1078 hts_idx_t *bam_index_load(const char *fn) 1079 int bam_index_build(const char *fn, int min_shift) 1080 1081 # Load a BAM (.csi or .bai) or CRAM (.crai) index file 1082 # @param fp File handle of the data file whose index is being opened 1083 # @param fn BAM/CRAM/etc filename to search alongside for the index file 1084 # @return The index, or NULL if an error occurred. 1085 hts_idx_t *sam_index_load(htsFile *fp, const char *fn) 1086 1087 # Load a specific BAM (.csi or .bai) or CRAM (.crai) index file 1088 # @param fp File handle of the data file whose index is being opened 1089 # @param fn BAM/CRAM/etc data file filename 1090 # @param fnidx Index filename, or NULL to search alongside @a fn 1091 # @return The index, or NULL if an error occurred. 1092 hts_idx_t *sam_index_load2(htsFile *fp, const char *fn, const char *fnidx) 1093 1094 # Generate and save an index file 1095 # @param fn Input BAM/etc filename, to which .csi/etc will be added 1096 # @param min_shift Positive to generate CSI, or 0 to generate BAI 1097 # @return 0 if successful, or negative if an error occurred (usually -1; or 1098 # -2: opening fn failed; -3: format not indexable) 1099 int sam_index_build(const char *fn, int min_shift) 1100 1101 # Generate and save an index to a specific file 1102 # @param fn Input BAM/CRAM/etc filename 1103 # @param fnidx Output filename, or NULL to add .bai/.csi/etc to @a fn 1104 # @param min_shift Positive to generate CSI, or 0 to generate BAI 1105 # @return 0 if successful, or negative if an error occurred. 1106 int sam_index_build2(const char *fn, const char *fnidx, int min_shift) 1107 1108 void sam_itr_destroy(hts_itr_t *iter) 1109 hts_itr_t *sam_itr_queryi(const hts_idx_t *idx, int tid, int beg, int end) 1110 hts_itr_t *sam_itr_querys(const hts_idx_t *idx, bam_hdr_t *hdr, const char *region) 1111 int sam_itr_next(htsFile *htsfp, hts_itr_t *itr, void *r) 1112 1113 #*************** 1114 #*** SAM I/O *** 1115 #*************** 1116 1117 htsFile *sam_open(const char *fn, const char *mode) 1118 htsFile *sam_open_format(const char *fn, const char *mode, const htsFormat *fmt) 1119 int sam_close(htsFile *fp) 1120 1121 int sam_open_mode(char *mode, const char *fn, const char *format) 1122 1123 # A version of sam_open_mode that can handle ,key=value options. 1124 # The format string is allocated and returned, to be freed by the caller. 1125 # Prefix should be "r" or "w", 1126 char *sam_open_mode_opts(const char *fn, const char *mode, const char *format) 1127 1128 bam_hdr_t *sam_hdr_parse(int l_text, const char *text) 1129 bam_hdr_t *sam_hdr_read(htsFile *fp) 1130 int sam_hdr_write(htsFile *fp, const bam_hdr_t *h) 1131 1132 int sam_parse1(kstring_t *s, bam_hdr_t *h, bam1_t *b) 1133 int sam_format1(const bam_hdr_t *h, const bam1_t *b, kstring_t *str) 1134 int sam_read1(htsFile *fp, bam_hdr_t *h, bam1_t *b) 1135 int sam_write1(htsFile *fp, const bam_hdr_t *h, const bam1_t *b) 1136 1137 #************************************* 1138 #*** Manipulating auxiliary fields *** 1139 #************************************* 1140 1141 uint8_t *bam_aux_get(const bam1_t *b, const char *tag) 1142 int64_t bam_aux2i(const uint8_t *s) 1143 double bam_aux2f(const uint8_t *s) 1144 char bam_aux2A(const uint8_t *s) 1145 char *bam_aux2Z(const uint8_t *s) 1146 1147 void bam_aux_append(bam1_t *b, const char *tag, char type, int len, uint8_t *data) 1148 int bam_aux_del(bam1_t *b, uint8_t *s) 1149 1150 #************************** 1151 #*** Pileup and Mpileup *** 1152 #************************** 1153 1154 # @abstract Generic pileup 'client data'. 1155 # @discussion The pileup iterator allows setting a constructor and 1156 # destructor function, which will be called every time a sequence is 1157 # fetched and discarded. This permits caching of per-sequence data in 1158 # a tidy manner during the pileup process. This union is the cached 1159 # data to be manipulated by the "client" (the caller of pileup). 1160 # 1161 union bam_pileup_cd: 1162 void *p 1163 int64_t i 1164 double f 1165 1166 # @abstract Structure for one alignment covering the pileup position. 1167 # @field b pointer to the alignment 1168 # @field qpos position of the read base at the pileup site, 0-based 1169 # @field indel indel length; 0 for no indel, positive for ins and negative for del 1170 # @field level the level of the read in the "viewer" mode 1171 # @field is_del 1 iff the base on the padded read is a deletion 1172 # @field is_head ??? 1173 # @field is_tail ??? 1174 # @field is_refskip ??? 1175 # @field aux ??? 1176 # 1177 # @discussion See also bam_plbuf_push() and bam_lplbuf_push(). The 1178 # difference between the two functions is that the former does not 1179 # set bam_pileup1_t::level, while the later does. Level helps the 1180 # implementation of alignment viewers, but calculating this has some 1181 # overhead. 1182 # 1183 # is_del, is_head, etc are a bit field, declaring as below should 1184 # work as expected, see 1185 # https://groups.google.com/forum/#!msg/cython-users/24tD1kwRY7A/pmoPuSmanM0J 1186 1187 ctypedef struct bam_pileup1_t: 1188 bam1_t *b 1189 int32_t qpos 1190 int indel, level 1191 uint32_t is_del 1192 uint32_t is_head 1193 uint32_t is_tail 1194 uint32_t is_refskip 1195 uint32_t aux 1196 bam_pileup_cd cd 1197 1198 ctypedef int (*bam_plp_auto_f)(void *data, bam1_t *b) 1199 ctypedef int (*bam_test_f)() 1200 1201 ctypedef struct __bam_plp_t 1202 ctypedef __bam_plp_t *bam_plp_t 1203 1204 ctypedef struct __bam_mplp_t 1205 ctypedef __bam_mplp_t *bam_mplp_t 1206 1207 # bam_plp_init() - sets an iterator over multiple 1208 # @func: see mplp_func in bam_plcmd.c in samtools for an example. Expected return 1209 # status: 0 on success, -1 on end, < -1 on non-recoverable errors 1210 # @data: user data to pass to @func 1211 bam_plp_t bam_plp_init(bam_plp_auto_f func, void *data) 1212 void bam_plp_destroy(bam_plp_t iter) 1213 int bam_plp_push(bam_plp_t iter, const bam1_t *b) 1214 const bam_pileup1_t *bam_plp_next(bam_plp_t iter, int *_tid, int *_pos, int *_n_plp) 1215 const bam_pileup1_t *bam_plp_auto(bam_plp_t iter, int *_tid, int *_pos, int *_n_plp) 1216 void bam_plp_set_maxcnt(bam_plp_t iter, int maxcnt) 1217 void bam_plp_reset(bam_plp_t iter) 1218 1219 bam_mplp_t bam_mplp_init(int n, bam_plp_auto_f func, void **data) 1220 1221 # bam_mplp_init_overlaps() - if called, mpileup will detect overlapping 1222 # read pairs and for each base pair set the base quality of the 1223 # lower-quality base to zero, thus effectively discarding it from 1224 # calling. If the two bases are identical, the quality of the other base 1225 # is increased to the sum of their qualities (capped at 200), otherwise 1226 # it is multiplied by 0.8. 1227 void bam_mplp_init_overlaps(bam_mplp_t iter) 1228 void bam_mplp_destroy(bam_mplp_t iter) 1229 void bam_mplp_set_maxcnt(bam_mplp_t iter, int maxcnt) 1230 int bam_mplp_auto(bam_mplp_t iter, int *_tid, int *_pos, int *n_plp, const bam_pileup1_t **plp) 1231 void bam_mplp_reset(bam_mplp_t iter) 1232 void bam_mplp_constructor(bam_mplp_t iter, 1233 int (*func)(void *data, const bam1_t *b, bam_pileup_cd *cd)) 1234 void bam_mplp_destructor(bam_mplp_t iter, 1235 int (*func)(void *data, const bam1_t *b, bam_pileup_cd *cd)) 1236 1237 # Added by AH 1238 # ctypedef bam_pileup1_t * const_bam_pileup1_t_ptr "const bam_pileup1_t *" 1239 1240 # *********************************** 1241 # * BAQ calculation and realignment * 1242 # ***********************************/ 1243 int sam_cap_mapq(bam1_t *b, const char *ref, int ref_len, int thres) 1244 int sam_prob_realn(bam1_t *b, const char *ref, int ref_len, int flag) 1245 1246 1247cdef extern from "htslib/faidx.h" nogil: 1248 1249 ctypedef struct faidx_t: 1250 pass 1251 1252 # /// Build index for a FASTA or bgzip-compressed FASTA file. 1253 # /** @param fn FASTA file name 1254 # @param fnfai Name of .fai file to build. 1255 # @param fngzi Name of .gzi file to build (if fn is bgzip-compressed). 1256 # @return 0 on success; or -1 on failure 1257 1258 # If fnfai is NULL, ".fai" will be appended to fn to make the FAI file name. 1259 # If fngzi is NULL, ".gzi" will be appended to fn for the GZI file. The GZI 1260 # file will only be built if fn is bgzip-compressed. 1261 # */ 1262 int fai_build3(const char *fn, 1263 const char *fnfai, 1264 const char *fngzi) 1265 1266 # /// Build index for a FASTA or bgzip-compressed FASTA file. 1267 # /** @param fn FASTA file name 1268 # @return 0 on success; or -1 on failure 1269 # 1270 # File "fn.fai" will be generated. This function is equivalent to 1271 # fai_build3(fn, NULL, NULL); 1272 # */ 1273 int fai_build(char *fn) 1274 1275 # /// Destroy a faidx_t struct 1276 void fai_destroy(faidx_t *fai) 1277 1278 # /// Load FASTA indexes. 1279 # /** @param fn File name of the FASTA file (can be compressed with bgzip). 1280 # @param fnfai File name of the FASTA index. 1281 # @param fngzi File name of the bgzip index. 1282 # @param flags Option flags to control index file caching and creation. 1283 # @return Pointer to a faidx_t struct on success, NULL on failure. 1284 1285 # If fnfai is NULL, ".fai" will be appended to fn to make the FAI file name. 1286 # If fngzi is NULL, ".gzi" will be appended to fn for the bgzip index name. 1287 # The bgzip index is only needed if fn is compressed. 1288 1289 # If (flags & FAI_CREATE) is true, the index files will be built using 1290 # fai_build3() if they are not already present. 1291 # */ 1292 faidx_t *fai_load3(const char *fn, 1293 const char *fnfai, 1294 const char *fngzi, 1295 int flags) 1296 1297 # /// Load index from "fn.fai". 1298 # /** @param fn File name of the FASTA file 1299 # @return Pointer to a faidx_t struct on success, NULL on failure. 1300 # This function is equivalent to fai_load3(fn, NULL, NULL, FAI_CREATE|FAI_CACHE); 1301 # */ 1302 faidx_t *fai_load(char *fn) 1303 1304 # /// Fetch the sequence in a region 1305 # /** @param fai Pointer to the faidx_t struct 1306 # @param reg Region in the format "chr2:20,000-30,000" 1307 # @param len Length of the region; -2 if seq not present, -1 general error 1308 # @return Pointer to the sequence; `NULL` on failure 1309 # The returned sequence is allocated by `malloc()` family and should be destroyed 1310 # by end users by calling `free()` on it. 1311 # */ 1312 char *fai_fetch(faidx_t *fai, 1313 char *reg, 1314 int *len) 1315 1316 # /// Fetch the sequence in a region 1317 # /** @param fai Pointer to the faidx_t struct 1318 # @param c_name Region name 1319 # @param p_beg_i Beginning position number (zero-based) 1320 # @param p_end_i End position number (zero-based) 1321 # @param len Length of the region; -2 if c_name not present, -1 general error 1322 # @return Pointer to the sequence; null on failure 1323 # The returned sequence is allocated by `malloc()` family and should be destroyed 1324 # by end users by calling `free()` on it. 1325 # */ 1326 char *faidx_fetch_seq(faidx_t *fai, 1327 char *c_name, 1328 int p_beg_i, 1329 int p_end_i, 1330 int *len) 1331 1332 # /// Query if sequence is present 1333 # /** @param fai Pointer to the faidx_t struct 1334 # @param seq Sequence name 1335 # @return 1 if present or 0 if absent 1336 # */ 1337 int faidx_has_seq(faidx_t *fai, const char *seq) 1338 1339 # /// Fetch the number of sequences 1340 # /** @param fai Pointer to the faidx_t struct 1341 # @return The number of sequences 1342 # */ 1343 int faidx_nseq(const faidx_t *fai) 1344 1345 # /// Return name of i-th sequence 1346 const char *faidx_iseq(const faidx_t *fai, int i) 1347 1348 # /// Return sequence length, -1 if not present 1349 int faidx_seq_len(faidx_t *fai, const char *seq) 1350 1351# tabix support 1352cdef extern from "htslib/tbx.h" nogil: 1353 1354 # tbx.h definitions 1355 int8_t TBX_MAX_SHIFT 1356 int32_t TBX_GENERIC 1357 int32_t TBX_SAM 1358 int32_t TBX_VCF 1359 int32_t TBX_UCSC 1360 1361 ctypedef struct tbx_conf_t: 1362 int32_t preset 1363 int32_t sc, bc, ec # seq col., beg col. and end col. 1364 int32_t meta_char, line_skip 1365 1366 ctypedef struct tbx_t: 1367 tbx_conf_t conf 1368 hts_idx_t *idx 1369 void * dict 1370 1371 tbx_conf_t tbx_conf_gff 1372 tbx_conf_t tbx_conf_bed 1373 tbx_conf_t tbx_conf_psltbl 1374 tbx_conf_t tbx_conf_sam 1375 tbx_conf_t tbx_conf_vcf 1376 1377 void tbx_itr_destroy(hts_itr_t * iter) 1378 hts_itr_t * tbx_itr_queryi(tbx_t * t, int tid, int bed, int end) 1379 hts_itr_t * tbx_itr_querys(tbx_t * t, char * s) 1380 int tbx_itr_next(htsFile * fp, tbx_t * t, hts_itr_t * iter, void * data) 1381 1382 int tbx_name2id(tbx_t *tbx, char *ss) 1383 1384 int tbx_index_build(char *fn, int min_shift, tbx_conf_t *conf) 1385 int tbx_index_build2(const char *fn, const char *fnidx, int min_shift, const tbx_conf_t *conf) 1386 1387 tbx_t * tbx_index_load(char *fn) 1388 tbx_t *tbx_index_load2(const char *fn, const char *fnidx) 1389 1390 # free the array but not the values 1391 char **tbx_seqnames(tbx_t *tbx, int *n) 1392 1393 void tbx_destroy(tbx_t *tbx) 1394 1395 1396# VCF/BCF API 1397cdef extern from "htslib/vcf.h" nogil: 1398 1399 # Header struct 1400 1401 uint8_t BCF_HL_FLT # header line 1402 uint8_t BCF_HL_INFO 1403 uint8_t BCF_HL_FMT 1404 uint8_t BCF_HL_CTG 1405 uint8_t BCF_HL_STR # structured header line TAG=<A=..,B=..> 1406 uint8_t BCF_HL_GEN # generic header line 1407 1408 uint8_t BCF_HT_FLAG # header type 1409 uint8_t BCF_HT_INT 1410 uint8_t BCF_HT_REAL 1411 uint8_t BCF_HT_STR 1412 1413 uint8_t BCF_VL_FIXED # variable length 1414 uint8_t BCF_VL_VAR 1415 uint8_t BCF_VL_A 1416 uint8_t BCF_VL_G 1417 uint8_t BCF_VL_R 1418 1419 # === Dictionary === 1420 # 1421 # The header keeps three dictionaries. The first keeps IDs in the 1422 # "FILTER/INFO/FORMAT" lines, the second keeps the sequence names and lengths 1423 # in the "contig" lines and the last keeps the sample names. bcf_hdr_t::dict[] 1424 # is the actual hash table, which is opaque to the end users. In the hash 1425 # table, the key is the ID or sample name as a C string and the value is a 1426 # bcf_idinfo_t struct. bcf_hdr_t::id[] points to key-value pairs in the hash 1427 # table in the order that they appear in the VCF header. bcf_hdr_t::n[] is the 1428 # size of the hash table or, equivalently, the length of the id[] arrays. 1429 1430 uint8_t BCF_DT_ID # dictionary type 1431 uint8_t BCF_DT_CTG 1432 uint8_t BCF_DT_SAMPLE 1433 1434 # Complete textual representation of a header line 1435 ctypedef struct bcf_hrec_t: 1436 int type # One of the BCF_HL_* type 1437 char *key # The part before '=', i.e. FILTER/INFO/FORMAT/contig/fileformat etc. 1438 char *value # Set only for generic lines, NULL for FILTER/INFO, etc. 1439 int nkeys # Number of structured fields 1440 char **keys # The key=value pairs 1441 char **vals 1442 1443 ctypedef struct bcf_idinfo_t: 1444 uint32_t info[3] # stores Number:20, var:4, Type:4, ColType:4 in info[0..2] 1445 bcf_hrec_t *hrec[3] # for BCF_HL_FLT,INFO,FMT and contig length in info[0] for BCF_HL_CTG 1446 int id 1447 1448 ctypedef struct bcf_idpair_t: 1449 const char *key 1450 const bcf_idinfo_t *val 1451 1452 ctypedef struct bcf_hdr_t: 1453 int32_t n[3] # n:the size of the dictionary block in use, (allocated size, m, is below to preserve ABI) 1454 bcf_idpair_t *id[3] 1455 void *dict[3] # ID dictionary, contig dict and sample dict 1456 char **samples 1457 bcf_hrec_t **hrec 1458 int nhrec, dirty 1459 int ntransl 1460 int *transl[2] # for bcf_translate() 1461 int nsamples_ori # for bcf_hdr_set_samples() 1462 uint8_t *keep_samples 1463 kstring_t mem 1464 int32_t m[3] # m: allocated size of the dictionary block in use (see n above) 1465 1466 uint8_t bcf_type_shift[] 1467 1468 # * VCF record * 1469 1470 uint8_t BCF_BT_NULL 1471 uint8_t BCF_BT_INT8 1472 uint8_t BCF_BT_INT16 1473 uint8_t BCF_BT_INT32 1474 uint8_t BCF_BT_FLOAT 1475 uint8_t BCF_BT_CHAR 1476 1477 uint8_t VCF_REF 1478 uint8_t VCF_SNP 1479 uint8_t VCF_MNP 1480 uint8_t VCF_INDEL 1481 uint8_t VCF_OTHER 1482 1483 ctypedef struct variant_t: 1484 int type, n # variant type and the number of bases affected, negative for deletions 1485 1486 ctypedef struct bcf_fmt_t: 1487 int id # id: numeric tag id, the corresponding string is bcf_hdr_t::id[BCF_DT_ID][$id].key 1488 int n, size, type # n: number of values per-sample; size: number of bytes per-sample; type: one of BCF_BT_* types 1489 uint8_t *p # same as vptr and vptr_* in bcf_info_t below 1490 uint32_t p_len 1491 uint32_t p_off 1492 uint8_t p_free 1493 1494 union bcf_info_union_t: 1495 int32_t i # integer value 1496 float f # float value 1497 1498 ctypedef struct bcf_info_t: 1499 int key # key: numeric tag id, the corresponding string is bcf_hdr_t::id[BCF_DT_ID][$key].key 1500 int type, len # type: one of BCF_BT_* types; len: vector length, 1 for scalars 1501 1502 # v1 union only set if $len==1; for easier access 1503 bcf_info_union_t v1 1504 uint8_t *vptr # pointer to data array in bcf1_t->shared.s, excluding the size+type and tag id bytes 1505 uint32_t vptr_len # length of the vptr block or, when set, of the vptr_mod block, excluding offset 1506 uint32_t vptr_off # vptr offset, i.e., the size of the INFO key plus size+type bytes 1507 uint8_t vptr_free # indicates that vptr-vptr_off must be freed; set only when modified and the new 1508 # data block is bigger than the original 1509 1510 uint8_t BCF1_DIRTY_ID 1511 uint8_t BCF1_DIRTY_ALS 1512 uint8_t BCF1_DIRTY_FLT 1513 uint8_t BCF1_DIRTY_INF 1514 1515 ctypedef struct bcf_dec_t: 1516 int m_fmt, m_info, m_id, m_als, m_allele, m_flt # allocated size (high-water mark); do not change 1517 int n_flt # Number of FILTER fields 1518 int *flt # FILTER keys in the dictionary 1519 char *id # ID 1520 char *als # REF+ALT block (\0-seperated) 1521 char **allele # allele[0] is the REF (allele[] pointers to the als block); all null terminated 1522 bcf_info_t *info # INFO 1523 bcf_fmt_t *fmt # FORMAT and individual sample 1524 variant_t *var # $var and $var_type set only when set_variant_types called 1525 int n_var, var_type 1526 int shared_dirty # if set, shared.s must be recreated on BCF output 1527 int indiv_dirty # if set, indiv.s must be recreated on BCF output 1528 1529 uint8_t BCF_ERR_CTG_UNDEF 1530 uint8_t BCF_ERR_TAG_UNDEF 1531 uint8_t BCF_ERR_NCOLS 1532 uint8_t BCF_ERR_LIMITS 1533 uint8_t BCF_ERR_CHAR 1534 uint8_t BCF_ERR_CTG_INVALID 1535 uint8_t BCF_ERR_TAG_INVALID 1536 1537 # The bcf1_t structure corresponds to one VCF/BCF line. Reading from VCF file 1538 # is slower because the string is first to be parsed, packed into BCF line 1539 # (done in vcf_parse), then unpacked into internal bcf1_t structure. If it 1540 # is known in advance that some of the fields will not be required (notably 1541 # the sample columns), parsing of these can be skipped by setting max_unpack 1542 # appropriately. 1543 # Similarly, it is fast to output a BCF line because the columns (kept in 1544 # shared.s, indiv.s, etc.) are written directly by bcf_write, whereas a VCF 1545 # line must be formatted in vcf_format. 1546 1547 ctypedef struct bcf1_t: 1548 int32_t rid # CHROM 1549 int32_t pos # POS 1550 int32_t rlen # length of REF 1551 float qual # QUAL 1552 uint32_t n_info, n_allele 1553 uint32_t n_fmt, n_sample 1554 kstring_t shared, indiv 1555 bcf_dec_t d # lazy evaluation: $d is not generated by bcf_read(), but by explicitly calling bcf_unpack() 1556 int max_unpack # Set to BCF_UN_STR, BCF_UN_FLT, or BCF_UN_INFO to boost performance of vcf_parse when some of the fields won't be needed 1557 int unpacked # remember what has been unpacked to allow calling bcf_unpack() repeatedly without redoing the work 1558 int unpack_size[3] # the original block size of ID, REF+ALT and FILTER 1559 int errcode # one of BCF_ERR_* codes 1560 1561 ####### API ####### 1562 1563 # BCF and VCF I/O 1564 # 1565 # A note about naming conventions: htslib internally represents VCF 1566 # records as bcf1_t data structures, therefore most functions are 1567 # prefixed with bcf_. There are a few exceptions where the functions must 1568 # be aware of both BCF and VCF worlds, such as bcf_parse vs vcf_parse. In 1569 # these cases, functions prefixed with bcf_ are more general and work 1570 # with both BCF and VCF. 1571 1572 # bcf_hdr_init() - create an empty BCF header. 1573 # @param mode "r" or "w" 1574 # 1575 # When opened for writing, the mandatory fileFormat and 1576 # FILTER=PASS lines are added automatically. 1577 bcf_hdr_t *bcf_hdr_init(const char *mode) 1578 1579 # Destroy a BCF header struct 1580 void bcf_hdr_destroy(bcf_hdr_t *h) 1581 1582 # Initialize a bcf1_t object; equivalent to calloc(1, sizeof(bcf1_t)) 1583 bcf1_t *bcf_init() 1584 1585 # Deallocate a bcf1_t object 1586 void bcf_destroy(bcf1_t *v) 1587 1588 # Same as bcf_destroy() but frees only the memory allocated by bcf1_t, 1589 # not the bcf1_t object itself. 1590 void bcf_empty(bcf1_t *v) 1591 1592 # Make the bcf1_t object ready for next read. Intended mostly for 1593 # internal use, the user should rarely need to call this function 1594 # directly. 1595 void bcf_clear(bcf1_t *v) 1596 1597 # Reads VCF or BCF header 1598 bcf_hdr_t *bcf_hdr_read(htsFile *fp) 1599 1600 # bcf_hdr_set_samples() - for more efficient VCF parsing when only one/few samples are needed 1601 # @samples: samples to include or exclude from file or as a comma-separated string. 1602 # LIST|FILE .. select samples in list/file 1603 # ^LIST|FILE .. exclude samples from list/file 1604 # - .. include all samples 1605 # NULL .. exclude all samples 1606 # @is_file: @samples is a file (1) or a comma-separated list (0) 1607 # 1608 # The bottleneck of VCF reading is parsing of genotype fields. If the 1609 # reader knows in advance that only subset of samples is needed (possibly 1610 # no samples at all), the performance of bcf_read() can be significantly 1611 # improved by calling bcf_hdr_set_samples after bcf_hdr_read(). 1612 # The function bcf_read() will subset the VCF/BCF records automatically 1613 # with the notable exception when reading records via bcf_itr_next(). 1614 # In this case, bcf_subset_format() must be called explicitly, because 1615 # bcf_readrec() does not see the header. 1616 # 1617 # Returns 0 on success, -1 on error or a positive integer if the list 1618 # contains samples not present in the VCF header. In such a case, the 1619 # return value is the index of the offending sample. 1620 # 1621 int bcf_hdr_set_samples(bcf_hdr_t *hdr, const char *samples, int is_file) 1622 int bcf_subset_format(const bcf_hdr_t *hdr, bcf1_t *rec) 1623 1624 # Writes VCF or BCF header 1625 int bcf_hdr_write(htsFile *fp, bcf_hdr_t *h) 1626 1627 # Parse VCF line contained in kstring and populate the bcf1_t struct 1628 int vcf_parse(kstring_t *s, const bcf_hdr_t *h, bcf1_t *v) 1629 1630 # The opposite of vcf_parse. It should rarely be called directly, see vcf_write 1631 int vcf_format(const bcf_hdr_t *h, const bcf1_t *v, kstring_t *s) 1632 1633 # bcf_read() - read next VCF or BCF record 1634 # 1635 # Returns -1 on critical errors, 0 otherwise. On errors which are not 1636 # critical for reading, such as missing header definitions, v->errcode is 1637 # set to one of BCF_ERR* code and must be checked before calling 1638 # vcf_write(). 1639 int bcf_read(htsFile *fp, const bcf_hdr_t *h, bcf1_t *v) 1640 1641 # bcf_unpack() - unpack/decode a BCF record (fills the bcf1_t::d field) 1642 # 1643 # Note that bcf_unpack() must be called even when reading VCF. It is safe 1644 # to call the function repeatedly, it will not unpack the same field 1645 # twice. 1646 uint8_t BCF_UN_STR # up to ALT inclusive 1647 uint8_t BCF_UN_FLT # up to FILTER 1648 uint8_t BCF_UN_INFO # up to INFO 1649 uint8_t BCF_UN_SHR # all shared information 1650 uint8_t BCF_UN_FMT # unpack format and each sample 1651 uint8_t BCF_UN_IND # a synonymo of BCF_UN_FMT 1652 uint8_t BCF_UN_ALL # everything 1653 1654 int bcf_unpack(bcf1_t *b, int which) 1655 1656 # bcf_dup() - create a copy of BCF record. 1657 # 1658 # Note that bcf_unpack() must be called on the returned copy as if it was 1659 # obtained from bcf_read(). Also note that bcf_dup() calls bcf_sync1(src) 1660 # internally to reflect any changes made by bcf_update_* functions. 1661 bcf1_t *bcf_dup(bcf1_t *src) 1662 bcf1_t *bcf_copy(bcf1_t *dst, bcf1_t *src) 1663 1664 # bcf_write() - write one VCF or BCF record. The type is determined at the open() call. 1665 int bcf_write(htsFile *fp, bcf_hdr_t *h, bcf1_t *v) 1666 1667 # The following functions work only with VCFs and should rarely be called 1668 # directly. Usually one wants to use their bcf_* alternatives, which work 1669 # transparently with both VCFs and BCFs. 1670 bcf_hdr_t *vcf_hdr_read(htsFile *fp) 1671 int vcf_hdr_write(htsFile *fp, const bcf_hdr_t *h) 1672 int vcf_read(htsFile *fp, const bcf_hdr_t *h, bcf1_t *v) 1673 int vcf_write(htsFile *fp, const bcf_hdr_t *h, bcf1_t *v) 1674 1675 #************************************************************************ 1676 # Header querying and manipulation routines 1677 #************************************************************************ 1678 1679 # Create a new header using the supplied template 1680 bcf_hdr_t *bcf_hdr_dup(const bcf_hdr_t *hdr) 1681 1682 # Copy header lines from src to dst if not already present in dst. See also bcf_translate(). 1683 # Returns 0 on success or sets a bit on error: 1684 # 1 .. conflicting definitions of tag length 1685 # # todo 1686 int bcf_hdr_combine(bcf_hdr_t *dst, const bcf_hdr_t *src) 1687 1688 # bcf_hdr_merge() - copy header lines from src to dst, see also bcf_translate() 1689 # @param dst: the destination header to be merged into, NULL on the first pass 1690 # @param src: the source header 1691 # 1692 # Notes: 1693 # - use as: 1694 # bcf_hdr_t *dst = NULL; 1695 # for (i=0; i<nsrc; i++) dst = bcf_hdr_merge(dst,src[i]); 1696 # 1697 # - bcf_hdr_merge() replaces bcf_hdr_combine() which had a problem when 1698 # combining multiple BCF headers. The current bcf_hdr_combine() 1699 # does not have this problem, but became slow when used for many files. 1700 bcf_hdr_t *bcf_hdr_merge(bcf_hdr_t *dst, const bcf_hdr_t *src) 1701 1702 # bcf_hdr_add_sample() - add a new sample. 1703 # @param sample: sample name to be added 1704 int bcf_hdr_add_sample(bcf_hdr_t *hdr, const char *sample) 1705 1706 # Read VCF header from a file and update the header 1707 int bcf_hdr_set(bcf_hdr_t *hdr, const char *fname) 1708 1709 # Appends formatted header text to _str_. 1710 # If _is_bcf_ is zero, `IDX` fields are discarded. 1711 # @return 0 if successful, or negative if an error occurred 1712 # @since 1.4 1713 int bcf_hdr_format(const bcf_hdr_t *hdr, int is_bcf, kstring_t *str); 1714 1715 # Returns formatted header (newly allocated string) and its length, 1716 # excluding the terminating \0. If is_bcf parameter is unset, IDX 1717 # fields are discarded. 1718 char *bcf_hdr_fmt_text(const bcf_hdr_t *hdr, int is_bcf, int *len) 1719 1720 # Append new VCF header line, returns 0 on success 1721 int bcf_hdr_append(bcf_hdr_t *h, const char *line) 1722 int bcf_hdr_printf(bcf_hdr_t *h, const char *format, ...) 1723 1724 # VCF version, e.g. VCFv4.2 1725 const char *bcf_hdr_get_version(const bcf_hdr_t *hdr) 1726 void bcf_hdr_set_version(bcf_hdr_t *hdr, const char *version) 1727 1728 # bcf_hdr_remove() - remove VCF header tag 1729 # @param type: one of BCF_HL_* 1730 # @param key: tag name or NULL to remove all tags of the given type 1731 void bcf_hdr_remove(bcf_hdr_t *h, int type, const char *key) 1732 1733 # bcf_hdr_subset() - creates a new copy of the header removing unwanted samples 1734 # @param n: number of samples to keep 1735 # @param samples: names of the samples to keep 1736 # @param imap: mapping from index in @samples to the sample index in the original file 1737 # 1738 # Sample names not present in h0 are ignored. The number of unmatched samples can be checked 1739 # by comparing n and bcf_hdr_nsamples(out_hdr). 1740 # This function can be used to reorder samples. 1741 # See also bcf_subset() which subsets individual records. 1742 # 1743 bcf_hdr_t *bcf_hdr_subset(const bcf_hdr_t *h0, int n, char *const* samples, int *imap) 1744 1745 # Creates a list of sequence names. It is up to the caller to free the list (but not the sequence names) 1746 const char **bcf_hdr_seqnames(const bcf_hdr_t *h, int *nseqs) 1747 1748 # Get number of samples 1749 int32_t bcf_hdr_nsamples(const bcf_hdr_t *h) 1750 1751 # The following functions are for internal use and should rarely be called directly 1752 int bcf_hdr_parse(bcf_hdr_t *hdr, char *htxt) 1753 int bcf_hdr_sync(bcf_hdr_t *h) 1754 bcf_hrec_t *bcf_hdr_parse_line(const bcf_hdr_t *h, const char *line, int *len) 1755 void bcf_hrec_format(const bcf_hrec_t *hrec, kstring_t *str) 1756 int bcf_hdr_add_hrec(bcf_hdr_t *hdr, bcf_hrec_t *hrec) 1757 1758 # bcf_hdr_get_hrec() - get header line info 1759 # @param type: one of the BCF_HL_* types: FLT,INFO,FMT,CTG,STR,GEN 1760 # @param key: the header key for generic lines (e.g. "fileformat"), any field 1761 # for structured lines, typically "ID". 1762 # @param value: the value which pairs with key. Can be be NULL for BCF_HL_GEN 1763 # @param str_class: the class of BCF_HL_STR line (e.g. "ALT" or "SAMPLE"), otherwise NULL 1764 # 1765 bcf_hrec_t *bcf_hdr_get_hrec(const bcf_hdr_t *hdr, int type, const char *key, const char *value, const char *str_class) 1766 bcf_hrec_t *bcf_hrec_dup(bcf_hrec_t *hrec) 1767 void bcf_hrec_add_key(bcf_hrec_t *hrec, const char *str, int len) 1768 void bcf_hrec_set_val(bcf_hrec_t *hrec, int i, const char *str, int len, int is_quoted) 1769 int bcf_hrec_find_key(bcf_hrec_t *hrec, const char *key) 1770 void hrec_add_idx(bcf_hrec_t *hrec, int idx) 1771 void bcf_hrec_destroy(bcf_hrec_t *hrec) 1772 1773 #************************************************************************ 1774 # Individual record querying and manipulation routines 1775 #************************************************************************ 1776 1777 # See the description of bcf_hdr_subset() 1778 int bcf_subset(const bcf_hdr_t *h, bcf1_t *v, int n, int *imap) 1779 1780 # bcf_translate() - translate tags ids to be consistent with different header. This function 1781 # is useful when lines from multiple VCF need to be combined. 1782 # @dst_hdr: the destination header, to be used in bcf_write(), see also bcf_hdr_combine() 1783 # @src_hdr: the source header, used in bcf_read() 1784 # @src_line: line obtained by bcf_read() 1785 int bcf_translate(const bcf_hdr_t *dst_hdr, bcf_hdr_t *src_hdr, bcf1_t *src_line) 1786 1787 # bcf_get_variant_type[s]() - returns one of VCF_REF, VCF_SNP, etc 1788 int bcf_get_variant_types(bcf1_t *rec) 1789 int bcf_get_variant_type(bcf1_t *rec, int ith_allele) 1790 int bcf_is_snp(bcf1_t *v) 1791 1792 # bcf_update_filter() - sets the FILTER column 1793 # @flt_ids: The filter IDs to set, numeric IDs returned by bcf_hdr_id2int(hdr, BCF_DT_ID, "PASS") 1794 # @n: Number of filters. If n==0, all filters are removed 1795 int bcf_update_filter(const bcf_hdr_t *hdr, bcf1_t *line, int *flt_ids, int n) 1796 1797 # bcf_add_filter() - adds to the FILTER column 1798 # @flt_id: The filter IDs to add, numeric IDs returned by bcf_hdr_id2int(hdr, BCF_DT_ID, "PASS") 1799 # 1800 # If flt_id is PASS, all existing filters are removed first. If other than PASS, existing PASS is removed. 1801 int bcf_add_filter(const bcf_hdr_t *hdr, bcf1_t *line, int flt_id) 1802 1803 # bcf_remove_filter() - removes from the FILTER column 1804 # @flt_id: filter ID to remove, numeric ID returned by bcf_hdr_id2int(hdr, BCF_DT_ID, "PASS") 1805 # @pass: when set to 1 and no filters are present, set to PASS 1806 int bcf_remove_filter(const bcf_hdr_t *hdr, bcf1_t *line, int flt_id, int set_pass) 1807 1808 # Returns 1 if present, 0 if absent, or -1 if filter does not exist. "PASS" and "." can be used interchangeably. 1809 int bcf_has_filter(const bcf_hdr_t *hdr, bcf1_t *line, char *filter) 1810 1811 # bcf_update_alleles() and bcf_update_alleles_str() - update REF and ALT column 1812 # @alleles: Array of alleles 1813 # @nals: Number of alleles 1814 # @alleles_string: Comma-separated alleles, starting with the REF allele 1815 int bcf_update_alleles(const bcf_hdr_t *hdr, bcf1_t *line, const char **alleles, int nals) 1816 int bcf_update_alleles_str(const bcf_hdr_t *hdr, bcf1_t *line, const char *alleles_string) 1817 1818 # bcf_update_id() - sets new ID string 1819 # bcf_add_id() - adds to the ID string checking for duplicates 1820 int bcf_update_id(const bcf_hdr_t *hdr, bcf1_t *line, const char *id) 1821 int bcf_add_id(const bcf_hdr_t *hdr, bcf1_t *line, const char *id) 1822 1823 # bcf_update_info_*() - functions for updating INFO fields 1824 # @hdr: the BCF header 1825 # @line: VCF line to be edited 1826 # @key: the INFO tag to be updated 1827 # @values: pointer to the array of values. Pass NULL to remove the tag. 1828 # @n: number of values in the array. When set to 0, the INFO tag is removed 1829 # 1830 # The @string in bcf_update_info_flag() is optional, @n indicates whether 1831 # the flag is set or removed. 1832 # 1833 # Returns 0 on success or negative value on error. 1834 # 1835 int bcf_update_info_int32(const bcf_hdr_t *hdr, bcf1_t *line, const char *key, const int32_t *values, int n) 1836 int bcf_update_info_float(const bcf_hdr_t *hdr, bcf1_t *line, const char *key, const float *values, int n) 1837 int bcf_update_info_flag(const bcf_hdr_t *hdr, bcf1_t *line, const char *key, const char *values, int n) 1838 int bcf_update_info_string(const bcf_hdr_t *hdr, bcf1_t *line, const char *key, const char *values, int n) 1839 int bcf_update_info(const bcf_hdr_t *hdr, bcf1_t *line, const char *key, const void *values, int n, int type) 1840 1841 # bcf_update_format_*() - functions for updating FORMAT fields 1842 # @values: pointer to the array of values, the same number of elements 1843 # is expected for each sample. Missing values must be padded 1844 # with bcf_*_missing or bcf_*_vector_end values. 1845 # @n: number of values in the array. If n==0, existing tag is removed. 1846 # 1847 # The function bcf_update_format_string() is a higher-level (slower) variant of 1848 # bcf_update_format_char(). The former accepts array of \0-terminated strings 1849 # whereas the latter requires that the strings are collapsed into a single array 1850 # of fixed-length strings. In case of strings with variable length, shorter strings 1851 # can be \0-padded. Note that the collapsed strings passed to bcf_update_format_char() 1852 # are not \0-terminated. 1853 # 1854 # Returns 0 on success or negative value on error. 1855 # 1856 int bcf_update_format_int32(const bcf_hdr_t *hdr, bcf1_t *line, const char *key, const int32_t *values, int n) 1857 int bcf_update_format_float(const bcf_hdr_t *hdr, bcf1_t *line, const char *key, const float *values, int n) 1858 int bcf_update_format_char(const bcf_hdr_t *hdr, bcf1_t *line, const char *key, const char *values, int n) 1859 int bcf_update_genotypes(const bcf_hdr_t *hdr, bcf1_t *line, const int32_t *values, int n) 1860 int bcf_update_format_string(const bcf_hdr_t *hdr, bcf1_t *line, const char *key, const char **values, int n) 1861 int bcf_update_format(const bcf_hdr_t *hdr, bcf1_t *line, const char *key, const void *values, int n, int type) 1862 1863 # Macros for setting genotypes correctly, for use with bcf_update_genotypes only; idx corresponds 1864 # to VCF's GT (1-based index to ALT or 0 for the reference allele) and val is the opposite, obtained 1865 # from bcf_get_genotypes() below. 1866 uint32_t bcf_gt_phased(uint32_t idx) 1867 uint32_t bcf_gt_unphased(uint32_t idx) 1868 uint32_t bcf_gt_missing 1869 uint32_t bcf_gt_is_missing(uint32_t val) 1870 uint32_t bcf_gt_is_phased(uint32_t idx) 1871 uint32_t bcf_gt_allele(uint32_t val) 1872 1873 # Conversion between alleles indexes to Number=G genotype index (assuming diploid, all 0-based) 1874 uint32_t bcf_alleles2gt(uint32_t a, uint32_t b) 1875 void bcf_gt2alleles(int igt, int *a, int *b) 1876 1877 # bcf_get_fmt() - returns pointer to FORMAT's field data 1878 # @header: for access to BCF_DT_ID dictionary 1879 # @line: VCF line obtained from vcf_parse1 1880 # @fmt: one of GT,PL,... 1881 # 1882 # Returns bcf_fmt_t* if the call succeeded, or returns NULL when the field 1883 # is not available. 1884 # 1885 bcf_fmt_t *bcf_get_fmt(const bcf_hdr_t *hdr, bcf1_t *line, const char *key) 1886 bcf_info_t *bcf_get_info(const bcf_hdr_t *hdr, bcf1_t *line, const char *key) 1887 1888 # bcf_get_*_id() - returns pointer to FORMAT/INFO field data given the header index instead of the string ID 1889 # @line: VCF line obtained from vcf_parse1 1890 # @id: The header index for the tag, obtained from bcf_hdr_id2int() 1891 # 1892 # Returns bcf_fmt_t* / bcf_info_t*. These functions do not check if the index is valid 1893 # as their goal is to avoid the header lookup. 1894 # 1895 bcf_fmt_t *bcf_get_fmt_id(bcf1_t *line, const int id) 1896 bcf_info_t *bcf_get_info_id(bcf1_t *line, const int id) 1897 1898 # bcf_get_info_*() - get INFO values, integers or floats 1899 # @hdr: BCF header 1900 # @line: BCF record 1901 # @tag: INFO tag to retrieve 1902 # @dst: *dst is pointer to a memory location, can point to NULL 1903 # @ndst: pointer to the size of allocated memory 1904 # 1905 # Returns negative value on error or the number of written values on 1906 # success. bcf_get_info_string() returns on success the number of 1907 # characters written excluding the null-terminating byte. bcf_get_info_flag() 1908 # returns 1 when flag is set or 0 if not. 1909 # 1910 # List of return codes: 1911 # -1 .. no such INFO tag defined in the header 1912 # -2 .. clash between types defined in the header and encountered in the VCF record 1913 # -3 .. tag is not present in the VCF record 1914 # 1915 int bcf_get_info_int32(const bcf_hdr_t *hdr, bcf1_t *line, const char *tag, int32_t **dst, int *ndst) 1916 int bcf_get_info_float(const bcf_hdr_t *hdr, bcf1_t *line, const char *tag, float **dst, int *ndst) 1917 int bcf_get_info_string(const bcf_hdr_t *hdr, bcf1_t *line, const char *tag, char **dst, int *ndst) 1918 int bcf_get_info_flag(const bcf_hdr_t *hdr, bcf1_t *line, const char *tag, int **dst, int *ndst) 1919 int bcf_get_info_values(const bcf_hdr_t *hdr, bcf1_t *line, const char *tag, void **dst, int *ndst, int type) 1920 1921 # bcf_get_format_*() - same as bcf_get_info*() above 1922 # 1923 # The function bcf_get_format_string() is a higher-level (slower) variant of bcf_get_format_char(). 1924 # see the description of bcf_update_format_string() and bcf_update_format_char() above. 1925 # Unlike other bcf_get_format__*() functions, bcf_get_format_string() allocates two arrays: 1926 # a single block of \0-terminated strings collapsed into a single array and an array of pointers 1927 # to these strings. Both arrays must be cleaned by the user. 1928 # 1929 # Returns negative value on error or the number of written values on success. 1930 # 1931 # Example: 1932 # int ndst = 0; char **dst = NULL 1933 # if ( bcf_get_format_string(hdr, line, "XX", &dst, &ndst) > 0 ) 1934 # for (i=0; i<bcf_hdr_nsamples(hdr); i++) printf("%s\n", dst[i]) 1935 # free(dst[0]); free(dst) 1936 # 1937 # Example: 1938 # int ngt, *gt_arr = NULL, ngt_arr = 0 1939 # ngt = bcf_get_genotypes(hdr, line, >_arr, &ngt_arr) 1940 # 1941 int bcf_get_format_int32(const bcf_hdr_t *hdr, bcf1_t *line, const char *tag, int32_t **dst, int *ndst) 1942 int bcf_get_format_float(const bcf_hdr_t *hdr, bcf1_t *line, const char *tag, float **dst, int *ndst) 1943 int bcf_get_format_char(const bcf_hdr_t *hdr, bcf1_t *line, const char *tag, char **dst, int *ndst) 1944 int bcf_get_genotypes(const bcf_hdr_t *hdr, bcf1_t *line, int32_t **dst, int *ndst) 1945 int bcf_get_format_string(const bcf_hdr_t *hdr, bcf1_t *line, const char *tag, char ***dst, int *ndst) 1946 int bcf_get_format_values(const bcf_hdr_t *hdr, bcf1_t *line, const char *tag, void **dst, int *ndst, int type) 1947 1948 #************************************************************************ 1949 # Helper functions 1950 #************************************************************************ 1951 1952 # 1953 # bcf_hdr_id2int() - Translates string into numeric ID 1954 # bcf_hdr_int2id() - Translates numeric ID into string 1955 # @type: one of BCF_DT_ID, BCF_DT_CTG, BCF_DT_SAMPLE 1956 # @id: tag name, such as: PL, DP, GT, etc. 1957 # 1958 # Returns -1 if string is not in dictionary, otherwise numeric ID which identifies 1959 # fields in BCF records. 1960 # 1961 int bcf_hdr_id2int(const bcf_hdr_t *hdr, int type, const char *id) 1962 const char *bcf_hdr_int2id(const bcf_hdr_t *hdr, int type, int int_id) 1963 1964 # bcf_hdr_name2id() - Translates sequence names (chromosomes) into numeric ID 1965 # bcf_hdr_id2name() - Translates numeric ID to sequence name 1966 # 1967 int bcf_hdr_name2id(const bcf_hdr_t *hdr, const char *id) 1968 const char *bcf_hdr_id2name(const bcf_hdr_t *hdr, int rid) 1969 const char *bcf_seqname(const bcf_hdr_t *hdr, bcf1_t *rec) 1970 1971 # 1972 # bcf_hdr_id2*() - Macros for accessing bcf_idinfo_t 1973 # @type: one of BCF_HL_FLT, BCF_HL_INFO, BCF_HL_FMT 1974 # @int_id: return value of bcf_hdr_id2int, must be >=0 1975 # 1976 # The returned values are: 1977 # bcf_hdr_id2length .. whether the number of values is fixed or variable, one of BCF_VL_* 1978 # bcf_hdr_id2number .. the number of values, 0xfffff for variable length fields 1979 # bcf_hdr_id2type .. the field type, one of BCF_HT_* 1980 # bcf_hdr_id2coltype .. the column type, one of BCF_HL_* 1981 # 1982 # Notes: Prior to using the macros, the presence of the info should be 1983 # tested with bcf_hdr_idinfo_exists(). 1984 # 1985 int bcf_hdr_id2length(const bcf_hdr_t *hdr, int type, int int_id) 1986 int bcf_hdr_id2number(const bcf_hdr_t *hdr, int type, int int_id) 1987 int bcf_hdr_id2type(const bcf_hdr_t *hdr, int type, int int_id) 1988 int bcf_hdr_id2coltype(const bcf_hdr_t *hdr, int type, int int_id) 1989 int bcf_hdr_idinfo_exists(const bcf_hdr_t *hdr, int type, int int_id) 1990 bcf_hrec_t *bcf_hdr_id2hrec(const bcf_hdr_t *hdr, int type, int col_type, int int_id) 1991 1992 void bcf_fmt_array(kstring_t *s, int n, int type, void *data) 1993 uint8_t *bcf_fmt_sized_array(kstring_t *s, uint8_t *ptr) 1994 1995 void bcf_enc_vchar(kstring_t *s, int l, const char *a) 1996 void bcf_enc_vint(kstring_t *s, int n, int32_t *a, int wsize) 1997 void bcf_enc_vfloat(kstring_t *s, int n, float *a) 1998 1999 #************************************************************************ 2000 # BCF index 2001 # 2002 # Note that these functions work with BCFs only. See synced_bcf_reader.h 2003 # which provides (amongst other things) an API to work transparently with 2004 # both indexed BCFs and VCFs. 2005 #************************************************************************ 2006 2007 hts_idx_t *bcf_index_load2(const char *fn, const char *fnidx) 2008 int bcf_index_build(const char *fn, int min_shift) 2009 int bcf_index_build2(const char *fn, const char *fnidx, int min_shift) 2010 2011 #******************* 2012 # Typed value I/O * 2013 #****************** 2014 2015 # Note that in contrast with BCFv2.1 specification, HTSlib implementation 2016 # allows missing values in vectors. For integer types, the values 0x80, 2017 # 0x8000, 0x80000000 are interpreted as missing values and 0x81, 0x8001, 2018 # 0x80000001 as end-of-vector indicators. Similarly for floats, the value of 2019 # 0x7F800001 is interpreted as a missing value and 0x7F800002 as an 2020 # end-of-vector indicator. 2021 # Note that the end-of-vector byte is not part of the vector. 2022 2023 # This trial BCF version (v2.2) is compatible with the VCF specification and 2024 # enables to handle correctly vectors with different ploidy in presence of 2025 # missing values. 2026 2027 int32_t bcf_int8_vector_end 2028 int32_t bcf_int16_vector_end 2029 int32_t bcf_int32_vector_end 2030 int32_t bcf_str_vector_end 2031 int32_t bcf_int8_missing 2032 int32_t bcf_int16_missing 2033 int32_t bcf_int32_missing 2034 int32_t bcf_str_missing 2035 2036 uint32_t bcf_float_vector_end 2037 uint32_t bcf_float_missing 2038 2039 void bcf_float_set(float *ptr, uint32_t value) 2040 void bcf_float_set_vector_end(float *x) 2041 void bcf_float_set_missing(float *x) 2042 2043 int bcf_float_is_missing(float f) 2044 int bcf_float_is_vector_end(float f) 2045 void bcf_format_gt(bcf_fmt_t *fmt, int isample, kstring_t *str) 2046 void bcf_enc_size(kstring_t *s, int size, int type) 2047 int bcf_enc_inttype(long x) 2048 void bcf_enc_int1(kstring_t *s, int32_t x) 2049 int32_t bcf_dec_int1(const uint8_t *p, int type, uint8_t **q) 2050 int32_t bcf_dec_typed_int1(const uint8_t *p, uint8_t **q) 2051 int32_t bcf_dec_size(const uint8_t *p, uint8_t **q, int *type) 2052 2053 # These trivial wrappers are defined only for consistency with other parts of htslib 2054 bcf1_t *bcf_init1() 2055 int bcf_read1(htsFile *fp, const bcf_hdr_t *h, bcf1_t *v) 2056 int vcf_read1(htsFile *fp, const bcf_hdr_t *h, bcf1_t *v) 2057 int bcf_write1(htsFile *fp, const bcf_hdr_t *h, bcf1_t *v) 2058 int vcf_write1(htsFile *fp, const bcf_hdr_t *h, bcf1_t *v) 2059 void bcf_destroy1(bcf1_t *v) 2060 void bcf_empty1(bcf1_t *v) 2061 int vcf_parse1(kstring_t *s, const bcf_hdr_t *h, bcf1_t *v) 2062 void bcf_clear1(bcf1_t *v) 2063 int vcf_format1(const bcf_hdr_t *h, const bcf1_t *v, kstring_t *s) 2064 2065 # Other nice wrappers 2066 void bcf_itr_destroy(hts_itr_t *iter) 2067 hts_itr_t *bcf_itr_queryi(const hts_idx_t *idx, int tid, int beg, int end) 2068 hts_itr_t *bcf_itr_querys(const hts_idx_t *idx, const bcf_hdr_t *hdr, char *s) 2069 int bcf_itr_next(htsFile *fp, hts_itr_t *iter, void *r) 2070 hts_idx_t *bcf_index_load(const char *fn) 2071 const char **bcf_index_seqnames(const hts_idx_t *idx, const bcf_hdr_t *hdr, int *nptr) 2072 2073 2074# VCF/BCF utility functions 2075cdef extern from "htslib/vcfutils.h" nogil: 2076 struct kbitset_t 2077 2078 # bcf_trim_alleles() - remove ALT alleles unused in genotype fields 2079 # @header: for access to BCF_DT_ID dictionary 2080 # @line: VCF line obtain from vcf_parse1 2081 # 2082 # Returns the number of removed alleles on success or negative 2083 # on error: 2084 # -1 .. some allele index is out of bounds 2085 int bcf_trim_alleles(const bcf_hdr_t *header, bcf1_t *line) 2086 2087 # bcf_remove_alleles() - remove ALT alleles according to bitmask @mask 2088 # @header: for access to BCF_DT_ID dictionary 2089 # @line: VCF line obtained from vcf_parse1 2090 # @mask: alleles to remove 2091 # 2092 # If you have more than 31 alleles, then the integer bit mask will 2093 # overflow, so use bcf_remove_allele_set instead 2094 void bcf_remove_alleles(const bcf_hdr_t *header, bcf1_t *line, int mask) 2095 2096 # bcf_remove_allele_set() - remove ALT alleles according to bitset @rm_set 2097 # @header: for access to BCF_DT_ID dictionary 2098 # @line: VCF line obtained from vcf_parse1 2099 # @rm_set: pointer to kbitset_t object with bits set for allele 2100 # indexes to remove 2101 # 2102 # Number=A,R,G INFO and FORMAT fields will be updated accordingly. 2103 void bcf_remove_allele_set(const bcf_hdr_t *header, bcf1_t *line, kbitset_t *rm_set) 2104 2105 # bcf_calc_ac() - calculate the number of REF and ALT alleles 2106 # @header: for access to BCF_DT_ID dictionary 2107 # @line: VCF line obtained from vcf_parse1 2108 # @ac: array of length line->n_allele 2109 # @which: determine if INFO/AN,AC and indv fields be used 2110 # 2111 # Returns 1 if the call succeeded, or 0 if the value could not 2112 # be determined. 2113 # 2114 # The value of @which determines if existing INFO/AC,AN can be 2115 # used (BCF_UN_INFO) and and if indv fields can be split (BCF_UN_FMT). 2116 int bcf_calc_ac(const bcf_hdr_t *header, bcf1_t *line, int *ac, int which) 2117 2118 # bcf_gt_type() - determines type of the genotype 2119 # @fmt_ptr: the GT format field as set for example by set_fmt_ptr 2120 # @isample: sample index (starting from 0) 2121 # @ial: index of the 1st non-reference allele (starting from 1) 2122 # @jal: index of the 2nd non-reference allele (starting from 1) 2123 # 2124 # Returns the type of the genotype (one of GT_HOM_RR, GT_HET_RA, 2125 # GT_HOM_AA, GT_HET_AA, GT_HAPL_R, GT_HAPL_A or GT_UNKN). If $ial 2126 # is not NULL and the genotype has one or more non-reference 2127 # alleles, $ial will be set. In case of GT_HET_AA, $ial is the 2128 # position of the allele which appeared first in ALT. If $jal is 2129 # not null and the genotype is GT_HET_AA, $jal will be set and is 2130 # the position of the second allele in ALT. 2131 uint8_t GT_HOM_RR # note: the actual value of GT_* matters, used in dosage r2 calculation 2132 uint8_t GT_HOM_AA 2133 uint8_t GT_HET_RA 2134 uint8_t GT_HET_AA 2135 uint8_t GT_HAPL_R 2136 uint8_t GT_HAPL_A 2137 uint8_t GT_UNKN 2138 int bcf_gt_type(bcf_fmt_t *fmt_ptr, int isample, int *ial, int *jal) 2139 2140 int bcf_acgt2int(char c) 2141 char bcf_int2acgt(int i) 2142 2143 # bcf_ij2G() - common task: allele indexes to Number=G index (diploid) 2144 # @i,j: allele indexes, 0-based, i<=j 2145 # Returns index to the Number=G diploid array 2146 uint32_t bcf_ij2G(uint32_t i, uint32_t j) 2147 2148 2149cdef extern from "htslib/cram.h" nogil: 2150 2151 enum cram_block_method: 2152 ERROR 2153 RAW 2154 GZIP 2155 BZIP2 2156 LZMA 2157 RANS 2158 RANS0 2159 RANS1 2160 GZIP_RLE 2161 2162 enum cram_content_type: 2163 CT_ERROR 2164 FILE_HEADER 2165 COMPRESSION_HEADER 2166 MAPPED_SLICE 2167 UNMAPPED_SLICE 2168 EXTERNAL 2169 CORE 2170 2171 # Opaque data types, see cram_structs for the fully fledged versions. 2172 ctypedef struct SAM_hdr 2173 ctypedef struct cram_file_def 2174 ctypedef struct cram_fd 2175 ctypedef struct cram_container 2176 ctypedef struct cram_block 2177 ctypedef struct cram_slice 2178 ctypedef struct cram_metrics 2179 ctypedef struct cram_block_slice_hdr 2180 ctypedef struct cram_block_compression_hdr 2181 ctypedef struct refs_t 2182 2183 # Accessor functions 2184 2185 # 2186 #----------------------------------------------------------------------------- 2187 # cram_fd 2188 # 2189 SAM_hdr *cram_fd_get_header(cram_fd *fd) 2190 void cram_fd_set_header(cram_fd *fd, SAM_hdr *hdr) 2191 2192 int cram_fd_get_version(cram_fd *fd) 2193 void cram_fd_set_version(cram_fd *fd, int vers) 2194 2195 int cram_major_vers(cram_fd *fd) 2196 int cram_minor_vers(cram_fd *fd) 2197 2198 hFILE *cram_fd_get_fp(cram_fd *fd) 2199 void cram_fd_set_fp(cram_fd *fd, hFILE *fp) 2200 2201 # 2202 #----------------------------------------------------------------------------- 2203 # cram_container 2204 # 2205 int32_t cram_container_get_length(cram_container *c) 2206 void cram_container_set_length(cram_container *c, int32_t length) 2207 int32_t cram_container_get_num_blocks(cram_container *c) 2208 void cram_container_set_num_blocks(cram_container *c, int32_t num_blocks) 2209 int32_t *cram_container_get_landmarks(cram_container *c, int32_t *num_landmarks) 2210 void cram_container_set_landmarks(cram_container *c, int32_t num_landmarks, 2211 int32_t *landmarks) 2212 2213 # Returns true if the container is empty (EOF marker) */ 2214 int cram_container_is_empty(cram_fd *fd) 2215 2216 2217 # 2218 #----------------------------------------------------------------------------- 2219 # cram_block 2220 # 2221 int32_t cram_block_get_content_id(cram_block *b) 2222 int32_t cram_block_get_comp_size(cram_block *b) 2223 int32_t cram_block_get_uncomp_size(cram_block *b) 2224 int32_t cram_block_get_crc32(cram_block *b) 2225 void * cram_block_get_data(cram_block *b) 2226 2227 cram_content_type cram_block_get_content_type(cram_block *b) 2228 2229 void cram_block_set_content_id(cram_block *b, int32_t id) 2230 void cram_block_set_comp_size(cram_block *b, int32_t size) 2231 void cram_block_set_uncomp_size(cram_block *b, int32_t size) 2232 void cram_block_set_crc32(cram_block *b, int32_t crc) 2233 void cram_block_set_data(cram_block *b, void *data) 2234 2235 int cram_block_append(cram_block *b, void *data, int size) 2236 void cram_block_update_size(cram_block *b) 2237 2238 # Offset is known as "size" internally, but it can be confusing. 2239 size_t cram_block_get_offset(cram_block *b) 2240 void cram_block_set_offset(cram_block *b, size_t offset) 2241 2242 # 2243 # Computes the size of a cram block, including the block 2244 # header itself. 2245 # 2246 uint32_t cram_block_size(cram_block *b) 2247 2248 # 2249 # Renumbers RG numbers in a cram compression header. 2250 # 2251 # CRAM stores RG as the Nth number in the header, rather than a 2252 # string holding the ID: tag. This is smaller in space, but means 2253 # "samtools cat" to join files together that contain single but 2254 # different RG lines needs a way of renumbering them. 2255 # 2256 # The file descriptor is expected to be immediately after the 2257 # cram_container structure (ie before the cram compression header). 2258 # Due to the nature of the CRAM format, this needs to read and write 2259 # the blocks itself. Note that there may be multiple slices within 2260 # the container, meaning multiple compression headers to manipulate. 2261 # Changing RG may change the size of the compression header and 2262 # therefore the length field in the container. Hence we rewrite all 2263 # blocks just in case and also emit the adjusted container. 2264 # 2265 # The current implementation can only cope with renumbering a single 2266 # RG (and only then if it is using HUFFMAN or BETA codecs). In 2267 # theory it *may* be possible to renumber multiple RGs if they use 2268 # HUFFMAN to the CORE block or use an external block unshared by any 2269 # other data series. So we have an API that can be upgraded to 2270 # support this, but do not implement it for now. An example 2271 # implementation of RG as an EXTERNAL block would be to find that 2272 # block and rewrite it, returning the number of blocks consumed. 2273 # 2274 # Returns 0 on success; 2275 # -1 if unable to edit; 2276 # -2 on other errors (eg I/O). 2277 # 2278 int cram_transcode_rg(cram_fd *input, cram_fd *output, 2279 cram_container *c, 2280 int nrg, int *in_rg, int *out_rg) 2281 2282 # 2283 # Copies the blocks representing the next num_slice slices from a 2284 # container from 'in' to 'out'. It is expected that the file pointer 2285 # is just after the read of the cram_container and cram compression 2286 # header. 2287 # 2288 # Returns 0 on success 2289 # -1 on failure 2290 # 2291 int cram_copy_slice(cram_fd *input, cram_fd *output, int32_t num_slice) 2292 2293 # 2294 #----------------------------------------------------------------------------- 2295 # SAM_hdr 2296 # 2297 2298 # Tokenises a SAM header into a hash table. 2299 # 2300 # Also extracts a few bits on specific data types, such as @RG lines. 2301 # 2302 # @return 2303 # Returns a SAM_hdr struct on success (free with sam_hdr_free()) 2304 # NULL on failure 2305 # 2306 SAM_hdr *sam_hdr_parse_(const char *hdr, int len) 2307 2308 2309 # 2310 #----------------------------------------------------------------------------- 2311 # cram_io basics 2312 # 2313 2314 # CRAM blocks - the dynamically growable data block. We have code to 2315 # create, update, (un)compress and read/write. 2316 # 2317 # These are derived from the deflate_interlaced.c blocks, but with the 2318 # CRAM extension of content types and IDs. 2319 # 2320 2321 # Allocates a new cram_block structure with a specified content_type and 2322 # id. 2323 # 2324 # @return 2325 # Returns block pointer on success; 2326 # NULL on failure 2327 # 2328 cram_block *cram_new_block(cram_content_type content_type, 2329 int content_id) 2330 2331 # Reads a block from a cram file. 2332 # 2333 # @return 2334 # Returns cram_block pointer on success; 2335 # NULL on failure 2336 # 2337 cram_block *cram_read_block(cram_fd *fd) 2338 2339 # Writes a CRAM block. 2340 # 2341 # @return 2342 # Returns 0 on success; 2343 # -1 on failure 2344 # 2345 int cram_write_block(cram_fd *fd, cram_block *b) 2346 2347 # Frees a CRAM block, deallocating internal data too. 2348 # 2349 void cram_free_block(cram_block *b) 2350 2351 # Uncompresses a CRAM block, if compressed. 2352 # 2353 # @return 2354 # Returns 0 on success; 2355 # -1 on failure 2356 # 2357 int cram_uncompress_block(cram_block *b) 2358 2359 # Compresses a block. 2360 # 2361 # Compresses a block using one of two different zlib strategies. If we only 2362 # want one choice set strat2 to be -1. 2363 # 2364 # The logic here is that sometimes Z_RLE does a better job than Z_FILTERED 2365 # or Z_DEFAULT_STRATEGY on quality data. If so, we'd rather use it as it is 2366 # significantly faster. 2367 # 2368 # @return 2369 # Returns 0 on success; 2370 # -1 on failure 2371 # 2372 int cram_compress_block(cram_fd *fd, cram_block *b, cram_metrics *metrics, 2373 int method, int level) 2374 2375 # Containers 2376 # 2377 2378 # Creates a new container, specifying the maximum number of slices 2379 # and records permitted. 2380 # 2381 # @return 2382 # Returns cram_container ptr on success; 2383 # NULL on failure 2384 # 2385 cram_container *cram_new_container(int nrec, int nslice) 2386 void cram_free_container(cram_container *c) 2387 2388 # Reads a container header. 2389 # 2390 # @return 2391 # Returns cram_container on success; 2392 # NULL on failure or no container left (fd->err == 0). 2393 # 2394 cram_container *cram_read_container(cram_fd *fd) 2395 2396 # Writes a container structure. 2397 # 2398 # @return 2399 # Returns 0 on success; 2400 # -1 on failure 2401 # 2402 int cram_write_container(cram_fd *fd, cram_container *h) 2403 2404 # 2405 # Stores the container structure in dat and returns *size as the 2406 # number of bytes written to dat[]. The input size of dat is also 2407 # held in *size and should be initialised to cram_container_size(c). 2408 # 2409 # Returns 0 on success; 2410 # -1 on failure 2411 # 2412 int cram_store_container(cram_fd *fd, cram_container *c, char *dat, int *size) 2413 2414 int cram_container_size(cram_container *c) 2415 2416 # The top-level cram opening, closing and option handling 2417 # 2418 2419 # Opens a CRAM file for read (mode "rb") or write ("wb"). 2420 # 2421 # The filename may be "-" to indicate stdin or stdout. 2422 # 2423 # @return 2424 # Returns file handle on success; 2425 # NULL on failure. 2426 # 2427 cram_fd *cram_open(const char *filename, const char *mode) 2428 2429 # Opens an existing stream for reading or writing. 2430 # 2431 # @return 2432 # Returns file handle on success; 2433 # NULL on failure. 2434 # 2435 cram_fd *cram_dopen(hFILE *fp, const char *filename, const char *mode) 2436 2437 # Closes a CRAM file. 2438 # 2439 # @return 2440 # Returns 0 on success; 2441 # -1 on failure 2442 # 2443 int cram_close(cram_fd *fd) 2444 2445 # 2446 # Seek within a CRAM file. 2447 # 2448 # Returns 0 on success 2449 # -1 on failure 2450 # 2451 int cram_seek(cram_fd *fd, off_t offset, int whence) 2452 2453 # 2454 # Flushes a CRAM file. 2455 # Useful for when writing to stdout without wishing to close the stream. 2456 # 2457 # Returns 0 on success 2458 # -1 on failure 2459 # 2460 int cram_flush(cram_fd *fd) 2461 2462 # Checks for end of file on a cram_fd stream. 2463 # 2464 # @return 2465 # Returns 0 if not at end of file 2466 # 1 if we hit an expected EOF (end of range or EOF block) 2467 # 2 for other EOF (end of stream without EOF block) 2468 # 2469 int cram_eof(cram_fd *fd) 2470 2471 # Sets options on the cram_fd. 2472 # 2473 # See CRAM_OPT_* definitions in hts.h. 2474 # Use this immediately after opening. 2475 # 2476 # @return 2477 # Returns 0 on success; 2478 # -1 on failure 2479 # 2480 int cram_set_option(cram_fd *fd, hts_fmt_option opt, ...) 2481 2482 # Sets options on the cram_fd. 2483 # 2484 # See CRAM_OPT_* definitions in hts.h. 2485 # Use this immediately after opening. 2486 # 2487 # @return 2488 # Returns 0 on success; 2489 # -1 on failure 2490 # 2491 int cram_set_voption(cram_fd *fd, hts_fmt_option opt, va_list args) 2492 2493 # 2494 # Attaches a header to a cram_fd. 2495 # 2496 # This should be used when creating a new cram_fd for writing where 2497 # we have an SAM_hdr already constructed (eg from a file we've read 2498 # in). 2499 # 2500 # @return 2501 # Returns 0 on success; 2502 # -1 on failure 2503 # 2504 int cram_set_header(cram_fd *fd, SAM_hdr *hdr) 2505 2506 # Check if this file has a proper EOF block 2507 # 2508 # @return 2509 # Returns 3 if the file is a version of CRAM that does not contain EOF blocks 2510 # 2 if the file is a stream and thus unseekable 2511 # 1 if the file contains an EOF block 2512 # 0 if the file does not contain an EOF block 2513 # -1 if an error occurred whilst reading the file or we could not seek back to where we were 2514 # 2515 # 2516 int cram_check_EOF(cram_fd *fd) 2517 2518 # As int32_decoded/encode, but from/to blocks instead of cram_fd */ 2519 int int32_put_blk(cram_block *b, int32_t val) 2520 2521 # Deallocates all storage used by a SAM_hdr struct. 2522 # 2523 # This also decrements the header reference count. If after decrementing 2524 # it is still non-zero then the header is assumed to be in use by another 2525 # caller and the free is not done. 2526 # 2527 # This is a synonym for sam_hdr_dec_ref(). 2528 # 2529 void sam_hdr_free(SAM_hdr *hdr) 2530 2531 # Returns the current length of the SAM_hdr in text form. 2532 # 2533 # Call sam_hdr_rebuild() first if editing has taken place. 2534 # 2535 int sam_hdr_length(SAM_hdr *hdr) 2536 2537 # Returns the string form of the SAM_hdr. 2538 # 2539 # Call sam_hdr_rebuild() first if editing has taken place. 2540 # 2541 char *sam_hdr_str(SAM_hdr *hdr) 2542 2543 # Appends a formatted line to an existing SAM header. 2544 # 2545 # Line is a full SAM header record, eg "@SQ\tSN:foo\tLN:100", with 2546 # optional new-line. If it contains more than 1 line then multiple lines 2547 # will be added in order. 2548 # 2549 # Len is the length of the text data, or 0 if unknown (in which case 2550 # it should be null terminated). 2551 # 2552 # @return 2553 # Returns 0 on success; 2554 # -1 on failure 2555 # 2556 2557 # Add an @PG line. 2558 # 2559 # If we wish complete control over this use sam_hdr_add() directly. This 2560 # function uses that, but attempts to do a lot of tedious house work for 2561 # you too. 2562 # 2563 # - It will generate a suitable ID if the supplied one clashes. 2564 # - It will generate multiple @PG records if we have multiple PG chains. 2565 # 2566 # Call it as per sam_hdr_add() with a series of key,value pairs ending 2567 # in NULL. 2568 # 2569 # @return 2570 # Returns 0 on success; 2571 # -1 on failure 2572 # 2573 int sam_hdr_add_PG(SAM_hdr *sh, const char *name, ...) 2574 2575 # 2576 # A function to help with construction of CL tags in @PG records. 2577 # Takes an argc, argv pair and returns a single space-separated string. 2578 # This string should be deallocated by the calling function. 2579 # 2580 # @return 2581 # Returns malloced char * on success; 2582 # NULL on failure 2583 # 2584 char *stringify_argv(int argc, char *argv[]) 2585 2586 # 2587 # Returns the refs_t structure used by a cram file handle. 2588 # 2589 # This may be used in conjunction with option CRAM_OPT_SHARED_REF to 2590 # share reference memory between multiple file handles. 2591 # 2592 # @return 2593 # Returns NULL if none exists or the file handle is not a CRAM file. 2594 # 2595 refs_t *cram_get_refs(htsFile *fd) 2596 2597 2598cdef class HTSFile(object): 2599 cdef htsFile *htsfile # pointer to htsFile structure 2600 cdef int64_t start_offset # BGZF offset of first record 2601 2602 cdef readonly object filename # filename as supplied by user 2603 cdef readonly object mode # file opening mode 2604 cdef readonly object threads # number of threads to use 2605 cdef readonly object index_filename # filename of index, if supplied by user 2606 2607 cdef readonly bint is_stream # Is htsfile a non-seekable stream 2608 cdef readonly bint is_remote # Is htsfile a remote stream 2609 cdef readonly bint duplicate_filehandle # Duplicate filehandle when opening via fh 2610 2611 cdef htsFile *_open_htsfile(self) except? NULL 2612