1# cython: embedsignature=True 2# cython: profile=True 3############################################################################### 4############################################################################### 5# Cython wrapper for SAM/BAM/CRAM files based on htslib 6############################################################################### 7# The principal classes defined in this module are: 8# 9# class AlignedSegment an aligned segment (read) 10# 11# class PileupColumn a collection of segments (PileupRead) aligned to 12# a particular genomic position. 13# 14# class PileupRead an AlignedSegment aligned to a particular genomic 15# position. Contains additional attributes with respect 16# to this. 17# 18# Additionally this module defines numerous additional classes that are part 19# of the internal API. These are: 20# 21# Various iterator classes to iterate over alignments in sequential (IteratorRow) 22# or in a stacked fashion (IteratorColumn): 23# 24# class IteratorRow 25# class IteratorRowRegion 26# class IteratorRowHead 27# class IteratorRowAll 28# class IteratorRowAllRefs 29# class IteratorRowSelection 30# 31############################################################################### 32# 33# The MIT License 34# 35# Copyright (c) 2015 Andreas Heger 36# 37# Permission is hereby granted, free of charge, to any person obtaining a 38# copy of this software and associated documentation files (the "Software"), 39# to deal in the Software without restriction, including without limitation 40# the rights to use, copy, modify, merge, publish, distribute, sublicense, 41# and/or sell copies of the Software, and to permit persons to whom the 42# Software is furnished to do so, subject to the following conditions: 43# 44# The above copyright notice and this permission notice shall be included in 45# all copies or substantial portions of the Software. 46# 47# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 48# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 49# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 50# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 51# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 52# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER 53# DEALINGS IN THE SOFTWARE. 54# 55############################################################################### 56import re 57import array 58import json 59import string 60import ctypes 61import struct 62 63cimport cython 64from cpython cimport array as c_array 65from cpython.version cimport PY_MAJOR_VERSION 66from cpython cimport PyBytes_FromStringAndSize 67from libc.string cimport strchr 68from cpython cimport array as c_array 69from libc.stdint cimport INT8_MIN, INT16_MIN, INT32_MIN, \ 70 INT8_MAX, INT16_MAX, INT32_MAX, \ 71 UINT8_MAX, UINT16_MAX, UINT32_MAX 72 73from pysam.libchtslib cimport HTS_IDX_NOCOOR 74from pysam.libcutils cimport force_bytes, force_str, \ 75 charptr_to_str, charptr_to_bytes 76from pysam.libcutils cimport qualities_to_qualitystring, qualitystring_to_array, \ 77 array_to_qualitystring 78 79# Constants for binary tag conversion 80cdef char * htslib_types = 'cCsSiIf' 81cdef char * parray_types = 'bBhHiIf' 82 83cdef bint IS_PYTHON3 = PY_MAJOR_VERSION >= 3 84 85# translation tables 86 87# cigar code to character and vice versa 88cdef char* CODE2CIGAR= "MIDNSHP=XB" 89cdef int NCIGAR_CODES = 10 90 91if IS_PYTHON3: 92 CIGAR2CODE = dict([y, x] for x, y in enumerate(CODE2CIGAR)) 93 maketrans = str.maketrans 94else: 95 CIGAR2CODE = dict([ord(y), x] for x, y in enumerate(CODE2CIGAR)) 96 maketrans = string.maketrans 97 98CIGAR_REGEX = re.compile("(\d+)([MIDNSHP=XB])") 99 100# names for keys in dictionary representation of an AlignedSegment 101KEY_NAMES = ["name", "flag", "ref_name", "ref_pos", "map_quality", "cigar", 102 "next_ref_name", "next_ref_pos", "length", "seq", "qual", "tags"] 103 104##################################################################### 105# C multiplication with wrapping around 106cdef inline uint32_t c_mul(uint32_t a, uint32_t b): 107 return (a * b) & 0xffffffff 108 109 110cdef inline uint8_t tolower(uint8_t ch): 111 if ch >= 65 and ch <= 90: 112 return ch + 32 113 else: 114 return ch 115 116 117cdef inline uint8_t toupper(uint8_t ch): 118 if ch >= 97 and ch <= 122: 119 return ch - 32 120 else: 121 return ch 122 123 124cdef inline uint8_t strand_mark_char(uint8_t ch, bam1_t *b): 125 if ch == '=': 126 if bam_is_rev(b): 127 return ',' 128 else: 129 return '.' 130 else: 131 if bam_is_rev(b): 132 return tolower(ch) 133 else: 134 return toupper(ch) 135 136 137cdef inline bint pileup_base_qual_skip(const bam_pileup1_t * p, uint32_t threshold): 138 cdef uint32_t c 139 if p.qpos < p.b.core.l_qseq: 140 c = bam_get_qual(p.b)[p.qpos] 141 else: 142 c = 0 143 if c < threshold: 144 return True 145 return False 146 147 148cdef inline char map_typecode_htslib_to_python(uint8_t s): 149 """map an htslib typecode to the corresponding python typecode 150 to be used in the struct or array modules.""" 151 152 # map type from htslib to python array 153 cdef char * f = strchr(htslib_types, s) 154 155 if f == NULL: 156 return 0 157 return parray_types[f - htslib_types] 158 159 160cdef inline uint8_t map_typecode_python_to_htslib(char s): 161 """determine value type from type code of array""" 162 cdef char * f = strchr(parray_types, s) 163 if f == NULL: 164 return 0 165 return htslib_types[f - parray_types] 166 167 168cdef inline void update_bin(bam1_t * src): 169 if src.core.flag & BAM_FUNMAP: 170 # treat alignment as length of 1 for unmapped reads 171 src.core.bin = hts_reg2bin( 172 src.core.pos, 173 src.core.pos + 1, 174 14, 175 5) 176 elif pysam_get_n_cigar(src): 177 src.core.bin = hts_reg2bin( 178 src.core.pos, 179 bam_endpos(src), 180 14, 181 5) 182 else: 183 src.core.bin = hts_reg2bin( 184 src.core.pos, 185 src.core.pos + 1, 186 14, 187 5) 188 189 190# optional tag data manipulation 191cdef convert_binary_tag(uint8_t * tag): 192 """return bytesize, number of values and array of values 193 in aux_data memory location pointed to by tag.""" 194 cdef uint8_t auxtype 195 cdef uint8_t byte_size 196 cdef int32_t nvalues 197 # get byte size 198 auxtype = tag[0] 199 byte_size = aux_type2size(auxtype) 200 tag += 1 201 # get number of values in array 202 nvalues = (<int32_t*>tag)[0] 203 tag += 4 204 205 # define python array 206 cdef c_array.array c_values = array.array( 207 chr(map_typecode_htslib_to_python(auxtype))) 208 c_array.resize(c_values, nvalues) 209 210 # copy data 211 memcpy(c_values.data.as_voidptr, <uint8_t*>tag, nvalues * byte_size) 212 213 # no need to check for endian-ness as bam1_core_t fields 214 # and aux_data are in host endian-ness. See sam.c and calls 215 # to swap_data 216 return byte_size, nvalues, c_values 217 218 219cdef inline uint8_t get_tag_typecode(value, value_type=None): 220 """guess type code for a *value*. If *value_type* is None, the type 221 code will be inferred based on the Python type of *value* 222 223 """ 224 # 0 is unknown typecode 225 cdef char typecode = 0 226 227 if value_type is None: 228 if isinstance(value, int): 229 if value < 0: 230 if value >= INT8_MIN: 231 typecode = 'c' 232 elif value >= INT16_MIN: 233 typecode = 's' 234 elif value >= INT32_MIN: 235 typecode = 'i' 236 # unsigned ints 237 else: 238 if value <= UINT8_MAX: 239 typecode = 'C' 240 elif value <= UINT16_MAX: 241 typecode = 'S' 242 elif value <= UINT32_MAX: 243 typecode = 'I' 244 elif isinstance(value, float): 245 typecode = 'f' 246 elif isinstance(value, str): 247 typecode = 'Z' 248 elif isinstance(value, bytes): 249 typecode = 'Z' 250 elif isinstance(value, array.array) or \ 251 isinstance(value, list) or \ 252 isinstance(value, tuple): 253 typecode = 'B' 254 else: 255 if value_type in 'aAsSIcCZidfH': 256 typecode = force_bytes(value_type)[0] 257 258 return typecode 259 260 261cdef inline uint8_t get_btag_typecode(value, min_value=None, max_value=None): 262 '''returns the value typecode of a value. 263 264 If max is specified, the appropriate type is returned for a range 265 where value is the minimum. 266 267 Note that this method returns types from the extended BAM alphabet 268 of types that includes tags that are not part of the SAM 269 specification. 270 ''' 271 272 273 cdef uint8_t typecode 274 275 t = type(value) 276 277 if t is float: 278 typecode = 'f' 279 elif t is int: 280 if max_value is None: 281 max_value = value 282 if min_value is None: 283 min_value = value 284 # signed ints 285 if min_value < 0: 286 if min_value >= INT8_MIN and max_value <= INT8_MAX: 287 typecode = 'c' 288 elif min_value >= INT16_MIN and max_value <= INT16_MAX: 289 typecode = 's' 290 elif min_value >= INT32_MIN or max_value <= INT32_MAX: 291 typecode = 'i' 292 else: 293 raise ValueError( 294 "at least one signed integer out of range of " 295 "BAM/SAM specification") 296 # unsigned ints 297 else: 298 if max_value <= UINT8_MAX: 299 typecode = 'C' 300 elif max_value <= UINT16_MAX: 301 typecode = 'S' 302 elif max_value <= UINT32_MAX: 303 typecode = 'I' 304 else: 305 raise ValueError( 306 "at least one integer out of range of BAM/SAM specification") 307 else: 308 # Note: hex strings (H) are not supported yet 309 if t is not bytes: 310 value = value.encode('ascii') 311 if len(value) == 1: 312 typecode = 'A' 313 else: 314 typecode = 'Z' 315 316 return typecode 317 318 319# mapping python array.array and htslib typecodes to struct typecodes 320DATATYPE2FORMAT = { 321 ord('c'): ('b', 1), 322 ord('C'): ('B', 1), 323 ord('s'): ('h', 2), 324 ord('S'): ('H', 2), 325 ord('i'): ('i', 4), 326 ord('I'): ('I', 4), 327 ord('f'): ('f', 4), 328 ord('d'): ('d', 8), 329 ord('A'): ('c', 1), 330 ord('a'): ('c', 1)} 331 332 333cdef inline pack_tags(tags): 334 """pack a list of tags. Each tag is a tuple of (tag, tuple). 335 336 Values are packed into the most space efficient data structure 337 possible unless the tag contains a third field with the typecode. 338 339 Returns a format string and the associated list of arguments to be 340 used in a call to struct.pack_into. 341 """ 342 fmts, args = ["<"], [] 343 344 # htslib typecode 345 cdef uint8_t typecode 346 for tag in tags: 347 348 if len(tag) == 2: 349 pytag, value = tag 350 valuetype = None 351 elif len(tag) == 3: 352 pytag, value, valuetype = tag 353 else: 354 raise ValueError("malformatted tag: %s" % str(tag)) 355 356 if valuetype is None: 357 typecode = 0 358 else: 359 # only first character in valuecode matters 360 if IS_PYTHON3: 361 typecode = force_bytes(valuetype)[0] 362 else: 363 typecode = ord(valuetype[0]) 364 365 pytag = force_bytes(pytag) 366 pytype = type(value) 367 368 if pytype is tuple or pytype is list: 369 # binary tags from tuples or lists 370 if not typecode: 371 # automatically determine value type - first value 372 # determines type. If there is a mix of types, the 373 # result is undefined. 374 typecode = get_btag_typecode(min(value), 375 min_value=min(value), 376 max_value=max(value)) 377 378 if typecode not in DATATYPE2FORMAT: 379 raise ValueError("invalid value type '{}'".format(chr(typecode))) 380 381 datafmt = "2sBBI%i%s" % (len(value), DATATYPE2FORMAT[typecode][0]) 382 args.extend([pytag[:2], 383 ord("B"), 384 typecode, 385 len(value)] + list(value)) 386 387 elif isinstance(value, array.array): 388 # binary tags from arrays 389 if typecode == 0: 390 typecode = map_typecode_python_to_htslib(ord(value.typecode)) 391 392 if typecode == 0: 393 raise ValueError("unsupported type code '{}'".format(value.typecode)) 394 395 if typecode not in DATATYPE2FORMAT: 396 raise ValueError("invalid value type '{}' ({})".format(chr(typecode), array.typecode)) 397 398 # use array.tostring() to retrieve byte representation and 399 # save as bytes 400 datafmt = "2sBBI%is" % (len(value) * DATATYPE2FORMAT[typecode][1]) 401 if IS_PYTHON3: 402 args.extend([pytag[:2], 403 ord("B"), 404 typecode, 405 len(value), 406 value.tobytes()]) 407 else: 408 args.extend([pytag[:2], 409 ord("B"), 410 typecode, 411 len(value), 412 force_bytes(value.tostring())]) 413 414 else: 415 if typecode == 0: 416 typecode = get_tag_typecode(value) 417 if typecode == 0: 418 raise ValueError("could not deduce typecode for value {}".format(value)) 419 420 if typecode == 'a' or typecode == 'A' or typecode == 'Z' or typecode == 'H': 421 value = force_bytes(value) 422 423 if typecode == "a": 424 typecode = 'A' 425 426 if typecode == 'Z' or typecode == 'H': 427 datafmt = "2sB%is" % (len(value)+1) 428 else: 429 datafmt = "2sB%s" % DATATYPE2FORMAT[typecode][0] 430 431 args.extend([pytag[:2], 432 typecode, 433 value]) 434 435 fmts.append(datafmt) 436 437 return "".join(fmts), args 438 439 440cdef inline int32_t calculateQueryLengthWithoutHardClipping(bam1_t * src): 441 """return query length computed from CIGAR alignment. 442 443 Length ignores hard-clipped bases. 444 445 Return 0 if there is no CIGAR alignment. 446 """ 447 448 cdef uint32_t * cigar_p = pysam_bam_get_cigar(src) 449 450 if cigar_p == NULL: 451 return 0 452 453 cdef uint32_t k, qpos 454 cdef int op 455 qpos = 0 456 457 for k from 0 <= k < pysam_get_n_cigar(src): 458 op = cigar_p[k] & BAM_CIGAR_MASK 459 460 if op == BAM_CMATCH or \ 461 op == BAM_CINS or \ 462 op == BAM_CSOFT_CLIP or \ 463 op == BAM_CEQUAL or \ 464 op == BAM_CDIFF: 465 qpos += cigar_p[k] >> BAM_CIGAR_SHIFT 466 467 return qpos 468 469 470cdef inline int32_t calculateQueryLengthWithHardClipping(bam1_t * src): 471 """return query length computed from CIGAR alignment. 472 473 Length includes hard-clipped bases. 474 475 Return 0 if there is no CIGAR alignment. 476 """ 477 478 cdef uint32_t * cigar_p = pysam_bam_get_cigar(src) 479 480 if cigar_p == NULL: 481 return 0 482 483 cdef uint32_t k, qpos 484 cdef int op 485 qpos = 0 486 487 for k from 0 <= k < pysam_get_n_cigar(src): 488 op = cigar_p[k] & BAM_CIGAR_MASK 489 490 if op == BAM_CMATCH or \ 491 op == BAM_CINS or \ 492 op == BAM_CSOFT_CLIP or \ 493 op == BAM_CHARD_CLIP or \ 494 op == BAM_CEQUAL or \ 495 op == BAM_CDIFF: 496 qpos += cigar_p[k] >> BAM_CIGAR_SHIFT 497 498 return qpos 499 500 501cdef inline int32_t getQueryStart(bam1_t *src) except -1: 502 cdef uint32_t * cigar_p 503 cdef uint32_t start_offset = 0 504 cdef uint32_t k, op 505 506 cigar_p = pysam_bam_get_cigar(src); 507 for k from 0 <= k < pysam_get_n_cigar(src): 508 op = cigar_p[k] & BAM_CIGAR_MASK 509 if op == BAM_CHARD_CLIP: 510 if start_offset != 0 and start_offset != src.core.l_qseq: 511 raise ValueError('Invalid clipping in CIGAR string') 512 elif op == BAM_CSOFT_CLIP: 513 start_offset += cigar_p[k] >> BAM_CIGAR_SHIFT 514 else: 515 break 516 517 return start_offset 518 519 520cdef inline int32_t getQueryEnd(bam1_t *src) except -1: 521 cdef uint32_t * cigar_p = pysam_bam_get_cigar(src) 522 cdef uint32_t end_offset = src.core.l_qseq 523 cdef uint32_t k, op 524 525 # if there is no sequence, compute length from cigar string 526 if end_offset == 0: 527 for k from 0 <= k < pysam_get_n_cigar(src): 528 op = cigar_p[k] & BAM_CIGAR_MASK 529 if op == BAM_CMATCH or \ 530 op == BAM_CINS or \ 531 op == BAM_CEQUAL or \ 532 op == BAM_CDIFF or \ 533 (op == BAM_CSOFT_CLIP and end_offset == 0): 534 end_offset += cigar_p[k] >> BAM_CIGAR_SHIFT 535 else: 536 # walk backwards in cigar string 537 for k from pysam_get_n_cigar(src) > k >= 1: 538 op = cigar_p[k] & BAM_CIGAR_MASK 539 if op == BAM_CHARD_CLIP: 540 if end_offset != src.core.l_qseq: 541 raise ValueError('Invalid clipping in CIGAR string') 542 elif op == BAM_CSOFT_CLIP: 543 end_offset -= cigar_p[k] >> BAM_CIGAR_SHIFT 544 else: 545 break 546 547 return end_offset 548 549 550cdef inline bytes getSequenceInRange(bam1_t *src, 551 uint32_t start, 552 uint32_t end): 553 """return python string of the sequence in a bam1_t object. 554 """ 555 556 cdef uint8_t * p 557 cdef uint32_t k 558 cdef char * s 559 560 if not src.core.l_qseq: 561 return None 562 563 seq = PyBytes_FromStringAndSize(NULL, end - start) 564 s = <char*>seq 565 p = pysam_bam_get_seq(src) 566 567 for k from start <= k < end: 568 # equivalent to seq_nt16_str[bam1_seqi(s, i)] (see bam.c) 569 # note: do not use string literal as it will be a python string 570 s[k-start] = seq_nt16_str[p[k/2] >> 4 * (1 - k%2) & 0xf] 571 572 return charptr_to_bytes(seq) 573 574 575cdef inline object getQualitiesInRange(bam1_t *src, 576 uint32_t start, 577 uint32_t end): 578 """return python array of quality values from a bam1_t object""" 579 580 cdef uint8_t * p 581 cdef uint32_t k 582 583 p = pysam_bam_get_qual(src) 584 if p[0] == 0xff: 585 return None 586 587 # 'B': unsigned char 588 cdef c_array.array result = array.array('B', [0]) 589 c_array.resize(result, end - start) 590 591 # copy data 592 memcpy(result.data.as_voidptr, <void*>&p[start], end - start) 593 594 return result 595 596 597##################################################################### 598## factory methods for instantiating extension classes 599cdef class AlignedSegment 600cdef AlignedSegment makeAlignedSegment(bam1_t *src, 601 AlignmentHeader header): 602 '''return an AlignedSegment object constructed from `src`''' 603 # note that the following does not call __init__ 604 cdef AlignedSegment dest = AlignedSegment.__new__(AlignedSegment) 605 dest._delegate = bam_dup1(src) 606 dest.header = header 607 return dest 608 609 610cdef class PileupColumn 611cdef PileupColumn makePileupColumn(const bam_pileup1_t ** plp, 612 int tid, 613 int pos, 614 int n_pu, 615 uint32_t min_base_quality, 616 char * reference_sequence, 617 AlignmentHeader header): 618 '''return a PileupColumn object constructed from pileup in `plp` and 619 setting additional attributes. 620 621 ''' 622 # note that the following does not call __init__ 623 cdef PileupColumn dest = PileupColumn.__new__(PileupColumn) 624 dest.header = header 625 dest.plp = plp 626 dest.tid = tid 627 dest.pos = pos 628 dest.n_pu = n_pu 629 dest.min_base_quality = min_base_quality 630 dest.reference_sequence = reference_sequence 631 dest.buf.l = dest.buf.m = 0 632 dest.buf.s = NULL 633 634 return dest 635 636 637cdef class PileupRead 638cdef PileupRead makePileupRead(const bam_pileup1_t *src, 639 AlignmentHeader header): 640 '''return a PileupRead object construted from a bam_pileup1_t * object.''' 641 # note that the following does not call __init__ 642 cdef PileupRead dest = PileupRead.__new__(PileupRead) 643 dest._alignment = makeAlignedSegment(src.b, header) 644 dest._qpos = src.qpos 645 dest._indel = src.indel 646 dest._level = src.level 647 dest._is_del = src.is_del 648 dest._is_head = src.is_head 649 dest._is_tail = src.is_tail 650 dest._is_refskip = src.is_refskip 651 return dest 652 653 654cdef inline uint32_t get_alignment_length(bam1_t *src): 655 cdef uint32_t k = 0 656 cdef uint32_t l = 0 657 if src == NULL: 658 return 0 659 cdef uint32_t * cigar_p = bam_get_cigar(src) 660 if cigar_p == NULL: 661 return 0 662 cdef int op 663 cdef uint32_t n = pysam_get_n_cigar(src) 664 for k from 0 <= k < n: 665 op = cigar_p[k] & BAM_CIGAR_MASK 666 if op == BAM_CSOFT_CLIP or op == BAM_CHARD_CLIP: 667 continue 668 l += cigar_p[k] >> BAM_CIGAR_SHIFT 669 return l 670 671 672cdef inline uint32_t get_md_reference_length(char * md_tag): 673 cdef int l = 0 674 cdef int md_idx = 0 675 cdef int nmatches = 0 676 677 while md_tag[md_idx] != 0: 678 if md_tag[md_idx] >= 48 and md_tag[md_idx] <= 57: 679 nmatches *= 10 680 nmatches += md_tag[md_idx] - 48 681 md_idx += 1 682 continue 683 else: 684 l += nmatches 685 nmatches = 0 686 if md_tag[md_idx] == '^': 687 md_idx += 1 688 while md_tag[md_idx] >= 65 and md_tag[md_idx] <= 90: 689 md_idx += 1 690 l += 1 691 else: 692 md_idx += 1 693 l += 1 694 695 l += nmatches 696 return l 697 698# TODO: avoid string copying for getSequenceInRange, reconstituneSequenceFromMD, ... 699cdef inline bytes build_alignment_sequence(bam1_t * src): 700 """return expanded sequence from MD tag. 701 702 The sequence includes substitutions and both insertions in the 703 reference as well as deletions to the reference sequence. Combine 704 with the cigar string to reconstitute the query or the reference 705 sequence. 706 707 Positions corresponding to `N` (skipped region from the reference) 708 in the CIGAR string will not appear in the returned sequence. The 709 MD should correspondingly not contain these. Thus proper tags are:: 710 711 Deletion from the reference: cigar=5M1D5M MD=5^C5 712 Skipped region from reference: cigar=5M1N5M MD=10 713 714 Returns 715 ------- 716 717 None, if no MD tag is present. 718 719 """ 720 if src == NULL: 721 return None 722 723 cdef uint8_t * md_tag_ptr = bam_aux_get(src, "MD") 724 if md_tag_ptr == NULL: 725 return None 726 727 cdef uint32_t start = getQueryStart(src) 728 cdef uint32_t end = getQueryEnd(src) 729 # get read sequence, taking into account soft-clipping 730 r = getSequenceInRange(src, start, end) 731 cdef char * read_sequence = r 732 cdef uint32_t * cigar_p = pysam_bam_get_cigar(src) 733 if cigar_p == NULL: 734 return None 735 736 cdef uint32_t r_idx = 0 737 cdef int op 738 cdef uint32_t k, i, l, x 739 cdef int nmatches = 0 740 cdef int s_idx = 0 741 742 cdef uint32_t max_len = get_alignment_length(src) 743 if max_len == 0: 744 raise ValueError("could not determine alignment length") 745 746 cdef char * s = <char*>calloc(max_len + 1, sizeof(char)) 747 if s == NULL: 748 raise ValueError( 749 "could not allocate sequence of length %i" % max_len) 750 751 for k from 0 <= k < pysam_get_n_cigar(src): 752 op = cigar_p[k] & BAM_CIGAR_MASK 753 l = cigar_p[k] >> BAM_CIGAR_SHIFT 754 if op == BAM_CMATCH or op == BAM_CEQUAL or op == BAM_CDIFF: 755 for i from 0 <= i < l: 756 s[s_idx] = read_sequence[r_idx] 757 r_idx += 1 758 s_idx += 1 759 elif op == BAM_CDEL: 760 for i from 0 <= i < l: 761 s[s_idx] = '-' 762 s_idx += 1 763 elif op == BAM_CREF_SKIP: 764 pass 765 elif op == BAM_CINS: 766 for i from 0 <= i < l: 767 # encode insertions into reference as lowercase 768 s[s_idx] = read_sequence[r_idx] + 32 769 r_idx += 1 770 s_idx += 1 771 elif op == BAM_CSOFT_CLIP: 772 pass 773 elif op == BAM_CHARD_CLIP: 774 pass # advances neither 775 elif op == BAM_CPAD: 776 raise NotImplementedError( 777 "Padding (BAM_CPAD, 6) is currently not supported. " 778 "Please implement. Sorry about that.") 779 780 cdef char * md_tag = <char*>bam_aux2Z(md_tag_ptr) 781 cdef int md_idx = 0 782 cdef char c 783 s_idx = 0 784 785 # Check if MD tag is valid by matching CIGAR length to MD tag defined length 786 # Insertions would be in addition to what is described by MD, so we calculate 787 # the number of insertions separately. 788 cdef int insertions = 0 789 790 while s[s_idx] != 0: 791 if s[s_idx] >= 'a': 792 insertions += 1 793 s_idx += 1 794 s_idx = 0 795 796 cdef uint32_t md_len = get_md_reference_length(md_tag) 797 if md_len + insertions > max_len: 798 raise AssertionError( 799 "Invalid MD tag: MD length {} mismatch with CIGAR length {} and {} insertions".format( 800 md_len, max_len, insertions)) 801 802 while md_tag[md_idx] != 0: 803 # c is numerical 804 if md_tag[md_idx] >= 48 and md_tag[md_idx] <= 57: 805 nmatches *= 10 806 nmatches += md_tag[md_idx] - 48 807 md_idx += 1 808 continue 809 else: 810 # save matches up to this point, skipping insertions 811 for x from 0 <= x < nmatches: 812 while s[s_idx] >= 'a': 813 s_idx += 1 814 s_idx += 1 815 while s[s_idx] >= 'a': 816 s_idx += 1 817 818 r_idx += nmatches 819 nmatches = 0 820 if md_tag[md_idx] == '^': 821 md_idx += 1 822 while md_tag[md_idx] >= 65 and md_tag[md_idx] <= 90: 823 # assert s[s_idx] == '-' 824 s[s_idx] = md_tag[md_idx] 825 s_idx += 1 826 md_idx += 1 827 else: 828 # save mismatch 829 # enforce lower case 830 c = md_tag[md_idx] 831 if c <= 90: 832 c += 32 833 s[s_idx] = c 834 s_idx += 1 835 r_idx += 1 836 md_idx += 1 837 838 # save matches up to this point, skipping insertions 839 for x from 0 <= x < nmatches: 840 while s[s_idx] >= 'a': 841 s_idx += 1 842 s_idx += 1 843 while s[s_idx] >= 'a': 844 s_idx += 1 845 846 seq = PyBytes_FromStringAndSize(s, s_idx) 847 free(s) 848 849 return seq 850 851 852cdef inline bytes build_reference_sequence(bam1_t * src): 853 """return the reference sequence in the region that is covered by the 854 alignment of the read to the reference. 855 856 This method requires the MD tag to be set. 857 858 """ 859 cdef uint32_t k, i, l 860 cdef int op 861 cdef int s_idx = 0 862 ref_seq = build_alignment_sequence(src) 863 if ref_seq is None: 864 raise ValueError("MD tag not present") 865 866 cdef char * s = <char*>calloc(len(ref_seq) + 1, sizeof(char)) 867 if s == NULL: 868 raise ValueError( 869 "could not allocate sequence of length %i" % len(ref_seq)) 870 871 cdef char * cref_seq = ref_seq 872 cdef uint32_t * cigar_p = pysam_bam_get_cigar(src) 873 cdef uint32_t r_idx = 0 874 for k from 0 <= k < pysam_get_n_cigar(src): 875 op = cigar_p[k] & BAM_CIGAR_MASK 876 l = cigar_p[k] >> BAM_CIGAR_SHIFT 877 if op == BAM_CMATCH or op == BAM_CEQUAL or op == BAM_CDIFF: 878 for i from 0 <= i < l: 879 s[s_idx] = cref_seq[r_idx] 880 r_idx += 1 881 s_idx += 1 882 elif op == BAM_CDEL: 883 for i from 0 <= i < l: 884 s[s_idx] = cref_seq[r_idx] 885 r_idx += 1 886 s_idx += 1 887 elif op == BAM_CREF_SKIP: 888 pass 889 elif op == BAM_CINS: 890 r_idx += l 891 elif op == BAM_CSOFT_CLIP: 892 pass 893 elif op == BAM_CHARD_CLIP: 894 pass # advances neither 895 elif op == BAM_CPAD: 896 raise NotImplementedError( 897 "Padding (BAM_CPAD, 6) is currently not supported. " 898 "Please implement. Sorry about that.") 899 900 seq = PyBytes_FromStringAndSize(s, s_idx) 901 free(s) 902 903 return seq 904 905 906cdef class AlignedSegment: 907 '''Class representing an aligned segment. 908 909 This class stores a handle to the samtools C-structure representing 910 an aligned read. Member read access is forwarded to the C-structure 911 and converted into python objects. This implementation should be fast, 912 as only the data needed is converted. 913 914 For write access, the C-structure is updated in-place. This is 915 not the most efficient way to build BAM entries, as the variable 916 length data is concatenated and thus needs to be resized if 917 a field is updated. Furthermore, the BAM entry might be 918 in an inconsistent state. 919 920 One issue to look out for is that the sequence should always 921 be set *before* the quality scores. Setting the sequence will 922 also erase any quality scores that were set previously. 923 924 Parameters 925 ---------- 926 927 header: 928 :class:`~pysam.AlignmentHeader` object to map numerical 929 identifiers to chromosome names. If not given, an empty 930 header is created. 931 ''' 932 933 # Now only called when instances are created from Python 934 def __init__(self, AlignmentHeader header=None): 935 # see bam_init1 936 self._delegate = <bam1_t*>calloc(1, sizeof(bam1_t)) 937 if self._delegate == NULL: 938 raise MemoryError("could not allocated memory of {} bytes".format(sizeof(bam1_t))) 939 # allocate some memory. If size is 0, calloc does not return a 940 # pointer that can be passed to free() so allocate 40 bytes 941 # for a new read 942 self._delegate.m_data = 40 943 self._delegate.data = <uint8_t *>calloc( 944 self._delegate.m_data, 1) 945 if self._delegate.data == NULL: 946 raise MemoryError("could not allocate memory of {} bytes".format(self._delegate.m_data)) 947 self._delegate.l_data = 0 948 # set some data to make read approximately legit. 949 # Note, SAM writing fails with q_name of length 0 950 self._delegate.core.l_qname = 0 951 self._delegate.core.tid = -1 952 self._delegate.core.pos = -1 953 self._delegate.core.mtid = -1 954 self._delegate.core.mpos = -1 955 956 # caching for selected fields 957 self.cache_query_qualities = None 958 self.cache_query_alignment_qualities = None 959 self.cache_query_sequence = None 960 self.cache_query_alignment_sequence = None 961 962 self.header = header 963 964 def __dealloc__(self): 965 bam_destroy1(self._delegate) 966 967 def __str__(self): 968 """return string representation of alignment. 969 970 The representation is an approximate :term:`SAM` format, because 971 an aligned read might not be associated with a :term:`AlignmentFile`. 972 As a result :term:`tid` is shown instead of the reference name. 973 Similarly, the tags field is returned in its parsed state. 974 975 To get a valid SAM record, use :meth:`to_string`. 976 """ 977 # sam-parsing is done in sam.c/bam_format1_core which 978 # requires a valid header. 979 return "\t".join(map(str, (self.query_name, 980 self.flag, 981 "#%d" % self.reference_id if self.reference_id >= 0 else "*", 982 self.reference_start + 1, 983 self.mapping_quality, 984 self.cigarstring, 985 "#%d" % self.next_reference_id if self.next_reference_id >= 0 else "*", 986 self.next_reference_start + 1, 987 self.template_length, 988 self.query_sequence, 989 self.query_qualities, 990 self.tags))) 991 992 def __copy__(self): 993 return makeAlignedSegment(self._delegate, self.header) 994 995 def __deepcopy__(self, memo): 996 return makeAlignedSegment(self._delegate, self.header) 997 998 def compare(self, AlignedSegment other): 999 '''return -1,0,1, if contents in this are binary 1000 <,=,> to *other* 1001 ''' 1002 1003 # avoid segfault when other equals None 1004 if other is None: 1005 return -1 1006 1007 cdef int retval, x 1008 cdef bam1_t *t 1009 cdef bam1_t *o 1010 1011 t = self._delegate 1012 o = other._delegate 1013 1014 # uncomment for debugging purposes 1015 # cdef unsigned char * oo, * tt 1016 # tt = <unsigned char*>(&t.core) 1017 # oo = <unsigned char*>(&o.core) 1018 # for x from 0 <= x < sizeof( bam1_core_t): print x, tt[x], oo[x] 1019 # tt = <unsigned char*>(t.data) 1020 # oo = <unsigned char*>(o.data) 1021 # for x from 0 <= x < max(t.l_data, o.l_data): print x, tt[x], oo[x], chr(tt[x]), chr(oo[x]) 1022 1023 # Fast-path test for object identity 1024 if t == o: 1025 return 0 1026 1027 cdef uint8_t *a = <uint8_t*>&t.core 1028 cdef uint8_t *b = <uint8_t*>&o.core 1029 1030 retval = memcmp(&t.core, &o.core, sizeof(bam1_core_t)) 1031 if retval: 1032 return retval 1033 1034 # cmp(t.l_data, o.l_data) 1035 retval = (t.l_data > o.l_data) - (t.l_data < o.l_data) 1036 if retval: 1037 return retval 1038 return memcmp(t.data, o.data, t.l_data) 1039 1040 def __richcmp__(self, AlignedSegment other, int op): 1041 if op == 2: # == operator 1042 return self.compare(other) == 0 1043 elif op == 3: # != operator 1044 return self.compare(other) != 0 1045 else: 1046 return NotImplemented 1047 1048 def __hash__(self): 1049 cdef bam1_t * src = self._delegate 1050 cdef int x 1051 1052 # see http://effbot.org/zone/python-hash.htm 1053 cdef uint8_t * c = <uint8_t *>&src.core 1054 cdef uint32_t hash_value = c[0] 1055 for x from 1 <= x < sizeof(bam1_core_t): 1056 hash_value = c_mul(hash_value, 1000003) ^ c[x] 1057 c = <uint8_t *>src.data 1058 for x from 0 <= x < src.l_data: 1059 hash_value = c_mul(hash_value, 1000003) ^ c[x] 1060 1061 return hash_value 1062 1063 cpdef to_string(self): 1064 """returns a string representation of the aligned segment. 1065 1066 The output format is valid SAM format if a header is associated 1067 with the AlignedSegment. 1068 """ 1069 cdef kstring_t line 1070 line.l = line.m = 0 1071 line.s = NULL 1072 1073 if self.header: 1074 if sam_format1(self.header.ptr, self._delegate, &line) < 0: 1075 if line.m: 1076 free(line.s) 1077 raise ValueError('sam_format failed') 1078 else: 1079 raise NotImplementedError("todo") 1080 1081 ret = force_str(line.s[:line.l]) 1082 1083 if line.m: 1084 free(line.s) 1085 1086 return ret 1087 1088 @classmethod 1089 def fromstring(cls, sam, AlignmentHeader header): 1090 """parses a string representation of the aligned segment. 1091 1092 The input format should be valid SAM format. 1093 1094 Parameters 1095 ---------- 1096 sam: 1097 :term:`SAM` formatted string 1098 1099 """ 1100 cdef AlignedSegment dest = cls.__new__(cls) 1101 dest._delegate = <bam1_t*>calloc(1, sizeof(bam1_t)) 1102 dest.header = header 1103 1104 cdef kstring_t line 1105 line.l = line.m = len(sam) 1106 _sam = force_bytes(sam) 1107 line.s = _sam 1108 1109 sam_parse1(&line, dest.header.ptr, dest._delegate) 1110 1111 return dest 1112 1113 cpdef tostring(self, htsfile=None): 1114 """deprecated, use :meth:`to_string()` instead. 1115 1116 Parameters 1117 ---------- 1118 1119 htsfile: 1120 (deprecated) AlignmentFile object to map numerical 1121 identifiers to chromosome names. This parameter is present 1122 for backwards compatibility and ignored. 1123 """ 1124 1125 return self.to_string() 1126 1127 def to_dict(self): 1128 """returns a json representation of the aligned segment. 1129 1130 Field names are abbreviated versions of the class attributes. 1131 """ 1132 # let htslib do the string conversions, but treat optional field properly as list 1133 vals = self.to_string().split("\t") 1134 n = len(KEY_NAMES) - 1 1135 return dict(list(zip(KEY_NAMES[:-1], vals[:n])) + [(KEY_NAMES[-1], vals[n:])]) 1136 1137 @classmethod 1138 def from_dict(cls, sam_dict, AlignmentHeader header): 1139 """parses a dictionary representation of the aligned segment. 1140 1141 Parameters 1142 ---------- 1143 sam_dict: 1144 dictionary of alignment values, keys corresponding to output from 1145 :meth:`todict()`. 1146 1147 """ 1148 # let htslib do the parsing 1149 # the tags field can be missing 1150 return cls.fromstring( 1151 "\t".join((sam_dict[x] for x in KEY_NAMES[:-1])) + 1152 "\t" + 1153 "\t".join(sam_dict.get(KEY_NAMES[-1], [])), header) 1154 1155 ######################################################## 1156 ## Basic attributes in order of appearance in SAM format 1157 property query_name: 1158 """the query template name (None if not present)""" 1159 def __get__(self): 1160 1161 cdef bam1_t * src = self._delegate 1162 if src.core.l_qname == 0: 1163 return None 1164 1165 return charptr_to_str(<char *>pysam_bam_get_qname(src)) 1166 1167 def __set__(self, qname): 1168 1169 if qname is None or len(qname) == 0: 1170 return 1171 1172 if len(qname) > 254: 1173 raise ValueError("query length out of range {} > 254".format( 1174 len(qname))) 1175 1176 qname = force_bytes(qname) 1177 cdef bam1_t * src = self._delegate 1178 # the qname is \0 terminated 1179 cdef uint8_t l = len(qname) + 1 1180 1181 cdef char * p = pysam_bam_get_qname(src) 1182 cdef uint8_t l_extranul = 0 1183 1184 if l % 4 != 0: 1185 l_extranul = 4 - l % 4 1186 1187 cdef bam1_t * retval = pysam_bam_update(src, 1188 src.core.l_qname, 1189 l + l_extranul, 1190 <uint8_t*>p) 1191 if retval == NULL: 1192 raise MemoryError("could not allocate memory") 1193 1194 src.core.l_extranul = l_extranul 1195 src.core.l_qname = l + l_extranul 1196 1197 # re-acquire pointer to location in memory 1198 # as it might have moved 1199 p = pysam_bam_get_qname(src) 1200 1201 strncpy(p, qname, l) 1202 # x might be > 255 1203 cdef uint16_t x = 0 1204 1205 for x from l <= x < l + l_extranul: 1206 p[x] = '\0' 1207 1208 property flag: 1209 """properties flag""" 1210 def __get__(self): 1211 return self._delegate.core.flag 1212 def __set__(self, flag): 1213 self._delegate.core.flag = flag 1214 1215 property reference_name: 1216 """:term:`reference` name""" 1217 def __get__(self): 1218 if self._delegate.core.tid == -1: 1219 return None 1220 if self.header: 1221 return self.header.get_reference_name(self._delegate.core.tid) 1222 else: 1223 raise ValueError("reference_name unknown if no header associated with record") 1224 def __set__(self, reference): 1225 cdef int tid 1226 if reference is None or reference == "*": 1227 self._delegate.core.tid = -1 1228 elif self.header: 1229 tid = self.header.get_tid(reference) 1230 if tid < 0: 1231 raise ValueError("reference {} does not exist in header".format( 1232 reference)) 1233 self._delegate.core.tid = tid 1234 else: 1235 raise ValueError("reference_name can not be set if no header associated with record") 1236 1237 property reference_id: 1238 """:term:`reference` ID 1239 1240 .. note:: 1241 1242 This field contains the index of the reference sequence in 1243 the sequence dictionary. To obtain the name of the 1244 reference sequence, use :meth:`get_reference_name()` 1245 1246 """ 1247 def __get__(self): 1248 return self._delegate.core.tid 1249 def __set__(self, tid): 1250 if tid != -1 and self.header and not self.header.is_valid_tid(tid): 1251 raise ValueError("reference id {} does not exist in header".format( 1252 tid)) 1253 self._delegate.core.tid = tid 1254 1255 property reference_start: 1256 """0-based leftmost coordinate""" 1257 def __get__(self): 1258 return self._delegate.core.pos 1259 def __set__(self, pos): 1260 ## setting the position requires updating the "bin" attribute 1261 cdef bam1_t * src 1262 src = self._delegate 1263 src.core.pos = pos 1264 update_bin(src) 1265 1266 property mapping_quality: 1267 """mapping quality""" 1268 def __get__(self): 1269 return pysam_get_qual(self._delegate) 1270 def __set__(self, qual): 1271 pysam_set_qual(self._delegate, qual) 1272 1273 property cigarstring: 1274 '''the :term:`cigar` alignment as a string. 1275 1276 The cigar string is a string of alternating integers 1277 and characters denoting the length and the type of 1278 an operation. 1279 1280 .. note:: 1281 The order length,operation is specified in the 1282 SAM format. It is different from the order of 1283 the :attr:`cigar` property. 1284 1285 Returns None if not present. 1286 1287 To unset the cigarstring, assign None or the 1288 empty string. 1289 ''' 1290 def __get__(self): 1291 c = self.cigartuples 1292 if c is None: 1293 return None 1294 # reverse order 1295 else: 1296 return "".join([ "%i%c" % (y,CODE2CIGAR[x]) for x,y in c]) 1297 1298 def __set__(self, cigar): 1299 if cigar is None or len(cigar) == 0: 1300 self.cigartuples = [] 1301 else: 1302 parts = CIGAR_REGEX.findall(cigar) 1303 # reverse order 1304 self.cigartuples = [(CIGAR2CODE[ord(y)], int(x)) for x,y in parts] 1305 1306 # TODO 1307 # property cigar: 1308 # """the cigar alignment""" 1309 1310 property next_reference_id: 1311 """the :term:`reference` id of the mate/next read.""" 1312 def __get__(self): 1313 return self._delegate.core.mtid 1314 def __set__(self, mtid): 1315 if mtid != -1 and self.header and not self.header.is_valid_tid(mtid): 1316 raise ValueError("reference id {} does not exist in header".format( 1317 mtid)) 1318 self._delegate.core.mtid = mtid 1319 1320 property next_reference_name: 1321 """:term:`reference` name of the mate/next read (None if no 1322 AlignmentFile is associated)""" 1323 def __get__(self): 1324 if self._delegate.core.mtid == -1: 1325 return None 1326 if self.header: 1327 return self.header.get_reference_name(self._delegate.core.mtid) 1328 else: 1329 raise ValueError("next_reference_name unknown if no header associated with record") 1330 1331 def __set__(self, reference): 1332 cdef int mtid 1333 if reference is None or reference == "*": 1334 self._delegate.core.mtid = -1 1335 elif reference == "=": 1336 self._delegate.core.mtid = self._delegate.core.tid 1337 elif self.header: 1338 mtid = self.header.get_tid(reference) 1339 if mtid < 0: 1340 raise ValueError("reference {} does not exist in header".format( 1341 reference)) 1342 self._delegate.core.mtid = mtid 1343 else: 1344 raise ValueError("next_reference_name can not be set if no header associated with record") 1345 1346 property next_reference_start: 1347 """the position of the mate/next read.""" 1348 def __get__(self): 1349 return self._delegate.core.mpos 1350 def __set__(self, mpos): 1351 self._delegate.core.mpos = mpos 1352 1353 property query_length: 1354 """the length of the query/read. 1355 1356 This value corresponds to the length of the sequence supplied 1357 in the BAM/SAM file. The length of a query is 0 if there is no 1358 sequence in the BAM/SAM file. In those cases, the read length 1359 can be inferred from the CIGAR alignment, see 1360 :meth:`pysam.AlignedSegment.infer_query_length`. 1361 1362 The length includes soft-clipped bases and is equal to 1363 ``len(query_sequence)``. 1364 1365 This property is read-only but can be set by providing a 1366 sequence. 1367 1368 Returns 0 if not available. 1369 1370 """ 1371 def __get__(self): 1372 return self._delegate.core.l_qseq 1373 1374 property template_length: 1375 """the observed query template length""" 1376 def __get__(self): 1377 return self._delegate.core.isize 1378 def __set__(self, isize): 1379 self._delegate.core.isize = isize 1380 1381 property query_sequence: 1382 """read sequence bases, including :term:`soft clipped` bases 1383 (None if not present). 1384 1385 Note that assigning to seq will invalidate any quality scores. 1386 Thus, to in-place edit the sequence and quality scores, copies of 1387 the quality scores need to be taken. Consider trimming for example:: 1388 1389 q = read.query_qualities 1390 read.query_squence = read.query_sequence[5:10] 1391 read.query_qualities = q[5:10] 1392 1393 The sequence is returned as it is stored in the BAM file. (This will 1394 be the reverse complement of the original read sequence if the mapper 1395 has aligned the read to the reverse strand.) 1396 """ 1397 def __get__(self): 1398 if self.cache_query_sequence: 1399 return self.cache_query_sequence 1400 1401 cdef bam1_t * src 1402 cdef char * s 1403 src = self._delegate 1404 1405 if src.core.l_qseq == 0: 1406 return None 1407 1408 self.cache_query_sequence = force_str(getSequenceInRange( 1409 src, 0, src.core.l_qseq)) 1410 return self.cache_query_sequence 1411 1412 def __set__(self, seq): 1413 # samtools manages sequence and quality length memory together 1414 # if no quality information is present, the first byte says 0xff. 1415 cdef bam1_t * src 1416 cdef uint8_t * p 1417 cdef char * s 1418 cdef int l, k 1419 cdef Py_ssize_t nbytes_new, nbytes_old 1420 1421 if seq == None: 1422 l = 0 1423 else: 1424 l = len(seq) 1425 seq = force_bytes(seq) 1426 1427 src = self._delegate 1428 1429 # as the sequence is stored in half-bytes, the total length (sequence 1430 # plus quality scores) is (l+1)/2 + l 1431 nbytes_new = (l + 1) / 2 + l 1432 nbytes_old = (src.core.l_qseq + 1) / 2 + src.core.l_qseq 1433 1434 # acquire pointer to location in memory 1435 p = pysam_bam_get_seq(src) 1436 src.core.l_qseq = l 1437 1438 # change length of data field 1439 cdef bam1_t * retval = pysam_bam_update(src, 1440 nbytes_old, 1441 nbytes_new, 1442 p) 1443 1444 if retval == NULL: 1445 raise MemoryError("could not allocate memory") 1446 1447 if l > 0: 1448 # re-acquire pointer to location in memory 1449 # as it might have moved 1450 p = pysam_bam_get_seq(src) 1451 for k from 0 <= k < nbytes_new: 1452 p[k] = 0 1453 # convert to C string 1454 s = seq 1455 for k from 0 <= k < l: 1456 p[k/2] |= seq_nt16_table[<unsigned char>s[k]] << 4 * (1 - k % 2) 1457 1458 # erase qualities 1459 p = pysam_bam_get_qual(src) 1460 p[0] = 0xff 1461 1462 self.cache_query_sequence = force_str(seq) 1463 1464 # clear cached values for quality values 1465 self.cache_query_qualities = None 1466 self.cache_query_alignment_qualities = None 1467 1468 property query_qualities: 1469 """read sequence base qualities, including :term:`soft 1470 clipped` bases (None if not present). 1471 1472 Quality scores are returned as a python array of unsigned 1473 chars. Note that this is not the ASCII-encoded value typically 1474 seen in FASTQ or SAM formatted files. Thus, no offset of 33 1475 needs to be subtracted. 1476 1477 Note that to set quality scores the sequence has to be set 1478 beforehand as this will determine the expected length of the 1479 quality score array. 1480 1481 This method raises a ValueError if the length of the 1482 quality scores and the sequence are not the same. 1483 1484 """ 1485 def __get__(self): 1486 1487 if self.cache_query_qualities: 1488 return self.cache_query_qualities 1489 1490 cdef bam1_t * src 1491 cdef char * q 1492 1493 src = self._delegate 1494 1495 if src.core.l_qseq == 0: 1496 return None 1497 1498 self.cache_query_qualities = getQualitiesInRange(src, 0, src.core.l_qseq) 1499 return self.cache_query_qualities 1500 1501 def __set__(self, qual): 1502 1503 # note that memory is already allocated via setting the sequence 1504 # hence length match of sequence and quality needs is checked. 1505 cdef bam1_t * src 1506 cdef uint8_t * p 1507 cdef int l 1508 1509 src = self._delegate 1510 p = pysam_bam_get_qual(src) 1511 if qual is None or len(qual) == 0: 1512 # if absent and there is a sequence: set to 0xff 1513 if src.core.l_qseq != 0: 1514 p[0] = 0xff 1515 return 1516 1517 # check for length match 1518 l = len(qual) 1519 if src.core.l_qseq != l: 1520 raise ValueError( 1521 "quality and sequence mismatch: %i != %i" % 1522 (l, src.core.l_qseq)) 1523 1524 # create a python array object filling it 1525 # with the quality scores 1526 1527 # NB: should avoid this copying if qual is 1528 # already of the correct type. 1529 cdef c_array.array result = c_array.array('B', qual) 1530 1531 # copy data 1532 memcpy(p, result.data.as_voidptr, l) 1533 1534 # save in cache 1535 self.cache_query_qualities = qual 1536 1537 property bin: 1538 """properties bin""" 1539 def __get__(self): 1540 return self._delegate.core.bin 1541 def __set__(self, bin): 1542 self._delegate.core.bin = bin 1543 1544 1545 ########################################################## 1546 # Derived simple attributes. These are simple attributes of 1547 # AlignedSegment getting and setting values. 1548 ########################################################## 1549 # 1. Flags 1550 ########################################################## 1551 property is_paired: 1552 """true if read is paired in sequencing""" 1553 def __get__(self): 1554 return (self.flag & BAM_FPAIRED) != 0 1555 def __set__(self,val): 1556 pysam_update_flag(self._delegate, val, BAM_FPAIRED) 1557 1558 property is_proper_pair: 1559 """true if read is mapped in a proper pair""" 1560 def __get__(self): 1561 return (self.flag & BAM_FPROPER_PAIR) != 0 1562 def __set__(self,val): 1563 pysam_update_flag(self._delegate, val, BAM_FPROPER_PAIR) 1564 property is_unmapped: 1565 """true if read itself is unmapped""" 1566 def __get__(self): 1567 return (self.flag & BAM_FUNMAP) != 0 1568 def __set__(self, val): 1569 pysam_update_flag(self._delegate, val, BAM_FUNMAP) 1570 # setting the unmapped flag requires recalculation of 1571 # bin as alignment length is now implicitly 1 1572 update_bin(self._delegate) 1573 1574 property mate_is_unmapped: 1575 """true if the mate is unmapped""" 1576 def __get__(self): 1577 return (self.flag & BAM_FMUNMAP) != 0 1578 def __set__(self,val): 1579 pysam_update_flag(self._delegate, val, BAM_FMUNMAP) 1580 property is_reverse: 1581 """true if read is mapped to reverse strand""" 1582 def __get__(self): 1583 return (self.flag & BAM_FREVERSE) != 0 1584 def __set__(self,val): 1585 pysam_update_flag(self._delegate, val, BAM_FREVERSE) 1586 property mate_is_reverse: 1587 """true is read is mapped to reverse strand""" 1588 def __get__(self): 1589 return (self.flag & BAM_FMREVERSE) != 0 1590 def __set__(self,val): 1591 pysam_update_flag(self._delegate, val, BAM_FMREVERSE) 1592 property is_read1: 1593 """true if this is read1""" 1594 def __get__(self): 1595 return (self.flag & BAM_FREAD1) != 0 1596 def __set__(self,val): 1597 pysam_update_flag(self._delegate, val, BAM_FREAD1) 1598 property is_read2: 1599 """true if this is read2""" 1600 def __get__(self): 1601 return (self.flag & BAM_FREAD2) != 0 1602 def __set__(self, val): 1603 pysam_update_flag(self._delegate, val, BAM_FREAD2) 1604 property is_secondary: 1605 """true if not primary alignment""" 1606 def __get__(self): 1607 return (self.flag & BAM_FSECONDARY) != 0 1608 def __set__(self, val): 1609 pysam_update_flag(self._delegate, val, BAM_FSECONDARY) 1610 property is_qcfail: 1611 """true if QC failure""" 1612 def __get__(self): 1613 return (self.flag & BAM_FQCFAIL) != 0 1614 def __set__(self, val): 1615 pysam_update_flag(self._delegate, val, BAM_FQCFAIL) 1616 property is_duplicate: 1617 """true if optical or PCR duplicate""" 1618 def __get__(self): 1619 return (self.flag & BAM_FDUP) != 0 1620 def __set__(self, val): 1621 pysam_update_flag(self._delegate, val, BAM_FDUP) 1622 property is_supplementary: 1623 """true if this is a supplementary alignment""" 1624 def __get__(self): 1625 return (self.flag & BAM_FSUPPLEMENTARY) != 0 1626 def __set__(self, val): 1627 pysam_update_flag(self._delegate, val, BAM_FSUPPLEMENTARY) 1628 1629 # 2. Coordinates and lengths 1630 property reference_end: 1631 '''aligned reference position of the read on the reference genome. 1632 1633 reference_end points to one past the last aligned residue. 1634 Returns None if not available (read is unmapped or no cigar 1635 alignment present). 1636 1637 ''' 1638 def __get__(self): 1639 cdef bam1_t * src 1640 src = self._delegate 1641 if (self.flag & BAM_FUNMAP) or pysam_get_n_cigar(src) == 0: 1642 return None 1643 return bam_endpos(src) 1644 1645 property reference_length: 1646 '''aligned length of the read on the reference genome. 1647 1648 This is equal to `aend - pos`. Returns None if not available.''' 1649 def __get__(self): 1650 cdef bam1_t * src 1651 src = self._delegate 1652 if (self.flag & BAM_FUNMAP) or pysam_get_n_cigar(src) == 0: 1653 return None 1654 return bam_endpos(src) - \ 1655 self._delegate.core.pos 1656 1657 property query_alignment_sequence: 1658 """aligned portion of the read. 1659 1660 This is a substring of :attr:`seq` that excludes flanking 1661 bases that were :term:`soft clipped` (None if not present). It 1662 is equal to ``seq[qstart:qend]``. 1663 1664 SAM/BAM files may include extra flanking bases that are not 1665 part of the alignment. These bases may be the result of the 1666 Smith-Waterman or other algorithms, which may not require 1667 alignments that begin at the first residue or end at the last. 1668 In addition, extra sequencing adapters, multiplex identifiers, 1669 and low-quality bases that were not considered for alignment 1670 may have been retained. 1671 1672 """ 1673 1674 def __get__(self): 1675 if self.cache_query_alignment_sequence: 1676 return self.cache_query_alignment_sequence 1677 1678 cdef bam1_t * src 1679 cdef uint32_t start, end 1680 1681 src = self._delegate 1682 1683 if src.core.l_qseq == 0: 1684 return None 1685 1686 start = getQueryStart(src) 1687 end = getQueryEnd(src) 1688 1689 self.cache_query_alignment_sequence = force_str( 1690 getSequenceInRange(src, start, end)) 1691 return self.cache_query_alignment_sequence 1692 1693 property query_alignment_qualities: 1694 """aligned query sequence quality values (None if not present). These 1695 are the quality values that correspond to :attr:`query`, that 1696 is, they exclude qualities of :term:`soft clipped` bases. This 1697 is equal to ``qual[qstart:qend]``. 1698 1699 Quality scores are returned as a python array of unsigned 1700 chars. Note that this is not the ASCII-encoded value typically 1701 seen in FASTQ or SAM formatted files. Thus, no offset of 33 1702 needs to be subtracted. 1703 1704 This property is read-only. 1705 1706 """ 1707 def __get__(self): 1708 1709 if self.cache_query_alignment_qualities: 1710 return self.cache_query_alignment_qualities 1711 1712 cdef bam1_t * src 1713 cdef uint32_t start, end 1714 1715 src = self._delegate 1716 1717 if src.core.l_qseq == 0: 1718 return None 1719 1720 start = getQueryStart(src) 1721 end = getQueryEnd(src) 1722 self.cache_query_alignment_qualities = \ 1723 getQualitiesInRange(src, start, end) 1724 return self.cache_query_alignment_qualities 1725 1726 property query_alignment_start: 1727 """start index of the aligned query portion of the sequence (0-based, 1728 inclusive). 1729 1730 This the index of the first base in :attr:`seq` that is not 1731 soft-clipped. 1732 """ 1733 def __get__(self): 1734 return getQueryStart(self._delegate) 1735 1736 property query_alignment_end: 1737 """end index of the aligned query portion of the sequence (0-based, 1738 exclusive) 1739 1740 This the index just past the last base in :attr:`seq` that is not 1741 soft-clipped. 1742 """ 1743 def __get__(self): 1744 return getQueryEnd(self._delegate) 1745 1746 property query_alignment_length: 1747 """length of the aligned query sequence. 1748 1749 This is equal to :attr:`qend` - :attr:`qstart`""" 1750 def __get__(self): 1751 cdef bam1_t * src 1752 src = self._delegate 1753 return getQueryEnd(src) - getQueryStart(src) 1754 1755 ##################################################### 1756 # Computed properties 1757 1758 def get_reference_positions(self, full_length=False): 1759 """a list of reference positions that this read aligns to. 1760 1761 By default, this method only returns positions in the 1762 reference that are within the alignment. If *full_length* is 1763 set, None values will be included for any soft-clipped or 1764 unaligned positions within the read. The returned list will 1765 thus be of the same length as the read. 1766 1767 """ 1768 cdef uint32_t k, i, l, pos 1769 cdef int op 1770 cdef uint32_t * cigar_p 1771 cdef bam1_t * src 1772 cdef bint _full = full_length 1773 1774 src = self._delegate 1775 if pysam_get_n_cigar(src) == 0: 1776 return [] 1777 1778 result = [] 1779 pos = src.core.pos 1780 cigar_p = pysam_bam_get_cigar(src) 1781 1782 for k from 0 <= k < pysam_get_n_cigar(src): 1783 op = cigar_p[k] & BAM_CIGAR_MASK 1784 l = cigar_p[k] >> BAM_CIGAR_SHIFT 1785 1786 if op == BAM_CSOFT_CLIP or op == BAM_CINS: 1787 if _full: 1788 for i from 0 <= i < l: 1789 result.append(None) 1790 elif op == BAM_CMATCH or op == BAM_CEQUAL or op == BAM_CDIFF: 1791 for i from pos <= i < pos + l: 1792 result.append(i) 1793 pos += l 1794 elif op == BAM_CDEL or op == BAM_CREF_SKIP: 1795 pos += l 1796 1797 return result 1798 1799 def infer_query_length(self, always=False): 1800 """infer query length from CIGAR alignment. 1801 1802 This method deduces the query length from the CIGAR alignment 1803 but does not include hard-clipped bases. 1804 1805 Returns None if CIGAR alignment is not present. 1806 1807 If *always* is set to True, `infer_read_length` is used instead. 1808 This is deprecated and only present for backward compatibility. 1809 """ 1810 if always is True: 1811 return self.infer_read_length() 1812 cdef int32_t l = calculateQueryLengthWithoutHardClipping(self._delegate) 1813 if l > 0: 1814 return l 1815 else: 1816 return None 1817 1818 def infer_read_length(self): 1819 """infer read length from CIGAR alignment. 1820 1821 This method deduces the read length from the CIGAR alignment 1822 including hard-clipped bases. 1823 1824 Returns None if CIGAR alignment is not present. 1825 """ 1826 cdef int32_t l = calculateQueryLengthWithHardClipping(self._delegate) 1827 if l > 0: 1828 return l 1829 else: 1830 return None 1831 1832 def get_reference_sequence(self): 1833 """return the reference sequence in the region that is covered by the 1834 alignment of the read to the reference. 1835 1836 This method requires the MD tag to be set. 1837 1838 """ 1839 return force_str(build_reference_sequence(self._delegate)) 1840 1841 def get_forward_sequence(self): 1842 """return the original read sequence. 1843 1844 Reads mapped to the reverse strand are stored reverse complemented in 1845 the BAM file. This method returns such reads reverse complemented back 1846 to their original orientation. 1847 1848 Returns None if the record has no query sequence. 1849 """ 1850 if self.query_sequence is None: 1851 return None 1852 s = force_str(self.query_sequence) 1853 if self.is_reverse: 1854 s = s.translate(maketrans("ACGTacgtNnXx", "TGCAtgcaNnXx"))[::-1] 1855 return s 1856 1857 def get_forward_qualities(self): 1858 """return the original base qualities of the read sequence, 1859 in the same format as the :attr:`query_qualities` property. 1860 1861 Reads mapped to the reverse strand have their base qualities stored 1862 reversed in the BAM file. This method returns such reads' base qualities 1863 reversed back to their original orientation. 1864 """ 1865 if self.is_reverse: 1866 return self.query_qualities[::-1] 1867 else: 1868 return self.query_qualities 1869 1870 1871 def get_aligned_pairs(self, matches_only=False, with_seq=False): 1872 """a list of aligned read (query) and reference positions. 1873 1874 For inserts, deletions, skipping either query or reference 1875 position may be None. 1876 1877 Padding is currently not supported and leads to an exception. 1878 1879 Parameters 1880 ---------- 1881 1882 matches_only : bool 1883 If True, only matched bases are returned - no None on either 1884 side. 1885 with_seq : bool 1886 If True, return a third element in the tuple containing the 1887 reference sequence. Substitutions are lower-case. This option 1888 requires an MD tag to be present. 1889 1890 Returns 1891 ------- 1892 1893 aligned_pairs : list of tuples 1894 1895 """ 1896 cdef uint32_t k, i, pos, qpos, r_idx, l 1897 cdef int op 1898 cdef uint32_t * cigar_p 1899 cdef bam1_t * src = self._delegate 1900 cdef bint _matches_only = bool(matches_only) 1901 cdef bint _with_seq = bool(with_seq) 1902 1903 # TODO: this method performs no checking and assumes that 1904 # read sequence, cigar and MD tag are consistent. 1905 1906 if _with_seq: 1907 # force_str required for py2/py3 compatibility 1908 ref_seq = force_str(build_reference_sequence(src)) 1909 if ref_seq is None: 1910 raise ValueError("MD tag not present") 1911 1912 r_idx = 0 1913 1914 if pysam_get_n_cigar(src) == 0: 1915 return [] 1916 1917 result = [] 1918 pos = src.core.pos 1919 qpos = 0 1920 cigar_p = pysam_bam_get_cigar(src) 1921 for k from 0 <= k < pysam_get_n_cigar(src): 1922 op = cigar_p[k] & BAM_CIGAR_MASK 1923 l = cigar_p[k] >> BAM_CIGAR_SHIFT 1924 1925 if op == BAM_CMATCH or op == BAM_CEQUAL or op == BAM_CDIFF: 1926 if _with_seq: 1927 for i from pos <= i < pos + l: 1928 result.append((qpos, i, ref_seq[r_idx])) 1929 r_idx += 1 1930 qpos += 1 1931 else: 1932 for i from pos <= i < pos + l: 1933 result.append((qpos, i)) 1934 qpos += 1 1935 pos += l 1936 1937 elif op == BAM_CINS or op == BAM_CSOFT_CLIP: 1938 if not _matches_only: 1939 if _with_seq: 1940 for i from pos <= i < pos + l: 1941 result.append((qpos, None, None)) 1942 qpos += 1 1943 else: 1944 for i from pos <= i < pos + l: 1945 result.append((qpos, None)) 1946 qpos += 1 1947 else: 1948 qpos += l 1949 1950 elif op == BAM_CDEL: 1951 if not _matches_only: 1952 if _with_seq: 1953 for i from pos <= i < pos + l: 1954 result.append((None, i, ref_seq[r_idx])) 1955 r_idx += 1 1956 else: 1957 for i from pos <= i < pos + l: 1958 result.append((None, i)) 1959 else: 1960 r_idx += l 1961 pos += l 1962 1963 elif op == BAM_CHARD_CLIP: 1964 pass # advances neither 1965 1966 elif op == BAM_CREF_SKIP: 1967 if not _matches_only: 1968 if _with_seq: 1969 for i from pos <= i < pos + l: 1970 result.append((None, i, None)) 1971 else: 1972 for i from pos <= i < pos + l: 1973 result.append((None, i)) 1974 1975 pos += l 1976 1977 elif op == BAM_CPAD: 1978 raise NotImplementedError( 1979 "Padding (BAM_CPAD, 6) is currently not supported. " 1980 "Please implement. Sorry about that.") 1981 1982 return result 1983 1984 def get_blocks(self): 1985 """ a list of start and end positions of 1986 aligned gapless blocks. 1987 1988 The start and end positions are in genomic 1989 coordinates. 1990 1991 Blocks are not normalized, i.e. two blocks 1992 might be directly adjacent. This happens if 1993 the two blocks are separated by an insertion 1994 in the read. 1995 """ 1996 1997 cdef uint32_t k, pos, l 1998 cdef int op 1999 cdef uint32_t * cigar_p 2000 cdef bam1_t * src 2001 2002 src = self._delegate 2003 if pysam_get_n_cigar(src) == 0: 2004 return [] 2005 2006 result = [] 2007 pos = src.core.pos 2008 cigar_p = pysam_bam_get_cigar(src) 2009 l = 0 2010 2011 for k from 0 <= k < pysam_get_n_cigar(src): 2012 op = cigar_p[k] & BAM_CIGAR_MASK 2013 l = cigar_p[k] >> BAM_CIGAR_SHIFT 2014 if op == BAM_CMATCH or op == BAM_CEQUAL or op == BAM_CDIFF: 2015 result.append((pos, pos + l)) 2016 pos += l 2017 elif op == BAM_CDEL or op == BAM_CREF_SKIP: 2018 pos += l 2019 2020 return result 2021 2022 def get_overlap(self, uint32_t start, uint32_t end): 2023 """return number of aligned bases of read overlapping the interval 2024 *start* and *end* on the reference sequence. 2025 2026 Return None if cigar alignment is not available. 2027 """ 2028 cdef uint32_t k, i, pos, overlap 2029 cdef int op, o 2030 cdef uint32_t * cigar_p 2031 cdef bam1_t * src 2032 2033 overlap = 0 2034 2035 src = self._delegate 2036 if pysam_get_n_cigar(src) == 0: 2037 return None 2038 pos = src.core.pos 2039 o = 0 2040 2041 cigar_p = pysam_bam_get_cigar(src) 2042 for k from 0 <= k < pysam_get_n_cigar(src): 2043 op = cigar_p[k] & BAM_CIGAR_MASK 2044 l = cigar_p[k] >> BAM_CIGAR_SHIFT 2045 2046 if op == BAM_CMATCH or op == BAM_CEQUAL or op == BAM_CDIFF: 2047 o = min( pos + l, end) - max( pos, start ) 2048 if o > 0: overlap += o 2049 2050 if op == BAM_CMATCH or op == BAM_CDEL or op == BAM_CREF_SKIP or op == BAM_CEQUAL or op == BAM_CDIFF: 2051 pos += l 2052 2053 return overlap 2054 2055 def get_cigar_stats(self): 2056 """summary of operations in cigar string. 2057 2058 The output order in the array is "MIDNSHP=X" followed by a 2059 field for the NM tag. If the NM tag is not present, this 2060 field will always be 0. 2061 2062 +-----+--------------+-----+ 2063 |M |BAM_CMATCH |0 | 2064 +-----+--------------+-----+ 2065 |I |BAM_CINS |1 | 2066 +-----+--------------+-----+ 2067 |D |BAM_CDEL |2 | 2068 +-----+--------------+-----+ 2069 |N |BAM_CREF_SKIP |3 | 2070 +-----+--------------+-----+ 2071 |S |BAM_CSOFT_CLIP|4 | 2072 +-----+--------------+-----+ 2073 |H |BAM_CHARD_CLIP|5 | 2074 +-----+--------------+-----+ 2075 |P |BAM_CPAD |6 | 2076 +-----+--------------+-----+ 2077 |= |BAM_CEQUAL |7 | 2078 +-----+--------------+-----+ 2079 |X |BAM_CDIFF |8 | 2080 +-----+--------------+-----+ 2081 |B |BAM_CBACK |9 | 2082 +-----+--------------+-----+ 2083 |NM |NM tag |10 | 2084 +-----+--------------+-----+ 2085 2086 If no cigar string is present, empty arrays will be returned. 2087 2088 Returns: 2089 arrays : 2090 two arrays. The first contains the nucleotide counts within 2091 each cigar operation, the second contains the number of blocks 2092 for each cigar operation. 2093 2094 """ 2095 2096 cdef int nfields = NCIGAR_CODES + 1 2097 2098 cdef c_array.array base_counts = array.array( 2099 "I", 2100 [0] * nfields) 2101 cdef uint32_t [:] base_view = base_counts 2102 cdef c_array.array block_counts = array.array( 2103 "I", 2104 [0] * nfields) 2105 cdef uint32_t [:] block_view = block_counts 2106 2107 cdef bam1_t * src = self._delegate 2108 cdef int op 2109 cdef uint32_t l 2110 cdef int32_t k 2111 cdef uint32_t * cigar_p = pysam_bam_get_cigar(src) 2112 2113 if cigar_p == NULL: 2114 return None 2115 2116 for k from 0 <= k < pysam_get_n_cigar(src): 2117 op = cigar_p[k] & BAM_CIGAR_MASK 2118 l = cigar_p[k] >> BAM_CIGAR_SHIFT 2119 base_view[op] += l 2120 block_view[op] += 1 2121 2122 cdef uint8_t * v = bam_aux_get(src, 'NM') 2123 if v != NULL: 2124 base_view[nfields - 1] = <int32_t>bam_aux2i(v) 2125 2126 return base_counts, block_counts 2127 2128 ##################################################### 2129 ## Unsorted as yet 2130 # TODO: capture in CIGAR object 2131 property cigartuples: 2132 """the :term:`cigar` alignment. The alignment 2133 is returned as a list of tuples of (operation, length). 2134 2135 If the alignment is not present, None is returned. 2136 2137 The operations are: 2138 2139 +-----+--------------+-----+ 2140 |M |BAM_CMATCH |0 | 2141 +-----+--------------+-----+ 2142 |I |BAM_CINS |1 | 2143 +-----+--------------+-----+ 2144 |D |BAM_CDEL |2 | 2145 +-----+--------------+-----+ 2146 |N |BAM_CREF_SKIP |3 | 2147 +-----+--------------+-----+ 2148 |S |BAM_CSOFT_CLIP|4 | 2149 +-----+--------------+-----+ 2150 |H |BAM_CHARD_CLIP|5 | 2151 +-----+--------------+-----+ 2152 |P |BAM_CPAD |6 | 2153 +-----+--------------+-----+ 2154 |= |BAM_CEQUAL |7 | 2155 +-----+--------------+-----+ 2156 |X |BAM_CDIFF |8 | 2157 +-----+--------------+-----+ 2158 |B |BAM_CBACK |9 | 2159 +-----+--------------+-----+ 2160 2161 .. note:: 2162 The output is a list of (operation, length) tuples, such as 2163 ``[(0, 30)]``. 2164 This is different from the SAM specification and 2165 the :attr:`cigarstring` property, which uses a 2166 (length, operation) order, for example: ``30M``. 2167 2168 To unset the cigar property, assign an empty list 2169 or None. 2170 """ 2171 def __get__(self): 2172 cdef uint32_t * cigar_p 2173 cdef bam1_t * src 2174 cdef uint32_t op, l 2175 cdef uint32_t k 2176 2177 src = self._delegate 2178 if pysam_get_n_cigar(src) == 0: 2179 return None 2180 2181 cigar = [] 2182 2183 cigar_p = pysam_bam_get_cigar(src); 2184 for k from 0 <= k < pysam_get_n_cigar(src): 2185 op = cigar_p[k] & BAM_CIGAR_MASK 2186 l = cigar_p[k] >> BAM_CIGAR_SHIFT 2187 cigar.append((op, l)) 2188 return cigar 2189 2190 def __set__(self, values): 2191 cdef uint32_t * p 2192 cdef bam1_t * src 2193 cdef op, l 2194 cdef int k 2195 2196 k = 0 2197 2198 src = self._delegate 2199 2200 # get location of cigar string 2201 p = pysam_bam_get_cigar(src) 2202 2203 # empty values for cigar string 2204 if values is None: 2205 values = [] 2206 2207 cdef uint32_t ncigar = len(values) 2208 2209 cdef bam1_t * retval = pysam_bam_update(src, 2210 pysam_get_n_cigar(src) * 4, 2211 ncigar * 4, 2212 <uint8_t*>p) 2213 2214 if retval == NULL: 2215 raise MemoryError("could not allocate memory") 2216 2217 # length is number of cigar operations, not bytes 2218 pysam_set_n_cigar(src, ncigar) 2219 2220 # re-acquire pointer to location in memory 2221 # as it might have moved 2222 p = pysam_bam_get_cigar(src) 2223 2224 # insert cigar operations 2225 for op, l in values: 2226 p[k] = l << BAM_CIGAR_SHIFT | op 2227 k += 1 2228 2229 ## setting the cigar string requires updating the bin 2230 update_bin(src) 2231 2232 cpdef set_tag(self, 2233 tag, 2234 value, 2235 value_type=None, 2236 replace=True): 2237 """sets a particular field *tag* to *value* in the optional alignment 2238 section. 2239 2240 *value_type* describes the type of *value* that is to entered 2241 into the alignment record. It can be set explicitly to one of 2242 the valid one-letter type codes. If unset, an appropriate type 2243 will be chosen automatically based on the python type of 2244 *value*. 2245 2246 An existing value of the same *tag* will be overwritten unless 2247 *replace* is set to False. This is usually not recommended as a 2248 tag may only appear once in the optional alignment section. 2249 2250 If *value* is None, the tag will be deleted. 2251 2252 This method accepts valid SAM specification value types, which 2253 are:: 2254 2255 A: printable char 2256 i: signed int 2257 f: float 2258 Z: printable string 2259 H: Byte array in hex format 2260 B: Integer or numeric array 2261 2262 Additionally, it will accept the integer BAM types ('cCsSI') 2263 2264 For htslib compatibility, 'a' is synonymous with 'A' and the 2265 method accepts a 'd' type code for a double precision float. 2266 2267 When deducing the type code by the python type of *value*, the 2268 following mapping is applied:: 2269 2270 i: python int 2271 f: python float 2272 Z: python str or bytes 2273 B: python array.array, list or tuple 2274 2275 Note that a single character string will be output as 'Z' and 2276 not 'A' as the former is the more general type. 2277 """ 2278 2279 cdef int value_size 2280 cdef uint8_t tc 2281 cdef uint8_t * value_ptr 2282 cdef uint8_t *existing_ptr 2283 cdef float float_value 2284 cdef double double_value 2285 cdef int32_t int32_t_value 2286 cdef uint32_t uint32_t_value 2287 cdef int16_t int16_t_value 2288 cdef uint16_t uint16_t_value 2289 cdef int8_t int8_t_value 2290 cdef uint8_t uint8_t_value 2291 cdef bam1_t * src = self._delegate 2292 cdef char * _value_type 2293 cdef c_array.array array_value 2294 cdef object buffer 2295 2296 if len(tag) != 2: 2297 raise ValueError('Invalid tag: %s' % tag) 2298 2299 tag = force_bytes(tag) 2300 if replace: 2301 existing_ptr = bam_aux_get(src, tag) 2302 if existing_ptr: 2303 bam_aux_del(src, existing_ptr) 2304 2305 # setting value to None deletes a tag 2306 if value is None: 2307 return 2308 2309 cdef uint8_t typecode = get_tag_typecode(value, value_type) 2310 if typecode == 0: 2311 raise ValueError("can't guess type or invalid type code specified: {} {}".format( 2312 value, value_type)) 2313 2314 # sam_format1 for typecasting 2315 if typecode == 'Z': 2316 value = force_bytes(value) 2317 value_ptr = <uint8_t*><char*>value 2318 value_size = len(value)+1 2319 elif typecode == 'H': 2320 # Note that hex tags are stored the very same 2321 # way as Z string.s 2322 value = force_bytes(value) 2323 value_ptr = <uint8_t*><char*>value 2324 value_size = len(value)+1 2325 elif typecode == 'A' or typecode == 'a': 2326 value = force_bytes(value) 2327 value_ptr = <uint8_t*><char*>value 2328 value_size = sizeof(char) 2329 typecode = 'A' 2330 elif typecode == 'i': 2331 int32_t_value = value 2332 value_ptr = <uint8_t*>&int32_t_value 2333 value_size = sizeof(int32_t) 2334 elif typecode == 'I': 2335 uint32_t_value = value 2336 value_ptr = <uint8_t*>&uint32_t_value 2337 value_size = sizeof(uint32_t) 2338 elif typecode == 's': 2339 int16_t_value = value 2340 value_ptr = <uint8_t*>&int16_t_value 2341 value_size = sizeof(int16_t) 2342 elif typecode == 'S': 2343 uint16_t_value = value 2344 value_ptr = <uint8_t*>&uint16_t_value 2345 value_size = sizeof(uint16_t) 2346 elif typecode == 'c': 2347 int8_t_value = value 2348 value_ptr = <uint8_t*>&int8_t_value 2349 value_size = sizeof(int8_t) 2350 elif typecode == 'C': 2351 uint8_t_value = value 2352 value_ptr = <uint8_t*>&uint8_t_value 2353 value_size = sizeof(uint8_t) 2354 elif typecode == 'd': 2355 double_value = value 2356 value_ptr = <uint8_t*>&double_value 2357 value_size = sizeof(double) 2358 elif typecode == 'f': 2359 float_value = value 2360 value_ptr = <uint8_t*>&float_value 2361 value_size = sizeof(float) 2362 elif typecode == 'B': 2363 # the following goes through python, needs to be cleaned up 2364 # pack array using struct 2365 fmt, args = pack_tags([(tag, value, value_type)]) 2366 2367 # remove tag and type code as set by bam_aux_append 2368 # first four chars of format (<2sB) 2369 fmt = '<' + fmt[4:] 2370 # first two values to pack 2371 args = args[2:] 2372 value_size = struct.calcsize(fmt) 2373 # buffer will be freed when object goes out of scope 2374 buffer = ctypes.create_string_buffer(value_size) 2375 struct.pack_into(fmt, buffer, 0, *args) 2376 # bam_aux_append copies data from value_ptr 2377 bam_aux_append(src, 2378 tag, 2379 typecode, 2380 value_size, 2381 <uint8_t*>buffer.raw) 2382 return 2383 else: 2384 raise ValueError('unsupported value_type {} in set_option'.format(typecode)) 2385 2386 bam_aux_append(src, 2387 tag, 2388 typecode, 2389 value_size, 2390 value_ptr) 2391 2392 cpdef has_tag(self, tag): 2393 """returns true if the optional alignment section 2394 contains a given *tag*.""" 2395 cdef uint8_t * v 2396 cdef int nvalues 2397 btag = force_bytes(tag) 2398 v = bam_aux_get(self._delegate, btag) 2399 return v != NULL 2400 2401 cpdef get_tag(self, tag, with_value_type=False): 2402 """ 2403 retrieves data from the optional alignment section 2404 given a two-letter *tag* denoting the field. 2405 2406 The returned value is cast into an appropriate python type. 2407 2408 This method is the fastest way to access the optional 2409 alignment section if only few tags need to be retrieved. 2410 2411 Possible value types are "AcCsSiIfZHB" (see BAM format 2412 specification) as well as additional value type 'd' as 2413 implemented in htslib. 2414 2415 Parameters: 2416 2417 tag : 2418 data tag. 2419 2420 with_value_type : Optional[bool] 2421 if set to True, the return value is a tuple of (tag value, type 2422 code). (default False) 2423 2424 Returns: 2425 2426 A python object with the value of the `tag`. The type of the 2427 object depends on the data type in the data record. 2428 2429 Raises: 2430 2431 KeyError 2432 If `tag` is not present, a KeyError is raised. 2433 2434 """ 2435 cdef uint8_t * v 2436 cdef int nvalues 2437 btag = force_bytes(tag) 2438 v = bam_aux_get(self._delegate, btag) 2439 if v == NULL: 2440 raise KeyError("tag '%s' not present" % tag) 2441 if chr(v[0]) == "B": 2442 auxtype = chr(v[0]) + chr(v[1]) 2443 else: 2444 auxtype = chr(v[0]) 2445 2446 if auxtype in "iIcCsS": 2447 value = bam_aux2i(v) 2448 elif auxtype == 'f' or auxtype == 'F': 2449 value = bam_aux2f(v) 2450 elif auxtype == 'd' or auxtype == 'D': 2451 value = bam_aux2f(v) 2452 elif auxtype == 'A' or auxtype == 'a': 2453 # force A to a 2454 v[0] = 'A' 2455 # there might a more efficient way 2456 # to convert a char into a string 2457 value = '%c' % <char>bam_aux2A(v) 2458 elif auxtype == 'Z' or auxtype == 'H': 2459 # Z and H are treated equally as strings in htslib 2460 value = charptr_to_str(<char*>bam_aux2Z(v)) 2461 elif auxtype[0] == 'B': 2462 bytesize, nvalues, values = convert_binary_tag(v + 1) 2463 value = values 2464 else: 2465 raise ValueError("unknown auxiliary type '%s'" % auxtype) 2466 2467 if with_value_type: 2468 return (value, auxtype) 2469 else: 2470 return value 2471 2472 def get_tags(self, with_value_type=False): 2473 """the fields in the optional alignment section. 2474 2475 Returns a list of all fields in the optional 2476 alignment section. Values are converted to appropriate python 2477 values. For example: 2478 2479 [(NM, 2), (RG, "GJP00TM04")] 2480 2481 If *with_value_type* is set, the value type as encode in 2482 the AlignedSegment record will be returned as well: 2483 2484 [(NM, 2, "i"), (RG, "GJP00TM04", "Z")] 2485 2486 This method will convert all values in the optional alignment 2487 section. When getting only one or few tags, please see 2488 :meth:`get_tag` for a quicker way to achieve this. 2489 2490 """ 2491 2492 cdef char * ctag 2493 cdef bam1_t * src 2494 cdef uint8_t * s 2495 cdef char auxtag[3] 2496 cdef char auxtype 2497 cdef uint8_t byte_size 2498 cdef int32_t nvalues 2499 2500 src = self._delegate 2501 if src.l_data == 0: 2502 return [] 2503 s = pysam_bam_get_aux(src) 2504 result = [] 2505 auxtag[2] = 0 2506 while s < (src.data + src.l_data): 2507 # get tag 2508 auxtag[0] = s[0] 2509 auxtag[1] = s[1] 2510 s += 2 2511 auxtype = s[0] 2512 if auxtype in ('c', 'C'): 2513 value = <int>bam_aux2i(s) 2514 s += 1 2515 elif auxtype in ('s', 'S'): 2516 value = <int>bam_aux2i(s) 2517 s += 2 2518 elif auxtype in ('i', 'I'): 2519 value = <int32_t>bam_aux2i(s) 2520 s += 4 2521 elif auxtype == 'f': 2522 value = <float>bam_aux2f(s) 2523 s += 4 2524 elif auxtype == 'd': 2525 value = <double>bam_aux2f(s) 2526 s += 8 2527 elif auxtype in ('A', 'a'): 2528 value = "%c" % <char>bam_aux2A(s) 2529 s += 1 2530 elif auxtype in ('Z', 'H'): 2531 value = charptr_to_str(<char*>bam_aux2Z(s)) 2532 # +1 for NULL terminated string 2533 s += len(value) + 1 2534 elif auxtype == 'B': 2535 s += 1 2536 byte_size, nvalues, value = convert_binary_tag(s) 2537 # 5 for 1 char and 1 int 2538 s += 5 + (nvalues * byte_size) - 1 2539 else: 2540 raise KeyError("unknown type '%s'" % auxtype) 2541 2542 s += 1 2543 2544 if with_value_type: 2545 result.append((charptr_to_str(auxtag), value, chr(auxtype))) 2546 else: 2547 result.append((charptr_to_str(auxtag), value)) 2548 2549 return result 2550 2551 def set_tags(self, tags): 2552 """sets the fields in the optional alignment section with 2553 a list of (tag, value) tuples. 2554 2555 The :term:`value type` of the values is determined from the 2556 python type. Optionally, a type may be given explicitly as 2557 a third value in the tuple, For example: 2558 2559 x.set_tags([(NM, 2, "i"), (RG, "GJP00TM04", "Z")] 2560 2561 This method will not enforce the rule that the same tag may appear 2562 only once in the optional alignment section. 2563 """ 2564 2565 cdef bam1_t * src 2566 cdef uint8_t * s 2567 cdef char * temp 2568 cdef int new_size = 0 2569 cdef int old_size 2570 src = self._delegate 2571 2572 # convert and pack the data 2573 if tags is not None and len(tags) > 0: 2574 fmt, args = pack_tags(tags) 2575 new_size = struct.calcsize(fmt) 2576 buffer = ctypes.create_string_buffer(new_size) 2577 struct.pack_into(fmt, 2578 buffer, 2579 0, 2580 *args) 2581 2582 2583 # delete the old data and allocate new space. 2584 # If total_size == 0, the aux field will be 2585 # empty 2586 old_size = pysam_bam_get_l_aux(src) 2587 cdef bam1_t * retval = pysam_bam_update(src, 2588 old_size, 2589 new_size, 2590 pysam_bam_get_aux(src)) 2591 if retval == NULL: 2592 raise MemoryError("could not allocated memory") 2593 2594 # copy data only if there is any 2595 if new_size > 0: 2596 2597 # get location of new data 2598 s = pysam_bam_get_aux(src) 2599 2600 # check if there is direct path from buffer.raw to tmp 2601 p = buffer.raw 2602 # create handle to make sure buffer stays alive long 2603 # enough for memcpy, see issue 129 2604 temp = p 2605 memcpy(s, temp, new_size) 2606 2607 2608 ######################################################## 2609 # Compatibility Accessors 2610 # Functions, properties for compatibility with pysam < 0.8 2611 # 2612 # Several options 2613 # change the factory functions according to API 2614 # * requires code changes throughout, incl passing 2615 # handles to factory functions 2616 # subclass functions and add attributes at runtime 2617 # e.g.: AlignedSegments.qname = AlignedSegments.query_name 2618 # * will slow down the default interface 2619 # explicit declaration of getters/setters 2620 ######################################################## 2621 property qname: 2622 """deprecated, use query_name instead""" 2623 def __get__(self): return self.query_name 2624 def __set__(self, v): self.query_name = v 2625 property tid: 2626 """deprecated, use reference_id instead""" 2627 def __get__(self): return self.reference_id 2628 def __set__(self, v): self.reference_id = v 2629 property pos: 2630 """deprecated, use reference_start instead""" 2631 def __get__(self): return self.reference_start 2632 def __set__(self, v): self.reference_start = v 2633 property mapq: 2634 """deprecated, use mapping_quality instead""" 2635 def __get__(self): return self.mapping_quality 2636 def __set__(self, v): self.mapping_quality = v 2637 property rnext: 2638 """deprecated, use next_reference_id instead""" 2639 def __get__(self): return self.next_reference_id 2640 def __set__(self, v): self.next_reference_id = v 2641 property pnext: 2642 """deprecated, use next_reference_start instead""" 2643 def __get__(self): 2644 return self.next_reference_start 2645 def __set__(self, v): 2646 self.next_reference_start = v 2647 property cigar: 2648 """deprecated, use cigartuples instead""" 2649 def __get__(self): 2650 r = self.cigartuples 2651 if r is None: 2652 r = [] 2653 return r 2654 def __set__(self, v): self.cigartuples = v 2655 property tlen: 2656 """deprecated, use template_length instead""" 2657 def __get__(self): 2658 return self.template_length 2659 def __set__(self, v): 2660 self.template_length = v 2661 property seq: 2662 """deprecated, use query_sequence instead""" 2663 def __get__(self): 2664 return self.query_sequence 2665 def __set__(self, v): 2666 self.query_sequence = v 2667 property qual: 2668 """deprecated, query_qualities instead""" 2669 def __get__(self): 2670 return array_to_qualitystring(self.query_qualities) 2671 def __set__(self, v): 2672 self.query_qualities = qualitystring_to_array(v) 2673 property alen: 2674 """deprecated, reference_length instead""" 2675 def __get__(self): 2676 return self.reference_length 2677 def __set__(self, v): 2678 self.reference_length = v 2679 property aend: 2680 """deprecated, reference_end instead""" 2681 def __get__(self): 2682 return self.reference_end 2683 def __set__(self, v): 2684 self.reference_end = v 2685 property rlen: 2686 """deprecated, query_length instead""" 2687 def __get__(self): 2688 return self.query_length 2689 def __set__(self, v): 2690 self.query_length = v 2691 property query: 2692 """deprecated, query_alignment_sequence instead""" 2693 def __get__(self): 2694 return self.query_alignment_sequence 2695 def __set__(self, v): 2696 self.query_alignment_sequence = v 2697 property qqual: 2698 """deprecated, query_alignment_qualities instead""" 2699 def __get__(self): 2700 return array_to_qualitystring(self.query_alignment_qualities) 2701 def __set__(self, v): 2702 self.query_alignment_qualities = qualitystring_to_array(v) 2703 property qstart: 2704 """deprecated, use query_alignment_start instead""" 2705 def __get__(self): 2706 return self.query_alignment_start 2707 def __set__(self, v): 2708 self.query_alignment_start = v 2709 property qend: 2710 """deprecated, use query_alignment_end instead""" 2711 def __get__(self): 2712 return self.query_alignment_end 2713 def __set__(self, v): 2714 self.query_alignment_end = v 2715 property qlen: 2716 """deprecated, use query_alignment_length instead""" 2717 def __get__(self): 2718 return self.query_alignment_length 2719 def __set__(self, v): 2720 self.query_alignment_length = v 2721 property mrnm: 2722 """deprecated, use next_reference_id instead""" 2723 def __get__(self): 2724 return self.next_reference_id 2725 def __set__(self, v): 2726 self.next_reference_id = v 2727 property mpos: 2728 """deprecated, use next_reference_start instead""" 2729 def __get__(self): 2730 return self.next_reference_start 2731 def __set__(self, v): 2732 self.next_reference_start = v 2733 property rname: 2734 """deprecated, use reference_id instead""" 2735 def __get__(self): 2736 return self.reference_id 2737 def __set__(self, v): 2738 self.reference_id = v 2739 property isize: 2740 """deprecated, use template_length instead""" 2741 def __get__(self): 2742 return self.template_length 2743 def __set__(self, v): 2744 self.template_length = v 2745 property blocks: 2746 """deprecated, use get_blocks() instead""" 2747 def __get__(self): 2748 return self.get_blocks() 2749 property aligned_pairs: 2750 """deprecated, use get_aligned_pairs() instead""" 2751 def __get__(self): 2752 return self.get_aligned_pairs() 2753 property inferred_length: 2754 """deprecated, use infer_query_length() instead""" 2755 def __get__(self): 2756 return self.infer_query_length() 2757 property positions: 2758 """deprecated, use get_reference_positions() instead""" 2759 def __get__(self): 2760 return self.get_reference_positions() 2761 property tags: 2762 """deprecated, use get_tags() instead""" 2763 def __get__(self): 2764 return self.get_tags() 2765 def __set__(self, tags): 2766 self.set_tags(tags) 2767 def overlap(self): 2768 """deprecated, use get_overlap() instead""" 2769 return self.get_overlap() 2770 def opt(self, tag): 2771 """deprecated, use get_tag() instead""" 2772 return self.get_tag(tag) 2773 def setTag(self, tag, value, value_type=None, replace=True): 2774 """deprecated, use set_tag() instead""" 2775 return self.set_tag(tag, value, value_type, replace) 2776 2777 2778cdef class PileupColumn: 2779 '''A pileup of reads at a particular reference sequence position 2780 (:term:`column`). A pileup column contains all the reads that map 2781 to a certain target base. 2782 2783 This class is a proxy for results returned by the samtools pileup 2784 engine. If the underlying engine iterator advances, the results 2785 of this column will change. 2786 ''' 2787 def __init__(self): 2788 raise TypeError("this class cannot be instantiated from Python") 2789 2790 def __str__(self): 2791 return "\t".join(map(str, 2792 (self.reference_id, 2793 self.reference_pos, 2794 self.nsegments))) +\ 2795 "\n" +\ 2796 "\n".join(map(str, self.pileups)) 2797 2798 def __dealloc__(self): 2799 free(self.buf.s) 2800 2801 def set_min_base_quality(self, min_base_quality): 2802 """set the minimum base quality for this pileup column. 2803 """ 2804 self.min_base_quality = min_base_quality 2805 2806 def __len__(self): 2807 """return number of reads aligned to this column. 2808 2809 see :meth:`get_num_aligned` 2810 """ 2811 return self.get_num_aligned() 2812 2813 property reference_id: 2814 '''the reference sequence number as defined in the header''' 2815 def __get__(self): 2816 return self.tid 2817 2818 property reference_name: 2819 """:term:`reference` name (None if no AlignmentFile is associated)""" 2820 def __get__(self): 2821 if self.header is not None: 2822 return self.header.get_reference_name(self.tid) 2823 return None 2824 2825 property nsegments: 2826 '''number of reads mapping to this column. 2827 2828 Note that this number ignores the base quality filter.''' 2829 def __get__(self): 2830 return self.n_pu 2831 def __set__(self, n): 2832 self.n_pu = n 2833 2834 property reference_pos: 2835 '''the position in the reference sequence (0-based).''' 2836 def __get__(self): 2837 return self.pos 2838 2839 property pileups: 2840 '''list of reads (:class:`pysam.PileupRead`) aligned to this column''' 2841 def __get__(self): 2842 if self.plp == NULL or self.plp[0] == NULL: 2843 raise ValueError("PileupColumn accessed after iterator finished") 2844 2845 cdef int x 2846 cdef const bam_pileup1_t * p = NULL 2847 pileups = [] 2848 2849 # warning: there could be problems if self.n and self.buf are 2850 # out of sync. 2851 for x from 0 <= x < self.n_pu: 2852 p = &(self.plp[0][x]) 2853 if p == NULL: 2854 raise ValueError( 2855 "pileup buffer out of sync - most likely use of iterator " 2856 "outside loop") 2857 if pileup_base_qual_skip(p, self.min_base_quality): 2858 continue 2859 pileups.append(makePileupRead(p, self.header)) 2860 return pileups 2861 2862 ######################################################## 2863 # Compatibility Accessors 2864 # Functions, properties for compatibility with pysam < 0.8 2865 ######################################################## 2866 property pos: 2867 """deprecated: use reference_pos""" 2868 def __get__(self): 2869 return self.reference_pos 2870 def __set__(self, v): 2871 self.reference_pos = v 2872 2873 property tid: 2874 """deprecated: use reference_id""" 2875 def __get__(self): 2876 return self.reference_id 2877 def __set__(self, v): 2878 self.reference_id = v 2879 2880 property n: 2881 """deprecated: use nsegments""" 2882 def __get__(self): 2883 return self.nsegments 2884 def __set__(self, v): 2885 self.nsegments = v 2886 2887 def get_num_aligned(self): 2888 """return number of aligned bases at pileup column position. 2889 2890 This method applies a base quality filter and the number is 2891 equal to the size of :meth:`get_query_sequences`, 2892 :meth:`get_mapping_qualities`, etc. 2893 2894 """ 2895 cdef uint32_t x = 0 2896 cdef uint32_t c = 0 2897 cdef uint32_t cnt = 0 2898 cdef const bam_pileup1_t * p = NULL 2899 if self.plp == NULL or self.plp[0] == NULL: 2900 raise ValueError("PileupColumn accessed after iterator finished") 2901 2902 for x from 0 <= x < self.n_pu: 2903 p = &(self.plp[0][x]) 2904 if p == NULL: 2905 raise ValueError( 2906 "pileup buffer out of sync - most likely use of iterator " 2907 "outside loop") 2908 if pileup_base_qual_skip(p, self.min_base_quality): 2909 continue 2910 cnt += 1 2911 return cnt 2912 2913 def get_query_sequences(self, bint mark_matches=False, bint mark_ends=False, bint add_indels=False): 2914 """query bases/sequences at pileup column position. 2915 2916 Optionally, the bases/sequences can be annotated according to the samtools 2917 mpileup format. This is the format description from the samtools mpileup tool:: 2918 2919 Information on match, mismatch, indel, strand, mapping 2920 quality and start and end of a read are all encoded at the 2921 read base column. At this column, a dot stands for a match 2922 to the reference base on the forward strand, a comma for a 2923 match on the reverse strand, a '>' or '<' for a reference 2924 skip, `ACGTN' for a mismatch on the forward strand and 2925 `acgtn' for a mismatch on the reverse strand. A pattern 2926 `\\+[0-9]+[ACGTNacgtn]+' indicates there is an insertion 2927 between this reference position and the next reference 2928 position. The length of the insertion is given by the 2929 integer in the pattern, followed by the inserted 2930 sequence. Similarly, a pattern `-[0-9]+[ACGTNacgtn]+' 2931 represents a deletion from the reference. The deleted bases 2932 will be presented as `*' in the following lines. Also at 2933 the read base column, a symbol `^' marks the start of a 2934 read. The ASCII of the character following `^' minus 33 2935 gives the mapping quality. A symbol `$' marks the end of a 2936 read segment 2937 2938 To reproduce samtools mpileup format, set all of mark_matches, 2939 mark_ends and add_indels to True. 2940 2941 Parameters 2942 ---------- 2943 2944 mark_matches: bool 2945 2946 If True, output bases matching the reference as "." or "," 2947 for forward and reverse strand, respectively. This mark 2948 requires the reference sequence. If no reference is 2949 present, this option is ignored. 2950 2951 mark_ends : bool 2952 2953 If True, add markers "^" and "$" for read start and end, respectively. 2954 2955 add_indels : bool 2956 2957 If True, add bases for bases inserted into or skipped from the 2958 reference. The latter requires a reference sequence file to have 2959 been given, e.g. via `pileup(fastafile = ...)`. If no reference 2960 sequence is available, skipped bases are represented as 'N's. 2961 2962 Returns 2963 ------- 2964 2965 list: a list of bases/sequences per read at pileup column position. 2966 2967 """ 2968 cdef uint32_t x = 0 2969 cdef uint32_t j = 0 2970 cdef uint32_t c = 0 2971 cdef uint8_t cc = 0 2972 cdef uint8_t rb = 0 2973 cdef kstring_t * buf = &self.buf 2974 cdef const bam_pileup1_t * p = NULL 2975 2976 if self.plp == NULL or self.plp[0] == NULL: 2977 raise ValueError("PileupColumn accessed after iterator finished") 2978 2979 buf.l = 0 2980 2981 # todo: reference sequence to count matches/mismatches 2982 # todo: convert assertions to exceptions 2983 for x from 0 <= x < self.n_pu: 2984 p = &(self.plp[0][x]) 2985 if p == NULL: 2986 raise ValueError( 2987 "pileup buffer out of sync - most likely use of iterator " 2988 "outside loop") 2989 if pileup_base_qual_skip(p, self.min_base_quality): 2990 continue 2991 # see samtools pileup_seq 2992 if mark_ends and p.is_head: 2993 kputc('^', buf) 2994 2995 if p.b.core.qual > 93: 2996 kputc(126, buf) 2997 else: 2998 kputc(p.b.core.qual + 33, buf) 2999 if not p.is_del: 3000 if p.qpos < p.b.core.l_qseq: 3001 cc = <uint8_t>seq_nt16_str[bam_seqi(bam_get_seq(p.b), p.qpos)] 3002 else: 3003 cc = 'N' 3004 3005 if mark_matches and self.reference_sequence != NULL: 3006 rb = self.reference_sequence[self.reference_pos] 3007 if seq_nt16_table[cc] == seq_nt16_table[rb]: 3008 cc = "=" 3009 kputc(strand_mark_char(cc, p.b), buf) 3010 elif add_indels: 3011 if p.is_refskip: 3012 if bam_is_rev(p.b): 3013 kputc('<', buf) 3014 else: 3015 kputc('>', buf) 3016 else: 3017 kputc('*', buf) 3018 if add_indels: 3019 if p.indel > 0: 3020 kputc('+', buf) 3021 kputw(p.indel, buf) 3022 for j from 1 <= j <= p.indel: 3023 cc = seq_nt16_str[bam_seqi(bam_get_seq(p.b), p.qpos + j)] 3024 kputc(strand_mark_char(cc, p.b), buf) 3025 elif p.indel < 0: 3026 kputc('-', buf) 3027 kputw(-p.indel, buf) 3028 for j from 1 <= j <= -p.indel: 3029 # TODO: out-of-range check here? 3030 if self.reference_sequence == NULL: 3031 cc = 'N' 3032 else: 3033 cc = self.reference_sequence[self.reference_pos + j] 3034 kputc(strand_mark_char(cc, p.b), buf) 3035 if mark_ends and p.is_tail: 3036 kputc('$', buf) 3037 3038 kputc(':', buf) 3039 3040 if buf.l == 0: 3041 # could be zero if all qualities are too low 3042 return "" 3043 else: 3044 # quicker to ensemble all and split than to encode all separately. 3045 # ignore last ":" 3046 return force_str(PyBytes_FromStringAndSize(buf.s, buf.l-1)).split(":") 3047 3048 def get_query_qualities(self): 3049 """query base quality scores at pileup column position. 3050 3051 Returns 3052 ------- 3053 3054 list: a list of quality scores 3055 """ 3056 cdef uint32_t x = 0 3057 cdef const bam_pileup1_t * p = NULL 3058 cdef uint32_t c = 0 3059 result = [] 3060 for x from 0 <= x < self.n_pu: 3061 p = &(self.plp[0][x]) 3062 if p == NULL: 3063 raise ValueError( 3064 "pileup buffer out of sync - most likely use of iterator " 3065 "outside loop") 3066 3067 if p.qpos < p.b.core.l_qseq: 3068 c = bam_get_qual(p.b)[p.qpos] 3069 else: 3070 c = 0 3071 if c < self.min_base_quality: 3072 continue 3073 result.append(c) 3074 return result 3075 3076 def get_mapping_qualities(self): 3077 """query mapping quality scores at pileup column position. 3078 3079 Returns 3080 ------- 3081 3082 list: a list of quality scores 3083 """ 3084 if self.plp == NULL or self.plp[0] == NULL: 3085 raise ValueError("PileupColumn accessed after iterator finished") 3086 3087 cdef uint32_t x = 0 3088 cdef const bam_pileup1_t * p = NULL 3089 result = [] 3090 for x from 0 <= x < self.n_pu: 3091 p = &(self.plp[0][x]) 3092 if p == NULL: 3093 raise ValueError( 3094 "pileup buffer out of sync - most likely use of iterator " 3095 "outside loop") 3096 3097 if pileup_base_qual_skip(p, self.min_base_quality): 3098 continue 3099 result.append(p.b.core.qual) 3100 return result 3101 3102 def get_query_positions(self): 3103 """positions in read at pileup column position. 3104 3105 Returns 3106 ------- 3107 3108 list: a list of read positions 3109 """ 3110 if self.plp == NULL or self.plp[0] == NULL: 3111 raise ValueError("PileupColumn accessed after iterator finished") 3112 3113 cdef uint32_t x = 0 3114 cdef const bam_pileup1_t * p = NULL 3115 result = [] 3116 for x from 0 <= x < self.n_pu: 3117 p = &(self.plp[0][x]) 3118 if p == NULL: 3119 raise ValueError( 3120 "pileup buffer out of sync - most likely use of iterator " 3121 "outside loop") 3122 3123 if pileup_base_qual_skip(p, self.min_base_quality): 3124 continue 3125 result.append(p.qpos) 3126 return result 3127 3128 def get_query_names(self): 3129 """query/read names aligned at pileup column position. 3130 3131 Returns 3132 ------- 3133 3134 list: a list of query names at pileup column position. 3135 """ 3136 if self.plp == NULL or self.plp[0] == NULL: 3137 raise ValueError("PileupColumn accessed after iterator finished") 3138 3139 cdef uint32_t x = 0 3140 cdef const bam_pileup1_t * p = NULL 3141 result = [] 3142 for x from 0 <= x < self.n_pu: 3143 p = &(self.plp[0][x]) 3144 if p == NULL: 3145 raise ValueError( 3146 "pileup buffer out of sync - most likely use of iterator " 3147 "outside loop") 3148 3149 if pileup_base_qual_skip(p, self.min_base_quality): 3150 continue 3151 result.append(charptr_to_str(pysam_bam_get_qname(p.b))) 3152 return result 3153 3154 3155cdef class PileupRead: 3156 '''Representation of a read aligned to a particular position in the 3157 reference sequence. 3158 3159 ''' 3160 3161 def __init__(self): 3162 raise TypeError( 3163 "this class cannot be instantiated from Python") 3164 3165 def __str__(self): 3166 return "\t".join( 3167 map(str, 3168 (self.alignment, self.query_position, 3169 self.indel, self.level, 3170 self.is_del, self.is_head, 3171 self.is_tail, self.is_refskip))) 3172 3173 property alignment: 3174 """a :class:`pysam.AlignedSegment` object of the aligned read""" 3175 def __get__(self): 3176 return self._alignment 3177 3178 property query_position: 3179 """position of the read base at the pileup site, 0-based. 3180 None if is_del or is_refskip is set. 3181 3182 """ 3183 def __get__(self): 3184 if self.is_del or self.is_refskip: 3185 return None 3186 else: 3187 return self._qpos 3188 3189 property query_position_or_next: 3190 """position of the read base at the pileup site, 0-based. 3191 3192 If the current position is a deletion, returns the next 3193 aligned base. 3194 3195 """ 3196 def __get__(self): 3197 return self._qpos 3198 3199 property indel: 3200 """indel length for the position following the current pileup site. 3201 3202 This quantity peeks ahead to the next cigar operation in this 3203 alignment. If the next operation is an insertion, indel will 3204 be positive. If the next operation is a deletion, it will be 3205 negation. 0 if the next operation is not an indel. 3206 3207 """ 3208 def __get__(self): 3209 return self._indel 3210 3211 property level: 3212 """the level of the read in the "viewer" mode. Note that this value 3213 is currently not computed.""" 3214 def __get__(self): 3215 return self._level 3216 3217 property is_del: 3218 """1 iff the base on the padded read is a deletion""" 3219 def __get__(self): 3220 return self._is_del 3221 3222 property is_head: 3223 """1 iff the base on the padded read is the left-most base.""" 3224 def __get__(self): 3225 return self._is_head 3226 3227 property is_tail: 3228 """1 iff the base on the padded read is the right-most base.""" 3229 def __get__(self): 3230 return self._is_tail 3231 3232 property is_refskip: 3233 """1 iff the base on the padded read is part of CIGAR N op.""" 3234 def __get__(self): 3235 return self._is_refskip 3236 3237 3238 3239cpdef enum CIGAR_OPS: 3240 CMATCH = 0 3241 CINS = 1 3242 CDEL = 2 3243 CREF_SKIP = 3 3244 CSOFT_CLIP = 4 3245 CHARD_CLIP = 5 3246 CPAD = 6 3247 CEQUAL = 7 3248 CDIFF = 8 3249 CBACK = 9 3250 3251 3252cpdef enum SAM_FLAGS: 3253 # the read is paired in sequencing, no matter whether it is mapped in a pair 3254 FPAIRED = 1 3255 # the read is mapped in a proper pair 3256 FPROPER_PAIR = 2 3257 # the read itself is unmapped; conflictive with FPROPER_PAIR 3258 FUNMAP = 4 3259 # the mate is unmapped 3260 FMUNMAP = 8 3261 # the read is mapped to the reverse strand 3262 FREVERSE = 16 3263 # the mate is mapped to the reverse strand 3264 FMREVERSE = 32 3265 # this is read1 3266 FREAD1 = 64 3267 # this is read2 3268 FREAD2 = 128 3269 # not primary alignment 3270 FSECONDARY = 256 3271 # QC failure 3272 FQCFAIL = 512 3273 # optical or PCR duplicate 3274 FDUP = 1024 3275 # supplementary alignment 3276 FSUPPLEMENTARY = 2048 3277 3278 3279__all__ = [ 3280 "AlignedSegment", 3281 "PileupColumn", 3282 "PileupRead", 3283 "CMATCH", 3284 "CINS", 3285 "CDEL", 3286 "CREF_SKIP", 3287 "CSOFT_CLIP", 3288 "CHARD_CLIP", 3289 "CPAD", 3290 "CEQUAL", 3291 "CDIFF", 3292 "CBACK", 3293 "FPAIRED", 3294 "FPROPER_PAIR", 3295 "FUNMAP", 3296 "FMUNMAP", 3297 "FREVERSE", 3298 "FMREVERSE", 3299 "FREAD1", 3300 "FREAD2", 3301 "FSECONDARY", 3302 "FQCFAIL", 3303 "FDUP", 3304 "FSUPPLEMENTARY", 3305 "KEY_NAMES"] 3306