1# cython: embedsignature=True
2# cython: profile=True
3###############################################################################
4###############################################################################
5# Cython wrapper for SAM/BAM/CRAM files based on htslib
6###############################################################################
7# The principal classes defined in this module are:
8#
9# class AlignedSegment  an aligned segment (read)
10#
11# class PileupColumn    a collection of segments (PileupRead) aligned to
12#                       a particular genomic position.
13#
14# class PileupRead      an AlignedSegment aligned to a particular genomic
15#                       position. Contains additional attributes with respect
16#                       to this.
17#
18# Additionally this module defines numerous additional classes that are part
19# of the internal API. These are:
20#
21# Various iterator classes to iterate over alignments in sequential (IteratorRow)
22# or in a stacked fashion (IteratorColumn):
23#
24# class IteratorRow
25# class IteratorRowRegion
26# class IteratorRowHead
27# class IteratorRowAll
28# class IteratorRowAllRefs
29# class IteratorRowSelection
30#
31###############################################################################
32#
33# The MIT License
34#
35# Copyright (c) 2015 Andreas Heger
36#
37# Permission is hereby granted, free of charge, to any person obtaining a
38# copy of this software and associated documentation files (the "Software"),
39# to deal in the Software without restriction, including without limitation
40# the rights to use, copy, modify, merge, publish, distribute, sublicense,
41# and/or sell copies of the Software, and to permit persons to whom the
42# Software is furnished to do so, subject to the following conditions:
43#
44# The above copyright notice and this permission notice shall be included in
45# all copies or substantial portions of the Software.
46#
47# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
48# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
49# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
50# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
51# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
52# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
53# DEALINGS IN THE SOFTWARE.
54#
55###############################################################################
56import re
57import array
58import json
59import string
60import ctypes
61import struct
62
63cimport cython
64from cpython cimport array as c_array
65from cpython.version cimport PY_MAJOR_VERSION
66from cpython cimport PyBytes_FromStringAndSize
67from libc.string cimport strchr
68from cpython cimport array as c_array
69from libc.stdint cimport INT8_MIN, INT16_MIN, INT32_MIN, \
70    INT8_MAX, INT16_MAX, INT32_MAX, \
71    UINT8_MAX, UINT16_MAX, UINT32_MAX
72
73from pysam.libchtslib cimport HTS_IDX_NOCOOR
74from pysam.libcutils cimport force_bytes, force_str, \
75    charptr_to_str, charptr_to_bytes
76from pysam.libcutils cimport qualities_to_qualitystring, qualitystring_to_array, \
77    array_to_qualitystring
78
79# Constants for binary tag conversion
80cdef char * htslib_types = 'cCsSiIf'
81cdef char * parray_types = 'bBhHiIf'
82
83cdef bint IS_PYTHON3 = PY_MAJOR_VERSION >= 3
84
85# translation tables
86
87# cigar code to character and vice versa
88cdef char* CODE2CIGAR= "MIDNSHP=XB"
89cdef int NCIGAR_CODES = 10
90
91if IS_PYTHON3:
92    CIGAR2CODE = dict([y, x] for x, y in enumerate(CODE2CIGAR))
93    maketrans = str.maketrans
94else:
95    CIGAR2CODE = dict([ord(y), x] for x, y in enumerate(CODE2CIGAR))
96    maketrans = string.maketrans
97
98CIGAR_REGEX = re.compile("(\d+)([MIDNSHP=XB])")
99
100# names for keys in dictionary representation of an AlignedSegment
101KEY_NAMES = ["name", "flag", "ref_name", "ref_pos", "map_quality", "cigar",
102             "next_ref_name", "next_ref_pos", "length", "seq", "qual", "tags"]
103
104#####################################################################
105# C multiplication with wrapping around
106cdef inline uint32_t c_mul(uint32_t a, uint32_t b):
107    return (a * b) & 0xffffffff
108
109
110cdef inline uint8_t tolower(uint8_t ch):
111    if ch >= 65 and ch <= 90:
112        return ch + 32
113    else:
114        return ch
115
116
117cdef inline uint8_t toupper(uint8_t ch):
118    if ch >= 97 and ch <= 122:
119        return ch - 32
120    else:
121        return ch
122
123
124cdef inline uint8_t strand_mark_char(uint8_t ch, bam1_t *b):
125    if ch == '=':
126        if bam_is_rev(b):
127            return ','
128        else:
129            return '.'
130    else:
131        if bam_is_rev(b):
132            return tolower(ch)
133        else:
134            return toupper(ch)
135
136
137cdef inline bint pileup_base_qual_skip(const bam_pileup1_t * p, uint32_t threshold):
138    cdef uint32_t c
139    if p.qpos < p.b.core.l_qseq:
140        c = bam_get_qual(p.b)[p.qpos]
141    else:
142        c = 0
143    if c < threshold:
144        return True
145    return False
146
147
148cdef inline char map_typecode_htslib_to_python(uint8_t s):
149    """map an htslib typecode to the corresponding python typecode
150    to be used in the struct or array modules."""
151
152    # map type from htslib to python array
153    cdef char * f = strchr(htslib_types, s)
154
155    if f == NULL:
156        return 0
157    return parray_types[f - htslib_types]
158
159
160cdef inline uint8_t map_typecode_python_to_htslib(char s):
161    """determine value type from type code of array"""
162    cdef char * f = strchr(parray_types, s)
163    if f == NULL:
164        return 0
165    return htslib_types[f - parray_types]
166
167
168cdef inline void update_bin(bam1_t * src):
169    if src.core.flag & BAM_FUNMAP:
170        # treat alignment as length of 1 for unmapped reads
171        src.core.bin = hts_reg2bin(
172            src.core.pos,
173            src.core.pos + 1,
174            14,
175            5)
176    elif pysam_get_n_cigar(src):
177        src.core.bin = hts_reg2bin(
178            src.core.pos,
179            bam_endpos(src),
180            14,
181            5)
182    else:
183        src.core.bin = hts_reg2bin(
184            src.core.pos,
185            src.core.pos + 1,
186            14,
187            5)
188
189
190# optional tag data manipulation
191cdef convert_binary_tag(uint8_t * tag):
192    """return bytesize, number of values and array of values
193    in aux_data memory location pointed to by tag."""
194    cdef uint8_t auxtype
195    cdef uint8_t byte_size
196    cdef int32_t nvalues
197    # get byte size
198    auxtype = tag[0]
199    byte_size = aux_type2size(auxtype)
200    tag += 1
201    # get number of values in array
202    nvalues = (<int32_t*>tag)[0]
203    tag += 4
204
205    # define python array
206    cdef c_array.array c_values = array.array(
207        chr(map_typecode_htslib_to_python(auxtype)))
208    c_array.resize(c_values, nvalues)
209
210    # copy data
211    memcpy(c_values.data.as_voidptr, <uint8_t*>tag, nvalues * byte_size)
212
213    # no need to check for endian-ness as bam1_core_t fields
214    # and aux_data are in host endian-ness. See sam.c and calls
215    # to swap_data
216    return byte_size, nvalues, c_values
217
218
219cdef inline uint8_t get_tag_typecode(value, value_type=None):
220    """guess type code for a *value*. If *value_type* is None, the type
221    code will be inferred based on the Python type of *value*
222
223    """
224    # 0 is unknown typecode
225    cdef char typecode = 0
226
227    if value_type is None:
228        if isinstance(value, int):
229            if value < 0:
230                if value >= INT8_MIN:
231                    typecode = 'c'
232                elif value >= INT16_MIN:
233                    typecode = 's'
234                elif value >= INT32_MIN:
235                    typecode = 'i'
236            # unsigned ints
237            else:
238                if value <= UINT8_MAX:
239                    typecode = 'C'
240                elif value <= UINT16_MAX:
241                    typecode = 'S'
242                elif value <= UINT32_MAX:
243                    typecode = 'I'
244        elif isinstance(value, float):
245            typecode = 'f'
246        elif isinstance(value, str):
247            typecode = 'Z'
248        elif isinstance(value, bytes):
249            typecode = 'Z'
250        elif isinstance(value, array.array) or \
251                isinstance(value, list) or \
252                isinstance(value, tuple):
253            typecode = 'B'
254    else:
255        if value_type in 'aAsSIcCZidfH':
256            typecode = force_bytes(value_type)[0]
257
258    return typecode
259
260
261cdef inline uint8_t get_btag_typecode(value, min_value=None, max_value=None):
262    '''returns the value typecode of a value.
263
264    If max is specified, the appropriate type is returned for a range
265    where value is the minimum.
266
267    Note that this method returns types from the extended BAM alphabet
268    of types that includes tags that are not part of the SAM
269    specification.
270    '''
271
272
273    cdef uint8_t typecode
274
275    t = type(value)
276
277    if t is float:
278        typecode = 'f'
279    elif t is int:
280        if max_value is None:
281            max_value = value
282        if min_value is None:
283            min_value = value
284        # signed ints
285        if min_value < 0:
286            if min_value >= INT8_MIN and max_value <= INT8_MAX:
287                typecode = 'c'
288            elif min_value >= INT16_MIN and max_value <= INT16_MAX:
289                typecode = 's'
290            elif min_value >= INT32_MIN or max_value <= INT32_MAX:
291                typecode = 'i'
292            else:
293                raise ValueError(
294                    "at least one signed integer out of range of "
295                    "BAM/SAM specification")
296        # unsigned ints
297        else:
298            if max_value <= UINT8_MAX:
299                typecode = 'C'
300            elif max_value <= UINT16_MAX:
301                typecode = 'S'
302            elif max_value <= UINT32_MAX:
303                typecode = 'I'
304            else:
305                raise ValueError(
306                    "at least one integer out of range of BAM/SAM specification")
307    else:
308        # Note: hex strings (H) are not supported yet
309        if t is not bytes:
310            value = value.encode('ascii')
311        if len(value) == 1:
312            typecode = 'A'
313        else:
314            typecode = 'Z'
315
316    return typecode
317
318
319# mapping python array.array and htslib typecodes to struct typecodes
320DATATYPE2FORMAT = {
321    ord('c'): ('b', 1),
322    ord('C'): ('B', 1),
323    ord('s'): ('h', 2),
324    ord('S'): ('H', 2),
325    ord('i'): ('i', 4),
326    ord('I'): ('I', 4),
327    ord('f'): ('f', 4),
328    ord('d'): ('d', 8),
329    ord('A'): ('c', 1),
330    ord('a'): ('c', 1)}
331
332
333cdef inline pack_tags(tags):
334    """pack a list of tags. Each tag is a tuple of (tag, tuple).
335
336    Values are packed into the most space efficient data structure
337    possible unless the tag contains a third field with the typecode.
338
339    Returns a format string and the associated list of arguments to be
340    used in a call to struct.pack_into.
341    """
342    fmts, args = ["<"], []
343
344    # htslib typecode
345    cdef uint8_t typecode
346    for tag in tags:
347
348        if len(tag) == 2:
349            pytag, value = tag
350            valuetype = None
351        elif len(tag) == 3:
352            pytag, value, valuetype = tag
353        else:
354            raise ValueError("malformatted tag: %s" % str(tag))
355
356        if valuetype is None:
357            typecode = 0
358        else:
359            # only first character in valuecode matters
360            if IS_PYTHON3:
361                typecode = force_bytes(valuetype)[0]
362            else:
363                typecode = ord(valuetype[0])
364
365        pytag = force_bytes(pytag)
366        pytype = type(value)
367
368        if pytype is tuple or pytype is list:
369            # binary tags from tuples or lists
370            if not typecode:
371                # automatically determine value type - first value
372                # determines type. If there is a mix of types, the
373                # result is undefined.
374                typecode = get_btag_typecode(min(value),
375                                             min_value=min(value),
376                                             max_value=max(value))
377
378            if typecode not in DATATYPE2FORMAT:
379                raise ValueError("invalid value type '{}'".format(chr(typecode)))
380
381            datafmt = "2sBBI%i%s" % (len(value), DATATYPE2FORMAT[typecode][0])
382            args.extend([pytag[:2],
383                         ord("B"),
384                         typecode,
385                         len(value)] + list(value))
386
387        elif isinstance(value, array.array):
388            # binary tags from arrays
389            if typecode == 0:
390                typecode = map_typecode_python_to_htslib(ord(value.typecode))
391
392                if typecode == 0:
393                    raise ValueError("unsupported type code '{}'".format(value.typecode))
394
395            if typecode not in DATATYPE2FORMAT:
396                raise ValueError("invalid value type '{}' ({})".format(chr(typecode), array.typecode))
397
398            # use array.tostring() to retrieve byte representation and
399            # save as bytes
400            datafmt = "2sBBI%is" % (len(value) * DATATYPE2FORMAT[typecode][1])
401            if IS_PYTHON3:
402                args.extend([pytag[:2],
403                             ord("B"),
404                             typecode,
405                             len(value),
406                             value.tobytes()])
407            else:
408                args.extend([pytag[:2],
409                             ord("B"),
410                             typecode,
411                             len(value),
412                             force_bytes(value.tostring())])
413
414        else:
415            if typecode == 0:
416                typecode = get_tag_typecode(value)
417                if typecode == 0:
418                    raise ValueError("could not deduce typecode for value {}".format(value))
419
420            if typecode == 'a' or typecode == 'A' or typecode == 'Z' or typecode == 'H':
421                value = force_bytes(value)
422
423            if typecode == "a":
424                typecode = 'A'
425
426            if typecode == 'Z' or typecode == 'H':
427                datafmt = "2sB%is" % (len(value)+1)
428            else:
429                datafmt = "2sB%s" % DATATYPE2FORMAT[typecode][0]
430
431            args.extend([pytag[:2],
432                         typecode,
433                         value])
434
435        fmts.append(datafmt)
436
437    return "".join(fmts), args
438
439
440cdef inline int32_t calculateQueryLengthWithoutHardClipping(bam1_t * src):
441    """return query length computed from CIGAR alignment.
442
443    Length ignores hard-clipped bases.
444
445    Return 0 if there is no CIGAR alignment.
446    """
447
448    cdef uint32_t * cigar_p = pysam_bam_get_cigar(src)
449
450    if cigar_p == NULL:
451        return 0
452
453    cdef uint32_t k, qpos
454    cdef int op
455    qpos = 0
456
457    for k from 0 <= k < pysam_get_n_cigar(src):
458        op = cigar_p[k] & BAM_CIGAR_MASK
459
460        if op == BAM_CMATCH or \
461           op == BAM_CINS or \
462           op == BAM_CSOFT_CLIP or \
463           op == BAM_CEQUAL or \
464           op == BAM_CDIFF:
465            qpos += cigar_p[k] >> BAM_CIGAR_SHIFT
466
467    return qpos
468
469
470cdef inline int32_t calculateQueryLengthWithHardClipping(bam1_t * src):
471    """return query length computed from CIGAR alignment.
472
473    Length includes hard-clipped bases.
474
475    Return 0 if there is no CIGAR alignment.
476    """
477
478    cdef uint32_t * cigar_p = pysam_bam_get_cigar(src)
479
480    if cigar_p == NULL:
481        return 0
482
483    cdef uint32_t k, qpos
484    cdef int op
485    qpos = 0
486
487    for k from 0 <= k < pysam_get_n_cigar(src):
488        op = cigar_p[k] & BAM_CIGAR_MASK
489
490        if op == BAM_CMATCH or \
491           op == BAM_CINS or \
492           op == BAM_CSOFT_CLIP or \
493           op == BAM_CHARD_CLIP or \
494           op == BAM_CEQUAL or \
495           op == BAM_CDIFF:
496            qpos += cigar_p[k] >> BAM_CIGAR_SHIFT
497
498    return qpos
499
500
501cdef inline int32_t getQueryStart(bam1_t *src) except -1:
502    cdef uint32_t * cigar_p
503    cdef uint32_t start_offset = 0
504    cdef uint32_t k, op
505
506    cigar_p = pysam_bam_get_cigar(src);
507    for k from 0 <= k < pysam_get_n_cigar(src):
508        op = cigar_p[k] & BAM_CIGAR_MASK
509        if op == BAM_CHARD_CLIP:
510            if start_offset != 0 and start_offset != src.core.l_qseq:
511                raise ValueError('Invalid clipping in CIGAR string')
512        elif op == BAM_CSOFT_CLIP:
513            start_offset += cigar_p[k] >> BAM_CIGAR_SHIFT
514        else:
515            break
516
517    return start_offset
518
519
520cdef inline int32_t getQueryEnd(bam1_t *src) except -1:
521    cdef uint32_t * cigar_p = pysam_bam_get_cigar(src)
522    cdef uint32_t end_offset = src.core.l_qseq
523    cdef uint32_t k, op
524
525    # if there is no sequence, compute length from cigar string
526    if end_offset == 0:
527        for k from 0 <= k < pysam_get_n_cigar(src):
528            op = cigar_p[k] & BAM_CIGAR_MASK
529            if op == BAM_CMATCH or \
530               op == BAM_CINS or \
531               op == BAM_CEQUAL or \
532               op == BAM_CDIFF or \
533              (op == BAM_CSOFT_CLIP and end_offset == 0):
534                end_offset += cigar_p[k] >> BAM_CIGAR_SHIFT
535    else:
536        # walk backwards in cigar string
537        for k from pysam_get_n_cigar(src) > k >= 1:
538            op = cigar_p[k] & BAM_CIGAR_MASK
539            if op == BAM_CHARD_CLIP:
540                if end_offset != src.core.l_qseq:
541                    raise ValueError('Invalid clipping in CIGAR string')
542            elif op == BAM_CSOFT_CLIP:
543                end_offset -= cigar_p[k] >> BAM_CIGAR_SHIFT
544            else:
545                break
546
547    return end_offset
548
549
550cdef inline bytes getSequenceInRange(bam1_t *src,
551                                     uint32_t start,
552                                     uint32_t end):
553    """return python string of the sequence in a bam1_t object.
554    """
555
556    cdef uint8_t * p
557    cdef uint32_t k
558    cdef char * s
559
560    if not src.core.l_qseq:
561        return None
562
563    seq = PyBytes_FromStringAndSize(NULL, end - start)
564    s   = <char*>seq
565    p   = pysam_bam_get_seq(src)
566
567    for k from start <= k < end:
568        # equivalent to seq_nt16_str[bam1_seqi(s, i)] (see bam.c)
569        # note: do not use string literal as it will be a python string
570        s[k-start] = seq_nt16_str[p[k/2] >> 4 * (1 - k%2) & 0xf]
571
572    return charptr_to_bytes(seq)
573
574
575cdef inline object getQualitiesInRange(bam1_t *src,
576                                       uint32_t start,
577                                       uint32_t end):
578    """return python array of quality values from a bam1_t object"""
579
580    cdef uint8_t * p
581    cdef uint32_t k
582
583    p = pysam_bam_get_qual(src)
584    if p[0] == 0xff:
585        return None
586
587    # 'B': unsigned char
588    cdef c_array.array result = array.array('B', [0])
589    c_array.resize(result, end - start)
590
591    # copy data
592    memcpy(result.data.as_voidptr, <void*>&p[start], end - start)
593
594    return result
595
596
597#####################################################################
598## factory methods for instantiating extension classes
599cdef class AlignedSegment
600cdef AlignedSegment makeAlignedSegment(bam1_t *src,
601                                       AlignmentHeader header):
602    '''return an AlignedSegment object constructed from `src`'''
603    # note that the following does not call __init__
604    cdef AlignedSegment dest = AlignedSegment.__new__(AlignedSegment)
605    dest._delegate = bam_dup1(src)
606    dest.header = header
607    return dest
608
609
610cdef class PileupColumn
611cdef PileupColumn makePileupColumn(const bam_pileup1_t ** plp,
612                      int tid,
613                      int pos,
614                      int n_pu,
615                      uint32_t min_base_quality,
616                      char * reference_sequence,
617                      AlignmentHeader header):
618    '''return a PileupColumn object constructed from pileup in `plp` and
619    setting additional attributes.
620
621    '''
622    # note that the following does not call __init__
623    cdef PileupColumn dest = PileupColumn.__new__(PileupColumn)
624    dest.header = header
625    dest.plp = plp
626    dest.tid = tid
627    dest.pos = pos
628    dest.n_pu = n_pu
629    dest.min_base_quality = min_base_quality
630    dest.reference_sequence = reference_sequence
631    dest.buf.l = dest.buf.m = 0
632    dest.buf.s = NULL
633
634    return dest
635
636
637cdef class PileupRead
638cdef PileupRead makePileupRead(const bam_pileup1_t *src,
639                               AlignmentHeader header):
640    '''return a PileupRead object construted from a bam_pileup1_t * object.'''
641    # note that the following does not call __init__
642    cdef PileupRead dest = PileupRead.__new__(PileupRead)
643    dest._alignment = makeAlignedSegment(src.b, header)
644    dest._qpos = src.qpos
645    dest._indel = src.indel
646    dest._level = src.level
647    dest._is_del = src.is_del
648    dest._is_head = src.is_head
649    dest._is_tail = src.is_tail
650    dest._is_refskip = src.is_refskip
651    return dest
652
653
654cdef inline uint32_t get_alignment_length(bam1_t *src):
655    cdef uint32_t k = 0
656    cdef uint32_t l = 0
657    if src == NULL:
658        return 0
659    cdef uint32_t * cigar_p = bam_get_cigar(src)
660    if cigar_p == NULL:
661        return 0
662    cdef int op
663    cdef uint32_t n = pysam_get_n_cigar(src)
664    for k from 0 <= k < n:
665        op = cigar_p[k] & BAM_CIGAR_MASK
666        if op == BAM_CSOFT_CLIP or op == BAM_CHARD_CLIP:
667            continue
668        l += cigar_p[k] >> BAM_CIGAR_SHIFT
669    return l
670
671
672cdef inline uint32_t get_md_reference_length(char * md_tag):
673    cdef int l = 0
674    cdef int md_idx = 0
675    cdef int nmatches = 0
676
677    while md_tag[md_idx] != 0:
678        if md_tag[md_idx] >= 48 and md_tag[md_idx] <= 57:
679            nmatches *= 10
680            nmatches += md_tag[md_idx] - 48
681            md_idx += 1
682            continue
683        else:
684            l += nmatches
685            nmatches = 0
686            if md_tag[md_idx] == '^':
687                md_idx += 1
688                while md_tag[md_idx] >= 65 and md_tag[md_idx] <= 90:
689                    md_idx += 1
690                    l += 1
691            else:
692                md_idx += 1
693                l += 1
694
695    l += nmatches
696    return l
697
698# TODO: avoid string copying for getSequenceInRange, reconstituneSequenceFromMD, ...
699cdef inline bytes build_alignment_sequence(bam1_t * src):
700    """return expanded sequence from MD tag.
701
702    The sequence includes substitutions and both insertions in the
703    reference as well as deletions to the reference sequence. Combine
704    with the cigar string to reconstitute the query or the reference
705    sequence.
706
707    Positions corresponding to `N` (skipped region from the reference)
708    in the CIGAR string will not appear in the returned sequence. The
709    MD should correspondingly not contain these. Thus proper tags are::
710
711       Deletion from the reference:   cigar=5M1D5M    MD=5^C5
712       Skipped region from reference: cigar=5M1N5M    MD=10
713
714    Returns
715    -------
716
717    None, if no MD tag is present.
718
719    """
720    if src == NULL:
721        return None
722
723    cdef uint8_t * md_tag_ptr = bam_aux_get(src, "MD")
724    if md_tag_ptr == NULL:
725        return None
726
727    cdef uint32_t start = getQueryStart(src)
728    cdef uint32_t end = getQueryEnd(src)
729    # get read sequence, taking into account soft-clipping
730    r = getSequenceInRange(src, start, end)
731    cdef char * read_sequence = r
732    cdef uint32_t * cigar_p = pysam_bam_get_cigar(src)
733    if cigar_p == NULL:
734        return None
735
736    cdef uint32_t r_idx = 0
737    cdef int op
738    cdef uint32_t k, i, l, x
739    cdef int nmatches = 0
740    cdef int s_idx = 0
741
742    cdef uint32_t max_len = get_alignment_length(src)
743    if max_len == 0:
744        raise ValueError("could not determine alignment length")
745
746    cdef char * s = <char*>calloc(max_len + 1, sizeof(char))
747    if s == NULL:
748        raise ValueError(
749            "could not allocate sequence of length %i" % max_len)
750
751    for k from 0 <= k < pysam_get_n_cigar(src):
752        op = cigar_p[k] & BAM_CIGAR_MASK
753        l = cigar_p[k] >> BAM_CIGAR_SHIFT
754        if op == BAM_CMATCH or op == BAM_CEQUAL or op == BAM_CDIFF:
755            for i from 0 <= i < l:
756                s[s_idx] = read_sequence[r_idx]
757                r_idx += 1
758                s_idx += 1
759        elif op == BAM_CDEL:
760            for i from 0 <= i < l:
761                s[s_idx] = '-'
762                s_idx += 1
763        elif op == BAM_CREF_SKIP:
764            pass
765        elif op == BAM_CINS:
766            for i from 0 <= i < l:
767                # encode insertions into reference as lowercase
768                s[s_idx] = read_sequence[r_idx] + 32
769                r_idx += 1
770                s_idx += 1
771        elif op == BAM_CSOFT_CLIP:
772            pass
773        elif op == BAM_CHARD_CLIP:
774            pass # advances neither
775        elif op == BAM_CPAD:
776            raise NotImplementedError(
777                "Padding (BAM_CPAD, 6) is currently not supported. "
778                "Please implement. Sorry about that.")
779
780    cdef char * md_tag = <char*>bam_aux2Z(md_tag_ptr)
781    cdef int md_idx = 0
782    cdef char c
783    s_idx = 0
784
785    # Check if MD tag is valid by matching CIGAR length to MD tag defined length
786    # Insertions would be in addition to what is described by MD, so we calculate
787    # the number of insertions separately.
788    cdef int insertions = 0
789
790    while s[s_idx] != 0:
791        if s[s_idx] >= 'a':
792            insertions += 1
793        s_idx += 1
794    s_idx = 0
795
796    cdef uint32_t md_len = get_md_reference_length(md_tag)
797    if md_len + insertions > max_len:
798        raise AssertionError(
799            "Invalid MD tag: MD length {} mismatch with CIGAR length {} and {} insertions".format(
800            md_len, max_len, insertions))
801
802    while md_tag[md_idx] != 0:
803        # c is numerical
804        if md_tag[md_idx] >= 48 and md_tag[md_idx] <= 57:
805            nmatches *= 10
806            nmatches += md_tag[md_idx] - 48
807            md_idx += 1
808            continue
809        else:
810            # save matches up to this point, skipping insertions
811            for x from 0 <= x < nmatches:
812                while s[s_idx] >= 'a':
813                    s_idx += 1
814                s_idx += 1
815            while s[s_idx] >= 'a':
816                s_idx += 1
817
818            r_idx += nmatches
819            nmatches = 0
820            if md_tag[md_idx] == '^':
821                md_idx += 1
822                while md_tag[md_idx] >= 65 and md_tag[md_idx] <= 90:
823                    # assert s[s_idx] == '-'
824                    s[s_idx] = md_tag[md_idx]
825                    s_idx += 1
826                    md_idx += 1
827            else:
828                # save mismatch
829                # enforce lower case
830                c = md_tag[md_idx]
831                if c <= 90:
832                    c += 32
833                s[s_idx] = c
834                s_idx += 1
835                r_idx += 1
836                md_idx += 1
837
838    # save matches up to this point, skipping insertions
839    for x from 0 <= x < nmatches:
840        while s[s_idx] >= 'a':
841            s_idx += 1
842        s_idx += 1
843    while s[s_idx] >= 'a':
844        s_idx += 1
845
846    seq = PyBytes_FromStringAndSize(s, s_idx)
847    free(s)
848
849    return seq
850
851
852cdef inline bytes build_reference_sequence(bam1_t * src):
853    """return the reference sequence in the region that is covered by the
854    alignment of the read to the reference.
855
856    This method requires the MD tag to be set.
857
858    """
859    cdef uint32_t k, i, l
860    cdef int op
861    cdef int s_idx = 0
862    ref_seq = build_alignment_sequence(src)
863    if ref_seq is None:
864        raise ValueError("MD tag not present")
865
866    cdef char * s = <char*>calloc(len(ref_seq) + 1, sizeof(char))
867    if s == NULL:
868        raise ValueError(
869            "could not allocate sequence of length %i" % len(ref_seq))
870
871    cdef char * cref_seq = ref_seq
872    cdef uint32_t * cigar_p = pysam_bam_get_cigar(src)
873    cdef uint32_t r_idx = 0
874    for k from 0 <= k < pysam_get_n_cigar(src):
875        op = cigar_p[k] & BAM_CIGAR_MASK
876        l = cigar_p[k] >> BAM_CIGAR_SHIFT
877        if op == BAM_CMATCH or op == BAM_CEQUAL or op == BAM_CDIFF:
878            for i from 0 <= i < l:
879                s[s_idx] = cref_seq[r_idx]
880                r_idx += 1
881                s_idx += 1
882        elif op == BAM_CDEL:
883            for i from 0 <= i < l:
884                s[s_idx] = cref_seq[r_idx]
885                r_idx += 1
886                s_idx += 1
887        elif op == BAM_CREF_SKIP:
888            pass
889        elif op == BAM_CINS:
890            r_idx += l
891        elif op == BAM_CSOFT_CLIP:
892            pass
893        elif op == BAM_CHARD_CLIP:
894            pass # advances neither
895        elif op == BAM_CPAD:
896            raise NotImplementedError(
897                "Padding (BAM_CPAD, 6) is currently not supported. "
898                "Please implement. Sorry about that.")
899
900    seq = PyBytes_FromStringAndSize(s, s_idx)
901    free(s)
902
903    return seq
904
905
906cdef class AlignedSegment:
907    '''Class representing an aligned segment.
908
909    This class stores a handle to the samtools C-structure representing
910    an aligned read. Member read access is forwarded to the C-structure
911    and converted into python objects. This implementation should be fast,
912    as only the data needed is converted.
913
914    For write access, the C-structure is updated in-place. This is
915    not the most efficient way to build BAM entries, as the variable
916    length data is concatenated and thus needs to be resized if
917    a field is updated. Furthermore, the BAM entry might be
918    in an inconsistent state.
919
920    One issue to look out for is that the sequence should always
921    be set *before* the quality scores. Setting the sequence will
922    also erase any quality scores that were set previously.
923
924    Parameters
925    ----------
926
927    header:
928         :class:`~pysam.AlignmentHeader` object to map numerical
929         identifiers to chromosome names. If not given, an empty
930         header is created.
931    '''
932
933    # Now only called when instances are created from Python
934    def __init__(self, AlignmentHeader header=None):
935        # see bam_init1
936        self._delegate = <bam1_t*>calloc(1, sizeof(bam1_t))
937        if self._delegate == NULL:
938            raise MemoryError("could not allocated memory of {} bytes".format(sizeof(bam1_t)))
939        # allocate some memory. If size is 0, calloc does not return a
940        # pointer that can be passed to free() so allocate 40 bytes
941        # for a new read
942        self._delegate.m_data = 40
943        self._delegate.data = <uint8_t *>calloc(
944            self._delegate.m_data, 1)
945        if self._delegate.data == NULL:
946            raise MemoryError("could not allocate memory of {} bytes".format(self._delegate.m_data))
947        self._delegate.l_data = 0
948        # set some data to make read approximately legit.
949        # Note, SAM writing fails with q_name of length 0
950        self._delegate.core.l_qname = 0
951        self._delegate.core.tid = -1
952        self._delegate.core.pos = -1
953        self._delegate.core.mtid = -1
954        self._delegate.core.mpos = -1
955
956        # caching for selected fields
957        self.cache_query_qualities = None
958        self.cache_query_alignment_qualities = None
959        self.cache_query_sequence = None
960        self.cache_query_alignment_sequence = None
961
962        self.header = header
963
964    def __dealloc__(self):
965        bam_destroy1(self._delegate)
966
967    def __str__(self):
968        """return string representation of alignment.
969
970        The representation is an approximate :term:`SAM` format, because
971        an aligned read might not be associated with a :term:`AlignmentFile`.
972        As a result :term:`tid` is shown instead of the reference name.
973        Similarly, the tags field is returned in its parsed state.
974
975        To get a valid SAM record, use :meth:`to_string`.
976        """
977        # sam-parsing is done in sam.c/bam_format1_core which
978        # requires a valid header.
979        return "\t".join(map(str, (self.query_name,
980                                   self.flag,
981                                   "#%d" % self.reference_id if self.reference_id >= 0 else "*",
982                                   self.reference_start + 1,
983                                   self.mapping_quality,
984                                   self.cigarstring,
985                                   "#%d" % self.next_reference_id if self.next_reference_id >= 0 else "*",
986                                   self.next_reference_start + 1,
987                                   self.template_length,
988                                   self.query_sequence,
989                                   self.query_qualities,
990                                   self.tags)))
991
992    def __copy__(self):
993        return makeAlignedSegment(self._delegate, self.header)
994
995    def __deepcopy__(self, memo):
996        return makeAlignedSegment(self._delegate, self.header)
997
998    def compare(self, AlignedSegment other):
999        '''return -1,0,1, if contents in this are binary
1000        <,=,> to *other*
1001        '''
1002
1003        # avoid segfault when other equals None
1004        if other is None:
1005            return -1
1006
1007        cdef int retval, x
1008        cdef bam1_t *t
1009        cdef bam1_t *o
1010
1011        t = self._delegate
1012        o = other._delegate
1013
1014        # uncomment for debugging purposes
1015        # cdef unsigned char * oo, * tt
1016        # tt = <unsigned char*>(&t.core)
1017        # oo = <unsigned char*>(&o.core)
1018        # for x from 0 <= x < sizeof( bam1_core_t): print x, tt[x], oo[x]
1019        # tt = <unsigned char*>(t.data)
1020        # oo = <unsigned char*>(o.data)
1021        # for x from 0 <= x < max(t.l_data, o.l_data): print x, tt[x], oo[x], chr(tt[x]), chr(oo[x])
1022
1023        # Fast-path test for object identity
1024        if t == o:
1025            return 0
1026
1027        cdef uint8_t *a = <uint8_t*>&t.core
1028        cdef uint8_t *b = <uint8_t*>&o.core
1029
1030        retval = memcmp(&t.core, &o.core, sizeof(bam1_core_t))
1031        if retval:
1032            return retval
1033
1034        # cmp(t.l_data, o.l_data)
1035        retval = (t.l_data > o.l_data) - (t.l_data < o.l_data)
1036        if retval:
1037            return retval
1038        return memcmp(t.data, o.data, t.l_data)
1039
1040    def __richcmp__(self, AlignedSegment other, int op):
1041        if op == 2:  # == operator
1042            return self.compare(other) == 0
1043        elif op == 3:  # != operator
1044            return self.compare(other) != 0
1045        else:
1046            return NotImplemented
1047
1048    def __hash__(self):
1049        cdef bam1_t * src = self._delegate
1050        cdef int x
1051
1052        # see http://effbot.org/zone/python-hash.htm
1053        cdef uint8_t * c = <uint8_t *>&src.core
1054        cdef uint32_t hash_value = c[0]
1055        for x from 1 <= x < sizeof(bam1_core_t):
1056            hash_value = c_mul(hash_value, 1000003) ^ c[x]
1057        c = <uint8_t *>src.data
1058        for x from 0 <= x < src.l_data:
1059            hash_value = c_mul(hash_value, 1000003) ^ c[x]
1060
1061        return hash_value
1062
1063    cpdef to_string(self):
1064        """returns a string representation of the aligned segment.
1065
1066        The output format is valid SAM format if a header is associated
1067        with the AlignedSegment.
1068        """
1069        cdef kstring_t line
1070        line.l = line.m = 0
1071        line.s = NULL
1072
1073        if self.header:
1074            if sam_format1(self.header.ptr, self._delegate, &line) < 0:
1075                if line.m:
1076                    free(line.s)
1077                raise ValueError('sam_format failed')
1078        else:
1079            raise NotImplementedError("todo")
1080
1081        ret = force_str(line.s[:line.l])
1082
1083        if line.m:
1084            free(line.s)
1085
1086        return ret
1087
1088    @classmethod
1089    def fromstring(cls, sam, AlignmentHeader header):
1090        """parses a string representation of the aligned segment.
1091
1092        The input format should be valid SAM format.
1093
1094        Parameters
1095        ----------
1096        sam:
1097            :term:`SAM` formatted string
1098
1099        """
1100        cdef AlignedSegment dest = cls.__new__(cls)
1101        dest._delegate = <bam1_t*>calloc(1, sizeof(bam1_t))
1102        dest.header = header
1103
1104        cdef kstring_t line
1105        line.l = line.m = len(sam)
1106        _sam = force_bytes(sam)
1107        line.s = _sam
1108
1109        sam_parse1(&line, dest.header.ptr, dest._delegate)
1110
1111        return dest
1112
1113    cpdef tostring(self, htsfile=None):
1114        """deprecated, use :meth:`to_string()` instead.
1115
1116        Parameters
1117        ----------
1118
1119        htsfile:
1120            (deprecated) AlignmentFile object to map numerical
1121            identifiers to chromosome names. This parameter is present
1122            for backwards compatibility and ignored.
1123        """
1124
1125        return self.to_string()
1126
1127    def to_dict(self):
1128        """returns a json representation of the aligned segment.
1129
1130        Field names are abbreviated versions of the class attributes.
1131        """
1132        # let htslib do the string conversions, but treat optional field properly as list
1133        vals = self.to_string().split("\t")
1134        n = len(KEY_NAMES) - 1
1135        return dict(list(zip(KEY_NAMES[:-1], vals[:n])) + [(KEY_NAMES[-1], vals[n:])])
1136
1137    @classmethod
1138    def from_dict(cls, sam_dict, AlignmentHeader header):
1139        """parses a dictionary representation of the aligned segment.
1140
1141        Parameters
1142        ----------
1143        sam_dict:
1144            dictionary of alignment values, keys corresponding to output from
1145            :meth:`todict()`.
1146
1147        """
1148        # let htslib do the parsing
1149        # the tags field can be missing
1150        return cls.fromstring(
1151            "\t".join((sam_dict[x] for x in KEY_NAMES[:-1])) +
1152            "\t" +
1153            "\t".join(sam_dict.get(KEY_NAMES[-1], [])), header)
1154
1155    ########################################################
1156    ## Basic attributes in order of appearance in SAM format
1157    property query_name:
1158        """the query template name (None if not present)"""
1159        def __get__(self):
1160
1161            cdef bam1_t * src = self._delegate
1162            if src.core.l_qname == 0:
1163                return None
1164
1165            return charptr_to_str(<char *>pysam_bam_get_qname(src))
1166
1167        def __set__(self, qname):
1168
1169            if qname is None or len(qname) == 0:
1170                return
1171
1172            if len(qname) > 254:
1173                raise ValueError("query length out of range {} > 254".format(
1174                    len(qname)))
1175
1176            qname = force_bytes(qname)
1177            cdef bam1_t * src = self._delegate
1178            # the qname is \0 terminated
1179            cdef uint8_t l = len(qname) + 1
1180
1181            cdef char * p = pysam_bam_get_qname(src)
1182            cdef uint8_t l_extranul = 0
1183
1184            if l % 4 != 0:
1185                l_extranul = 4 - l % 4
1186
1187            cdef bam1_t * retval = pysam_bam_update(src,
1188                                                    src.core.l_qname,
1189                                                    l + l_extranul,
1190                                                    <uint8_t*>p)
1191            if retval == NULL:
1192                raise MemoryError("could not allocate memory")
1193
1194            src.core.l_extranul = l_extranul
1195            src.core.l_qname = l + l_extranul
1196
1197            # re-acquire pointer to location in memory
1198            # as it might have moved
1199            p = pysam_bam_get_qname(src)
1200
1201            strncpy(p, qname, l)
1202            # x might be > 255
1203            cdef uint16_t x = 0
1204
1205            for x from l <= x < l + l_extranul:
1206                p[x] = '\0'
1207
1208    property flag:
1209        """properties flag"""
1210        def __get__(self):
1211            return self._delegate.core.flag
1212        def __set__(self, flag):
1213            self._delegate.core.flag = flag
1214
1215    property reference_name:
1216        """:term:`reference` name"""
1217        def __get__(self):
1218            if self._delegate.core.tid == -1:
1219                return None
1220            if self.header:
1221                return self.header.get_reference_name(self._delegate.core.tid)
1222            else:
1223                raise ValueError("reference_name unknown if no header associated with record")
1224        def __set__(self, reference):
1225            cdef int tid
1226            if reference is None or reference == "*":
1227                self._delegate.core.tid = -1
1228            elif self.header:
1229                tid = self.header.get_tid(reference)
1230                if tid < 0:
1231                    raise ValueError("reference {} does not exist in header".format(
1232                        reference))
1233                self._delegate.core.tid = tid
1234            else:
1235                raise ValueError("reference_name can not be set if no header associated with record")
1236
1237    property reference_id:
1238        """:term:`reference` ID
1239
1240        .. note::
1241
1242            This field contains the index of the reference sequence in
1243            the sequence dictionary. To obtain the name of the
1244            reference sequence, use :meth:`get_reference_name()`
1245
1246        """
1247        def __get__(self):
1248            return self._delegate.core.tid
1249        def __set__(self, tid):
1250            if tid != -1 and self.header and not self.header.is_valid_tid(tid):
1251                raise ValueError("reference id {} does not exist in header".format(
1252                    tid))
1253            self._delegate.core.tid = tid
1254
1255    property reference_start:
1256        """0-based leftmost coordinate"""
1257        def __get__(self):
1258            return self._delegate.core.pos
1259        def __set__(self, pos):
1260            ## setting the position requires updating the "bin" attribute
1261            cdef bam1_t * src
1262            src = self._delegate
1263            src.core.pos = pos
1264            update_bin(src)
1265
1266    property mapping_quality:
1267        """mapping quality"""
1268        def __get__(self):
1269            return pysam_get_qual(self._delegate)
1270        def __set__(self, qual):
1271            pysam_set_qual(self._delegate, qual)
1272
1273    property cigarstring:
1274        '''the :term:`cigar` alignment as a string.
1275
1276        The cigar string is a string of alternating integers
1277        and characters denoting the length and the type of
1278        an operation.
1279
1280        .. note::
1281            The order length,operation is specified in the
1282            SAM format. It is different from the order of
1283            the :attr:`cigar` property.
1284
1285        Returns None if not present.
1286
1287        To unset the cigarstring, assign None or the
1288        empty string.
1289        '''
1290        def __get__(self):
1291            c = self.cigartuples
1292            if c is None:
1293                return None
1294            # reverse order
1295            else:
1296                return "".join([ "%i%c" % (y,CODE2CIGAR[x]) for x,y in c])
1297
1298        def __set__(self, cigar):
1299            if cigar is None or len(cigar) == 0:
1300                self.cigartuples = []
1301            else:
1302                parts = CIGAR_REGEX.findall(cigar)
1303                # reverse order
1304                self.cigartuples = [(CIGAR2CODE[ord(y)], int(x)) for x,y in parts]
1305
1306    # TODO
1307    # property cigar:
1308    #     """the cigar alignment"""
1309
1310    property next_reference_id:
1311        """the :term:`reference` id of the mate/next read."""
1312        def __get__(self):
1313            return self._delegate.core.mtid
1314        def __set__(self, mtid):
1315            if mtid != -1 and self.header and not self.header.is_valid_tid(mtid):
1316                raise ValueError("reference id {} does not exist in header".format(
1317                    mtid))
1318            self._delegate.core.mtid = mtid
1319
1320    property next_reference_name:
1321        """:term:`reference` name of the mate/next read (None if no
1322        AlignmentFile is associated)"""
1323        def __get__(self):
1324            if self._delegate.core.mtid == -1:
1325                return None
1326            if self.header:
1327                return self.header.get_reference_name(self._delegate.core.mtid)
1328            else:
1329                raise ValueError("next_reference_name unknown if no header associated with record")
1330
1331        def __set__(self, reference):
1332            cdef int mtid
1333            if reference is None or reference == "*":
1334                self._delegate.core.mtid = -1
1335            elif reference == "=":
1336                self._delegate.core.mtid = self._delegate.core.tid
1337            elif self.header:
1338                mtid = self.header.get_tid(reference)
1339                if mtid < 0:
1340                    raise ValueError("reference {} does not exist in header".format(
1341                        reference))
1342                self._delegate.core.mtid = mtid
1343            else:
1344                raise ValueError("next_reference_name can not be set if no header associated with record")
1345
1346    property next_reference_start:
1347        """the position of the mate/next read."""
1348        def __get__(self):
1349            return self._delegate.core.mpos
1350        def __set__(self, mpos):
1351            self._delegate.core.mpos = mpos
1352
1353    property query_length:
1354        """the length of the query/read.
1355
1356        This value corresponds to the length of the sequence supplied
1357        in the BAM/SAM file. The length of a query is 0 if there is no
1358        sequence in the BAM/SAM file. In those cases, the read length
1359        can be inferred from the CIGAR alignment, see
1360        :meth:`pysam.AlignedSegment.infer_query_length`.
1361
1362        The length includes soft-clipped bases and is equal to
1363        ``len(query_sequence)``.
1364
1365        This property is read-only but can be set by providing a
1366        sequence.
1367
1368        Returns 0 if not available.
1369
1370        """
1371        def __get__(self):
1372            return self._delegate.core.l_qseq
1373
1374    property template_length:
1375        """the observed query template length"""
1376        def __get__(self):
1377            return self._delegate.core.isize
1378        def __set__(self, isize):
1379            self._delegate.core.isize = isize
1380
1381    property query_sequence:
1382        """read sequence bases, including :term:`soft clipped` bases
1383        (None if not present).
1384
1385        Note that assigning to seq will invalidate any quality scores.
1386        Thus, to in-place edit the sequence and quality scores, copies of
1387        the quality scores need to be taken. Consider trimming for example::
1388
1389           q = read.query_qualities
1390           read.query_squence = read.query_sequence[5:10]
1391           read.query_qualities = q[5:10]
1392
1393        The sequence is returned as it is stored in the BAM file. (This will
1394        be the reverse complement of the original read sequence if the mapper
1395        has aligned the read to the reverse strand.)
1396        """
1397        def __get__(self):
1398            if self.cache_query_sequence:
1399                return self.cache_query_sequence
1400
1401            cdef bam1_t * src
1402            cdef char * s
1403            src = self._delegate
1404
1405            if src.core.l_qseq == 0:
1406                return None
1407
1408            self.cache_query_sequence = force_str(getSequenceInRange(
1409                src, 0, src.core.l_qseq))
1410            return self.cache_query_sequence
1411
1412        def __set__(self, seq):
1413            # samtools manages sequence and quality length memory together
1414            # if no quality information is present, the first byte says 0xff.
1415            cdef bam1_t * src
1416            cdef uint8_t * p
1417            cdef char * s
1418            cdef int l, k
1419            cdef Py_ssize_t nbytes_new, nbytes_old
1420
1421            if seq == None:
1422                l = 0
1423            else:
1424                l = len(seq)
1425                seq = force_bytes(seq)
1426
1427            src = self._delegate
1428
1429            # as the sequence is stored in half-bytes, the total length (sequence
1430            # plus quality scores) is (l+1)/2 + l
1431            nbytes_new = (l + 1) / 2 + l
1432            nbytes_old = (src.core.l_qseq + 1) / 2 + src.core.l_qseq
1433
1434            # acquire pointer to location in memory
1435            p = pysam_bam_get_seq(src)
1436            src.core.l_qseq = l
1437
1438            # change length of data field
1439            cdef bam1_t * retval = pysam_bam_update(src,
1440                                                    nbytes_old,
1441                                                    nbytes_new,
1442                                                    p)
1443
1444            if retval == NULL:
1445                raise MemoryError("could not allocate memory")
1446
1447            if l > 0:
1448                # re-acquire pointer to location in memory
1449                # as it might have moved
1450                p = pysam_bam_get_seq(src)
1451                for k from 0 <= k < nbytes_new:
1452                    p[k] = 0
1453                # convert to C string
1454                s = seq
1455                for k from 0 <= k < l:
1456                    p[k/2] |= seq_nt16_table[<unsigned char>s[k]] << 4 * (1 - k % 2)
1457
1458                # erase qualities
1459                p = pysam_bam_get_qual(src)
1460                p[0] = 0xff
1461
1462            self.cache_query_sequence = force_str(seq)
1463
1464            # clear cached values for quality values
1465            self.cache_query_qualities = None
1466            self.cache_query_alignment_qualities = None
1467
1468    property query_qualities:
1469        """read sequence base qualities, including :term:`soft
1470        clipped` bases (None if not present).
1471
1472        Quality scores are returned as a python array of unsigned
1473        chars. Note that this is not the ASCII-encoded value typically
1474        seen in FASTQ or SAM formatted files. Thus, no offset of 33
1475        needs to be subtracted.
1476
1477        Note that to set quality scores the sequence has to be set
1478        beforehand as this will determine the expected length of the
1479        quality score array.
1480
1481        This method raises a ValueError if the length of the
1482        quality scores and the sequence are not the same.
1483
1484        """
1485        def __get__(self):
1486
1487            if self.cache_query_qualities:
1488                return self.cache_query_qualities
1489
1490            cdef bam1_t * src
1491            cdef char * q
1492
1493            src = self._delegate
1494
1495            if src.core.l_qseq == 0:
1496                return None
1497
1498            self.cache_query_qualities = getQualitiesInRange(src, 0, src.core.l_qseq)
1499            return self.cache_query_qualities
1500
1501        def __set__(self, qual):
1502
1503            # note that memory is already allocated via setting the sequence
1504            # hence length match of sequence and quality needs is checked.
1505            cdef bam1_t * src
1506            cdef uint8_t * p
1507            cdef int l
1508
1509            src = self._delegate
1510            p = pysam_bam_get_qual(src)
1511            if qual is None or len(qual) == 0:
1512                # if absent and there is a sequence: set to 0xff
1513                if src.core.l_qseq != 0:
1514                    p[0] = 0xff
1515                return
1516
1517            # check for length match
1518            l = len(qual)
1519            if src.core.l_qseq != l:
1520                raise ValueError(
1521                    "quality and sequence mismatch: %i != %i" %
1522                    (l, src.core.l_qseq))
1523
1524            # create a python array object filling it
1525            # with the quality scores
1526
1527            # NB: should avoid this copying if qual is
1528            # already of the correct type.
1529            cdef c_array.array result = c_array.array('B', qual)
1530
1531            # copy data
1532            memcpy(p, result.data.as_voidptr, l)
1533
1534            # save in cache
1535            self.cache_query_qualities = qual
1536
1537    property bin:
1538        """properties bin"""
1539        def __get__(self):
1540            return self._delegate.core.bin
1541        def __set__(self, bin):
1542            self._delegate.core.bin = bin
1543
1544
1545    ##########################################################
1546    # Derived simple attributes. These are simple attributes of
1547    # AlignedSegment getting and setting values.
1548    ##########################################################
1549    # 1. Flags
1550    ##########################################################
1551    property is_paired:
1552        """true if read is paired in sequencing"""
1553        def __get__(self):
1554            return (self.flag & BAM_FPAIRED) != 0
1555        def __set__(self,val):
1556            pysam_update_flag(self._delegate, val, BAM_FPAIRED)
1557
1558    property is_proper_pair:
1559        """true if read is mapped in a proper pair"""
1560        def __get__(self):
1561            return (self.flag & BAM_FPROPER_PAIR) != 0
1562        def __set__(self,val):
1563            pysam_update_flag(self._delegate, val, BAM_FPROPER_PAIR)
1564    property is_unmapped:
1565        """true if read itself is unmapped"""
1566        def __get__(self):
1567            return (self.flag & BAM_FUNMAP) != 0
1568        def __set__(self, val):
1569            pysam_update_flag(self._delegate, val, BAM_FUNMAP)
1570            # setting the unmapped flag requires recalculation of
1571            # bin as alignment length is now implicitly 1
1572            update_bin(self._delegate)
1573
1574    property mate_is_unmapped:
1575        """true if the mate is unmapped"""
1576        def __get__(self):
1577            return (self.flag & BAM_FMUNMAP) != 0
1578        def __set__(self,val):
1579            pysam_update_flag(self._delegate, val, BAM_FMUNMAP)
1580    property is_reverse:
1581        """true if read is mapped to reverse strand"""
1582        def __get__(self):
1583            return (self.flag & BAM_FREVERSE) != 0
1584        def __set__(self,val):
1585            pysam_update_flag(self._delegate, val, BAM_FREVERSE)
1586    property mate_is_reverse:
1587        """true is read is mapped to reverse strand"""
1588        def __get__(self):
1589            return (self.flag & BAM_FMREVERSE) != 0
1590        def __set__(self,val):
1591            pysam_update_flag(self._delegate, val, BAM_FMREVERSE)
1592    property is_read1:
1593        """true if this is read1"""
1594        def __get__(self):
1595            return (self.flag & BAM_FREAD1) != 0
1596        def __set__(self,val):
1597            pysam_update_flag(self._delegate, val, BAM_FREAD1)
1598    property is_read2:
1599        """true if this is read2"""
1600        def __get__(self):
1601            return (self.flag & BAM_FREAD2) != 0
1602        def __set__(self, val):
1603            pysam_update_flag(self._delegate, val, BAM_FREAD2)
1604    property is_secondary:
1605        """true if not primary alignment"""
1606        def __get__(self):
1607            return (self.flag & BAM_FSECONDARY) != 0
1608        def __set__(self, val):
1609            pysam_update_flag(self._delegate, val, BAM_FSECONDARY)
1610    property is_qcfail:
1611        """true if QC failure"""
1612        def __get__(self):
1613            return (self.flag & BAM_FQCFAIL) != 0
1614        def __set__(self, val):
1615            pysam_update_flag(self._delegate, val, BAM_FQCFAIL)
1616    property is_duplicate:
1617        """true if optical or PCR duplicate"""
1618        def __get__(self):
1619            return (self.flag & BAM_FDUP) != 0
1620        def __set__(self, val):
1621            pysam_update_flag(self._delegate, val, BAM_FDUP)
1622    property is_supplementary:
1623        """true if this is a supplementary alignment"""
1624        def __get__(self):
1625            return (self.flag & BAM_FSUPPLEMENTARY) != 0
1626        def __set__(self, val):
1627            pysam_update_flag(self._delegate, val, BAM_FSUPPLEMENTARY)
1628
1629    # 2. Coordinates and lengths
1630    property reference_end:
1631        '''aligned reference position of the read on the reference genome.
1632
1633        reference_end points to one past the last aligned residue.
1634        Returns None if not available (read is unmapped or no cigar
1635        alignment present).
1636
1637        '''
1638        def __get__(self):
1639            cdef bam1_t * src
1640            src = self._delegate
1641            if (self.flag & BAM_FUNMAP) or pysam_get_n_cigar(src) == 0:
1642                return None
1643            return bam_endpos(src)
1644
1645    property reference_length:
1646        '''aligned length of the read on the reference genome.
1647
1648        This is equal to `aend - pos`. Returns None if not available.'''
1649        def __get__(self):
1650            cdef bam1_t * src
1651            src = self._delegate
1652            if (self.flag & BAM_FUNMAP) or pysam_get_n_cigar(src) == 0:
1653                return None
1654            return bam_endpos(src) - \
1655                self._delegate.core.pos
1656
1657    property query_alignment_sequence:
1658        """aligned portion of the read.
1659
1660        This is a substring of :attr:`seq` that excludes flanking
1661        bases that were :term:`soft clipped` (None if not present). It
1662        is equal to ``seq[qstart:qend]``.
1663
1664        SAM/BAM files may include extra flanking bases that are not
1665        part of the alignment.  These bases may be the result of the
1666        Smith-Waterman or other algorithms, which may not require
1667        alignments that begin at the first residue or end at the last.
1668        In addition, extra sequencing adapters, multiplex identifiers,
1669        and low-quality bases that were not considered for alignment
1670        may have been retained.
1671
1672        """
1673
1674        def __get__(self):
1675            if self.cache_query_alignment_sequence:
1676                return self.cache_query_alignment_sequence
1677
1678            cdef bam1_t * src
1679            cdef uint32_t start, end
1680
1681            src = self._delegate
1682
1683            if src.core.l_qseq == 0:
1684                return None
1685
1686            start = getQueryStart(src)
1687            end   = getQueryEnd(src)
1688
1689            self.cache_query_alignment_sequence = force_str(
1690                getSequenceInRange(src, start, end))
1691            return self.cache_query_alignment_sequence
1692
1693    property query_alignment_qualities:
1694        """aligned query sequence quality values (None if not present). These
1695        are the quality values that correspond to :attr:`query`, that
1696        is, they exclude qualities of :term:`soft clipped` bases. This
1697        is equal to ``qual[qstart:qend]``.
1698
1699        Quality scores are returned as a python array of unsigned
1700        chars. Note that this is not the ASCII-encoded value typically
1701        seen in FASTQ or SAM formatted files. Thus, no offset of 33
1702        needs to be subtracted.
1703
1704        This property is read-only.
1705
1706        """
1707        def __get__(self):
1708
1709            if self.cache_query_alignment_qualities:
1710                return self.cache_query_alignment_qualities
1711
1712            cdef bam1_t * src
1713            cdef uint32_t start, end
1714
1715            src = self._delegate
1716
1717            if src.core.l_qseq == 0:
1718                return None
1719
1720            start = getQueryStart(src)
1721            end   = getQueryEnd(src)
1722            self.cache_query_alignment_qualities = \
1723                getQualitiesInRange(src, start, end)
1724            return self.cache_query_alignment_qualities
1725
1726    property query_alignment_start:
1727        """start index of the aligned query portion of the sequence (0-based,
1728        inclusive).
1729
1730        This the index of the first base in :attr:`seq` that is not
1731        soft-clipped.
1732        """
1733        def __get__(self):
1734            return getQueryStart(self._delegate)
1735
1736    property query_alignment_end:
1737        """end index of the aligned query portion of the sequence (0-based,
1738        exclusive)
1739
1740        This the index just past the last base in :attr:`seq` that is not
1741        soft-clipped.
1742        """
1743        def __get__(self):
1744            return getQueryEnd(self._delegate)
1745
1746    property query_alignment_length:
1747        """length of the aligned query sequence.
1748
1749        This is equal to :attr:`qend` - :attr:`qstart`"""
1750        def __get__(self):
1751            cdef bam1_t * src
1752            src = self._delegate
1753            return getQueryEnd(src) - getQueryStart(src)
1754
1755    #####################################################
1756    # Computed properties
1757
1758    def get_reference_positions(self, full_length=False):
1759        """a list of reference positions that this read aligns to.
1760
1761        By default, this method only returns positions in the
1762        reference that are within the alignment. If *full_length* is
1763        set, None values will be included for any soft-clipped or
1764        unaligned positions within the read. The returned list will
1765        thus be of the same length as the read.
1766
1767        """
1768        cdef uint32_t k, i, l, pos
1769        cdef int op
1770        cdef uint32_t * cigar_p
1771        cdef bam1_t * src
1772        cdef bint _full = full_length
1773
1774        src = self._delegate
1775        if pysam_get_n_cigar(src) == 0:
1776            return []
1777
1778        result = []
1779        pos = src.core.pos
1780        cigar_p = pysam_bam_get_cigar(src)
1781
1782        for k from 0 <= k < pysam_get_n_cigar(src):
1783            op = cigar_p[k] & BAM_CIGAR_MASK
1784            l = cigar_p[k] >> BAM_CIGAR_SHIFT
1785
1786            if op == BAM_CSOFT_CLIP or op == BAM_CINS:
1787                if _full:
1788                    for i from 0 <= i < l:
1789                        result.append(None)
1790            elif op == BAM_CMATCH or op == BAM_CEQUAL or op == BAM_CDIFF:
1791                for i from pos <= i < pos + l:
1792                    result.append(i)
1793                pos += l
1794            elif op == BAM_CDEL or op == BAM_CREF_SKIP:
1795                pos += l
1796
1797        return result
1798
1799    def infer_query_length(self, always=False):
1800        """infer query length from CIGAR alignment.
1801
1802        This method deduces the query length from the CIGAR alignment
1803        but does not include hard-clipped bases.
1804
1805        Returns None if CIGAR alignment is not present.
1806
1807        If *always* is set to True, `infer_read_length` is used instead.
1808        This is deprecated and only present for backward compatibility.
1809        """
1810        if always is True:
1811            return self.infer_read_length()
1812        cdef int32_t l = calculateQueryLengthWithoutHardClipping(self._delegate)
1813        if l > 0:
1814            return l
1815        else:
1816            return None
1817
1818    def infer_read_length(self):
1819        """infer read length from CIGAR alignment.
1820
1821        This method deduces the read length from the CIGAR alignment
1822        including hard-clipped bases.
1823
1824        Returns None if CIGAR alignment is not present.
1825        """
1826        cdef int32_t l = calculateQueryLengthWithHardClipping(self._delegate)
1827        if l > 0:
1828            return l
1829        else:
1830            return None
1831
1832    def get_reference_sequence(self):
1833        """return the reference sequence in the region that is covered by the
1834        alignment of the read to the reference.
1835
1836        This method requires the MD tag to be set.
1837
1838        """
1839        return force_str(build_reference_sequence(self._delegate))
1840
1841    def get_forward_sequence(self):
1842        """return the original read sequence.
1843
1844        Reads mapped to the reverse strand are stored reverse complemented in
1845        the BAM file. This method returns such reads reverse complemented back
1846        to their original orientation.
1847
1848        Returns None if the record has no query sequence.
1849        """
1850        if self.query_sequence is None:
1851            return None
1852        s = force_str(self.query_sequence)
1853        if self.is_reverse:
1854            s = s.translate(maketrans("ACGTacgtNnXx", "TGCAtgcaNnXx"))[::-1]
1855        return s
1856
1857    def get_forward_qualities(self):
1858        """return the original base qualities of the read sequence,
1859        in the same format as the :attr:`query_qualities` property.
1860
1861        Reads mapped to the reverse strand have their base qualities stored
1862        reversed in the BAM file. This method returns such reads' base qualities
1863        reversed back to their original orientation.
1864        """
1865        if self.is_reverse:
1866            return self.query_qualities[::-1]
1867        else:
1868            return self.query_qualities
1869
1870
1871    def get_aligned_pairs(self, matches_only=False, with_seq=False):
1872        """a list of aligned read (query) and reference positions.
1873
1874        For inserts, deletions, skipping either query or reference
1875        position may be None.
1876
1877        Padding is currently not supported and leads to an exception.
1878
1879        Parameters
1880        ----------
1881
1882        matches_only : bool
1883          If True, only matched bases are returned - no None on either
1884          side.
1885        with_seq : bool
1886          If True, return a third element in the tuple containing the
1887          reference sequence. Substitutions are lower-case. This option
1888          requires an MD tag to be present.
1889
1890        Returns
1891        -------
1892
1893        aligned_pairs : list of tuples
1894
1895        """
1896        cdef uint32_t k, i, pos, qpos, r_idx, l
1897        cdef int op
1898        cdef uint32_t * cigar_p
1899        cdef bam1_t * src = self._delegate
1900        cdef bint _matches_only = bool(matches_only)
1901        cdef bint _with_seq = bool(with_seq)
1902
1903        # TODO: this method performs no checking and assumes that
1904        # read sequence, cigar and MD tag are consistent.
1905
1906        if _with_seq:
1907            # force_str required for py2/py3 compatibility
1908            ref_seq = force_str(build_reference_sequence(src))
1909            if ref_seq is None:
1910                raise ValueError("MD tag not present")
1911
1912        r_idx = 0
1913
1914        if pysam_get_n_cigar(src) == 0:
1915            return []
1916
1917        result = []
1918        pos = src.core.pos
1919        qpos = 0
1920        cigar_p = pysam_bam_get_cigar(src)
1921        for k from 0 <= k < pysam_get_n_cigar(src):
1922            op = cigar_p[k] & BAM_CIGAR_MASK
1923            l = cigar_p[k] >> BAM_CIGAR_SHIFT
1924
1925            if op == BAM_CMATCH or op == BAM_CEQUAL or op == BAM_CDIFF:
1926                if _with_seq:
1927                    for i from pos <= i < pos + l:
1928                        result.append((qpos, i, ref_seq[r_idx]))
1929                        r_idx += 1
1930                        qpos += 1
1931                else:
1932                    for i from pos <= i < pos + l:
1933                        result.append((qpos, i))
1934                        qpos += 1
1935                pos += l
1936
1937            elif op == BAM_CINS or op == BAM_CSOFT_CLIP:
1938                if not _matches_only:
1939                    if _with_seq:
1940                        for i from pos <= i < pos + l:
1941                            result.append((qpos, None, None))
1942                            qpos += 1
1943                    else:
1944                        for i from pos <= i < pos + l:
1945                            result.append((qpos, None))
1946                            qpos += 1
1947                else:
1948                    qpos += l
1949
1950            elif op == BAM_CDEL:
1951                if not _matches_only:
1952                    if _with_seq:
1953                        for i from pos <= i < pos + l:
1954                            result.append((None, i, ref_seq[r_idx]))
1955                            r_idx += 1
1956                    else:
1957                        for i from pos <= i < pos + l:
1958                            result.append((None, i))
1959                else:
1960                    r_idx += l
1961                pos += l
1962
1963            elif op == BAM_CHARD_CLIP:
1964                pass # advances neither
1965
1966            elif op == BAM_CREF_SKIP:
1967                if not _matches_only:
1968                    if _with_seq:
1969                        for i from pos <= i < pos + l:
1970                            result.append((None, i, None))
1971                    else:
1972                        for i from pos <= i < pos + l:
1973                            result.append((None, i))
1974
1975                pos += l
1976
1977            elif op == BAM_CPAD:
1978                raise NotImplementedError(
1979                    "Padding (BAM_CPAD, 6) is currently not supported. "
1980                    "Please implement. Sorry about that.")
1981
1982        return result
1983
1984    def get_blocks(self):
1985        """ a list of start and end positions of
1986        aligned gapless blocks.
1987
1988        The start and end positions are in genomic
1989        coordinates.
1990
1991        Blocks are not normalized, i.e. two blocks
1992        might be directly adjacent. This happens if
1993        the two blocks are separated by an insertion
1994        in the read.
1995        """
1996
1997        cdef uint32_t k, pos, l
1998        cdef int op
1999        cdef uint32_t * cigar_p
2000        cdef bam1_t * src
2001
2002        src = self._delegate
2003        if pysam_get_n_cigar(src) == 0:
2004            return []
2005
2006        result = []
2007        pos = src.core.pos
2008        cigar_p = pysam_bam_get_cigar(src)
2009        l = 0
2010
2011        for k from 0 <= k < pysam_get_n_cigar(src):
2012            op = cigar_p[k] & BAM_CIGAR_MASK
2013            l = cigar_p[k] >> BAM_CIGAR_SHIFT
2014            if op == BAM_CMATCH or op == BAM_CEQUAL or op == BAM_CDIFF:
2015                result.append((pos, pos + l))
2016                pos += l
2017            elif op == BAM_CDEL or op == BAM_CREF_SKIP:
2018                pos += l
2019
2020        return result
2021
2022    def get_overlap(self, uint32_t start, uint32_t end):
2023        """return number of aligned bases of read overlapping the interval
2024        *start* and *end* on the reference sequence.
2025
2026        Return None if cigar alignment is not available.
2027        """
2028        cdef uint32_t k, i, pos, overlap
2029        cdef int op, o
2030        cdef uint32_t * cigar_p
2031        cdef bam1_t * src
2032
2033        overlap = 0
2034
2035        src = self._delegate
2036        if pysam_get_n_cigar(src) == 0:
2037            return None
2038        pos = src.core.pos
2039        o = 0
2040
2041        cigar_p = pysam_bam_get_cigar(src)
2042        for k from 0 <= k < pysam_get_n_cigar(src):
2043            op = cigar_p[k] & BAM_CIGAR_MASK
2044            l = cigar_p[k] >> BAM_CIGAR_SHIFT
2045
2046            if op == BAM_CMATCH or op == BAM_CEQUAL or op == BAM_CDIFF:
2047                o = min( pos + l, end) - max( pos, start )
2048                if o > 0: overlap += o
2049
2050            if op == BAM_CMATCH or op == BAM_CDEL or op == BAM_CREF_SKIP or op == BAM_CEQUAL or op == BAM_CDIFF:
2051                pos += l
2052
2053        return overlap
2054
2055    def get_cigar_stats(self):
2056        """summary of operations in cigar string.
2057
2058        The output order in the array is "MIDNSHP=X" followed by a
2059        field for the NM tag. If the NM tag is not present, this
2060        field will always be 0.
2061
2062        +-----+--------------+-----+
2063        |M    |BAM_CMATCH    |0    |
2064        +-----+--------------+-----+
2065        |I    |BAM_CINS      |1    |
2066        +-----+--------------+-----+
2067        |D    |BAM_CDEL      |2    |
2068        +-----+--------------+-----+
2069        |N    |BAM_CREF_SKIP |3    |
2070        +-----+--------------+-----+
2071        |S    |BAM_CSOFT_CLIP|4    |
2072        +-----+--------------+-----+
2073        |H    |BAM_CHARD_CLIP|5    |
2074        +-----+--------------+-----+
2075        |P    |BAM_CPAD      |6    |
2076        +-----+--------------+-----+
2077        |=    |BAM_CEQUAL    |7    |
2078        +-----+--------------+-----+
2079        |X    |BAM_CDIFF     |8    |
2080        +-----+--------------+-----+
2081        |B    |BAM_CBACK     |9    |
2082        +-----+--------------+-----+
2083        |NM   |NM tag        |10   |
2084        +-----+--------------+-----+
2085
2086        If no cigar string is present, empty arrays will be returned.
2087
2088        Returns:
2089            arrays :
2090                two arrays. The first contains the nucleotide counts within
2091                each cigar operation, the second contains the number of blocks
2092                for each cigar operation.
2093
2094        """
2095
2096        cdef int nfields = NCIGAR_CODES + 1
2097
2098        cdef c_array.array base_counts = array.array(
2099            "I",
2100            [0] * nfields)
2101        cdef uint32_t [:] base_view = base_counts
2102        cdef c_array.array block_counts = array.array(
2103            "I",
2104            [0] * nfields)
2105        cdef uint32_t [:] block_view = block_counts
2106
2107        cdef bam1_t * src = self._delegate
2108        cdef int op
2109        cdef uint32_t l
2110        cdef int32_t k
2111        cdef uint32_t * cigar_p = pysam_bam_get_cigar(src)
2112
2113        if cigar_p == NULL:
2114            return None
2115
2116        for k from 0 <= k < pysam_get_n_cigar(src):
2117            op = cigar_p[k] & BAM_CIGAR_MASK
2118            l = cigar_p[k] >> BAM_CIGAR_SHIFT
2119            base_view[op] += l
2120            block_view[op] += 1
2121
2122        cdef uint8_t * v = bam_aux_get(src, 'NM')
2123        if v != NULL:
2124            base_view[nfields - 1] = <int32_t>bam_aux2i(v)
2125
2126        return base_counts, block_counts
2127
2128    #####################################################
2129    ## Unsorted as yet
2130    # TODO: capture in CIGAR object
2131    property cigartuples:
2132        """the :term:`cigar` alignment. The alignment
2133        is returned as a list of tuples of (operation, length).
2134
2135        If the alignment is not present, None is returned.
2136
2137        The operations are:
2138
2139        +-----+--------------+-----+
2140        |M    |BAM_CMATCH    |0    |
2141        +-----+--------------+-----+
2142        |I    |BAM_CINS      |1    |
2143        +-----+--------------+-----+
2144        |D    |BAM_CDEL      |2    |
2145        +-----+--------------+-----+
2146        |N    |BAM_CREF_SKIP |3    |
2147        +-----+--------------+-----+
2148        |S    |BAM_CSOFT_CLIP|4    |
2149        +-----+--------------+-----+
2150        |H    |BAM_CHARD_CLIP|5    |
2151        +-----+--------------+-----+
2152        |P    |BAM_CPAD      |6    |
2153        +-----+--------------+-----+
2154        |=    |BAM_CEQUAL    |7    |
2155        +-----+--------------+-----+
2156        |X    |BAM_CDIFF     |8    |
2157        +-----+--------------+-----+
2158        |B    |BAM_CBACK     |9    |
2159        +-----+--------------+-----+
2160
2161        .. note::
2162            The output is a list of (operation, length) tuples, such as
2163            ``[(0, 30)]``.
2164            This is different from the SAM specification and
2165            the :attr:`cigarstring` property, which uses a
2166            (length, operation) order, for example: ``30M``.
2167
2168        To unset the cigar property, assign an empty list
2169        or None.
2170        """
2171        def __get__(self):
2172            cdef uint32_t * cigar_p
2173            cdef bam1_t * src
2174            cdef uint32_t op, l
2175            cdef uint32_t k
2176
2177            src = self._delegate
2178            if pysam_get_n_cigar(src) == 0:
2179                return None
2180
2181            cigar = []
2182
2183            cigar_p = pysam_bam_get_cigar(src);
2184            for k from 0 <= k < pysam_get_n_cigar(src):
2185                op = cigar_p[k] & BAM_CIGAR_MASK
2186                l = cigar_p[k] >> BAM_CIGAR_SHIFT
2187                cigar.append((op, l))
2188            return cigar
2189
2190        def __set__(self, values):
2191            cdef uint32_t * p
2192            cdef bam1_t * src
2193            cdef op, l
2194            cdef int k
2195
2196            k = 0
2197
2198            src = self._delegate
2199
2200            # get location of cigar string
2201            p = pysam_bam_get_cigar(src)
2202
2203            # empty values for cigar string
2204            if values is None:
2205                values = []
2206
2207            cdef uint32_t ncigar = len(values)
2208
2209            cdef bam1_t * retval = pysam_bam_update(src,
2210                                                    pysam_get_n_cigar(src) * 4,
2211                                                    ncigar * 4,
2212                                                    <uint8_t*>p)
2213
2214            if retval == NULL:
2215                raise MemoryError("could not allocate memory")
2216
2217            # length is number of cigar operations, not bytes
2218            pysam_set_n_cigar(src, ncigar)
2219
2220            # re-acquire pointer to location in memory
2221            # as it might have moved
2222            p = pysam_bam_get_cigar(src)
2223
2224            # insert cigar operations
2225            for op, l in values:
2226                p[k] = l << BAM_CIGAR_SHIFT | op
2227                k += 1
2228
2229            ## setting the cigar string requires updating the bin
2230            update_bin(src)
2231
2232    cpdef set_tag(self,
2233                  tag,
2234                  value,
2235                  value_type=None,
2236                  replace=True):
2237        """sets a particular field *tag* to *value* in the optional alignment
2238        section.
2239
2240        *value_type* describes the type of *value* that is to entered
2241        into the alignment record. It can be set explicitly to one of
2242        the valid one-letter type codes. If unset, an appropriate type
2243        will be chosen automatically based on the python type of
2244        *value*.
2245
2246        An existing value of the same *tag* will be overwritten unless
2247        *replace* is set to False. This is usually not recommended as a
2248        tag may only appear once in the optional alignment section.
2249
2250        If *value* is None, the tag will be deleted.
2251
2252        This method accepts valid SAM specification value types, which
2253        are::
2254
2255           A: printable char
2256           i: signed int
2257           f: float
2258           Z: printable string
2259           H: Byte array in hex format
2260           B: Integer or numeric array
2261
2262        Additionally, it will accept the integer BAM types ('cCsSI')
2263
2264        For htslib compatibility, 'a' is synonymous with 'A' and the
2265        method accepts a 'd' type code for a double precision float.
2266
2267        When deducing the type code by the python type of *value*, the
2268        following mapping is applied::
2269
2270            i: python int
2271            f: python float
2272            Z: python str or bytes
2273            B: python array.array, list or tuple
2274
2275        Note that a single character string will be output as 'Z' and
2276        not 'A' as the former is the more general type.
2277        """
2278
2279        cdef int value_size
2280        cdef uint8_t tc
2281        cdef uint8_t * value_ptr
2282        cdef uint8_t *existing_ptr
2283        cdef float float_value
2284        cdef double double_value
2285        cdef int32_t int32_t_value
2286        cdef uint32_t uint32_t_value
2287        cdef int16_t int16_t_value
2288        cdef uint16_t uint16_t_value
2289        cdef int8_t int8_t_value
2290        cdef uint8_t uint8_t_value
2291        cdef bam1_t * src = self._delegate
2292        cdef char * _value_type
2293        cdef c_array.array array_value
2294        cdef object buffer
2295
2296        if len(tag) != 2:
2297            raise ValueError('Invalid tag: %s' % tag)
2298
2299        tag = force_bytes(tag)
2300        if replace:
2301            existing_ptr = bam_aux_get(src, tag)
2302            if existing_ptr:
2303                bam_aux_del(src, existing_ptr)
2304
2305        # setting value to None deletes a tag
2306        if value is None:
2307            return
2308
2309        cdef uint8_t typecode = get_tag_typecode(value, value_type)
2310        if typecode == 0:
2311            raise ValueError("can't guess type or invalid type code specified: {} {}".format(
2312                value, value_type))
2313
2314        # sam_format1 for typecasting
2315        if typecode == 'Z':
2316            value = force_bytes(value)
2317            value_ptr = <uint8_t*><char*>value
2318            value_size = len(value)+1
2319        elif typecode == 'H':
2320            # Note that hex tags are stored the very same
2321            # way as Z string.s
2322            value = force_bytes(value)
2323            value_ptr = <uint8_t*><char*>value
2324            value_size = len(value)+1
2325        elif typecode == 'A' or typecode == 'a':
2326            value = force_bytes(value)
2327            value_ptr = <uint8_t*><char*>value
2328            value_size = sizeof(char)
2329            typecode = 'A'
2330        elif typecode == 'i':
2331            int32_t_value = value
2332            value_ptr = <uint8_t*>&int32_t_value
2333            value_size = sizeof(int32_t)
2334        elif typecode == 'I':
2335            uint32_t_value = value
2336            value_ptr = <uint8_t*>&uint32_t_value
2337            value_size = sizeof(uint32_t)
2338        elif typecode == 's':
2339            int16_t_value = value
2340            value_ptr = <uint8_t*>&int16_t_value
2341            value_size = sizeof(int16_t)
2342        elif typecode == 'S':
2343            uint16_t_value = value
2344            value_ptr = <uint8_t*>&uint16_t_value
2345            value_size = sizeof(uint16_t)
2346        elif typecode == 'c':
2347            int8_t_value = value
2348            value_ptr = <uint8_t*>&int8_t_value
2349            value_size = sizeof(int8_t)
2350        elif typecode == 'C':
2351            uint8_t_value = value
2352            value_ptr = <uint8_t*>&uint8_t_value
2353            value_size = sizeof(uint8_t)
2354        elif typecode == 'd':
2355            double_value = value
2356            value_ptr = <uint8_t*>&double_value
2357            value_size = sizeof(double)
2358        elif typecode == 'f':
2359            float_value  = value
2360            value_ptr = <uint8_t*>&float_value
2361            value_size = sizeof(float)
2362        elif typecode == 'B':
2363            # the following goes through python, needs to be cleaned up
2364            # pack array using struct
2365            fmt, args = pack_tags([(tag, value, value_type)])
2366
2367            # remove tag and type code as set by bam_aux_append
2368            # first four chars of format (<2sB)
2369            fmt = '<' + fmt[4:]
2370            # first two values to pack
2371            args = args[2:]
2372            value_size = struct.calcsize(fmt)
2373            # buffer will be freed when object goes out of scope
2374            buffer = ctypes.create_string_buffer(value_size)
2375            struct.pack_into(fmt, buffer, 0, *args)
2376            # bam_aux_append copies data from value_ptr
2377            bam_aux_append(src,
2378                           tag,
2379                           typecode,
2380                           value_size,
2381                           <uint8_t*>buffer.raw)
2382            return
2383        else:
2384            raise ValueError('unsupported value_type {} in set_option'.format(typecode))
2385
2386        bam_aux_append(src,
2387                       tag,
2388                       typecode,
2389                       value_size,
2390                       value_ptr)
2391
2392    cpdef has_tag(self, tag):
2393        """returns true if the optional alignment section
2394        contains a given *tag*."""
2395        cdef uint8_t * v
2396        cdef int nvalues
2397        btag = force_bytes(tag)
2398        v = bam_aux_get(self._delegate, btag)
2399        return v != NULL
2400
2401    cpdef get_tag(self, tag, with_value_type=False):
2402        """
2403        retrieves data from the optional alignment section
2404        given a two-letter *tag* denoting the field.
2405
2406        The returned value is cast into an appropriate python type.
2407
2408        This method is the fastest way to access the optional
2409        alignment section if only few tags need to be retrieved.
2410
2411        Possible value types are "AcCsSiIfZHB" (see BAM format
2412        specification) as well as additional value type 'd' as
2413        implemented in htslib.
2414
2415        Parameters:
2416
2417            tag :
2418                data tag.
2419
2420            with_value_type : Optional[bool]
2421                if set to True, the return value is a tuple of (tag value, type
2422                code). (default False)
2423
2424        Returns:
2425
2426            A python object with the value of the `tag`. The type of the
2427            object depends on the data type in the data record.
2428
2429        Raises:
2430
2431            KeyError
2432                If `tag` is not present, a KeyError is raised.
2433
2434        """
2435        cdef uint8_t * v
2436        cdef int nvalues
2437        btag = force_bytes(tag)
2438        v = bam_aux_get(self._delegate, btag)
2439        if v == NULL:
2440            raise KeyError("tag '%s' not present" % tag)
2441        if chr(v[0]) == "B":
2442            auxtype = chr(v[0]) + chr(v[1])
2443        else:
2444            auxtype = chr(v[0])
2445
2446        if auxtype in "iIcCsS":
2447            value = bam_aux2i(v)
2448        elif auxtype == 'f' or auxtype == 'F':
2449            value = bam_aux2f(v)
2450        elif auxtype == 'd' or auxtype == 'D':
2451            value = bam_aux2f(v)
2452        elif auxtype == 'A' or auxtype == 'a':
2453            # force A to a
2454            v[0] = 'A'
2455            # there might a more efficient way
2456            # to convert a char into a string
2457            value = '%c' % <char>bam_aux2A(v)
2458        elif auxtype == 'Z' or auxtype == 'H':
2459            # Z and H are treated equally as strings in htslib
2460            value = charptr_to_str(<char*>bam_aux2Z(v))
2461        elif auxtype[0] == 'B':
2462            bytesize, nvalues, values = convert_binary_tag(v + 1)
2463            value = values
2464        else:
2465            raise ValueError("unknown auxiliary type '%s'" % auxtype)
2466
2467        if with_value_type:
2468            return (value, auxtype)
2469        else:
2470            return value
2471
2472    def get_tags(self, with_value_type=False):
2473        """the fields in the optional alignment section.
2474
2475        Returns a list of all fields in the optional
2476        alignment section. Values are converted to appropriate python
2477        values. For example:
2478
2479        [(NM, 2), (RG, "GJP00TM04")]
2480
2481        If *with_value_type* is set, the value type as encode in
2482        the AlignedSegment record will be returned as well:
2483
2484        [(NM, 2, "i"), (RG, "GJP00TM04", "Z")]
2485
2486        This method will convert all values in the optional alignment
2487        section. When getting only one or few tags, please see
2488        :meth:`get_tag` for a quicker way to achieve this.
2489
2490        """
2491
2492        cdef char * ctag
2493        cdef bam1_t * src
2494        cdef uint8_t * s
2495        cdef char auxtag[3]
2496        cdef char auxtype
2497        cdef uint8_t byte_size
2498        cdef int32_t nvalues
2499
2500        src = self._delegate
2501        if src.l_data == 0:
2502            return []
2503        s = pysam_bam_get_aux(src)
2504        result = []
2505        auxtag[2] = 0
2506        while s < (src.data + src.l_data):
2507            # get tag
2508            auxtag[0] = s[0]
2509            auxtag[1] = s[1]
2510            s += 2
2511            auxtype = s[0]
2512            if auxtype in ('c', 'C'):
2513                value = <int>bam_aux2i(s)
2514                s += 1
2515            elif auxtype in ('s', 'S'):
2516                value = <int>bam_aux2i(s)
2517                s += 2
2518            elif auxtype in ('i', 'I'):
2519                value = <int32_t>bam_aux2i(s)
2520                s += 4
2521            elif auxtype == 'f':
2522                value = <float>bam_aux2f(s)
2523                s += 4
2524            elif auxtype == 'd':
2525                value = <double>bam_aux2f(s)
2526                s += 8
2527            elif auxtype in ('A', 'a'):
2528                value = "%c" % <char>bam_aux2A(s)
2529                s += 1
2530            elif auxtype in ('Z', 'H'):
2531                value = charptr_to_str(<char*>bam_aux2Z(s))
2532                # +1 for NULL terminated string
2533                s += len(value) + 1
2534            elif auxtype == 'B':
2535                s += 1
2536                byte_size, nvalues, value = convert_binary_tag(s)
2537                # 5 for 1 char and 1 int
2538                s += 5 + (nvalues * byte_size) - 1
2539            else:
2540                raise KeyError("unknown type '%s'" % auxtype)
2541
2542            s += 1
2543
2544            if with_value_type:
2545                result.append((charptr_to_str(auxtag), value, chr(auxtype)))
2546            else:
2547                result.append((charptr_to_str(auxtag), value))
2548
2549        return result
2550
2551    def set_tags(self, tags):
2552        """sets the fields in the optional alignment section with
2553        a list of (tag, value) tuples.
2554
2555        The :term:`value type` of the values is determined from the
2556        python type. Optionally, a type may be given explicitly as
2557        a third value in the tuple, For example:
2558
2559        x.set_tags([(NM, 2, "i"), (RG, "GJP00TM04", "Z")]
2560
2561        This method will not enforce the rule that the same tag may appear
2562        only once in the optional alignment section.
2563        """
2564
2565        cdef bam1_t * src
2566        cdef uint8_t * s
2567        cdef char * temp
2568        cdef int new_size = 0
2569        cdef int old_size
2570        src = self._delegate
2571
2572        # convert and pack the data
2573        if tags is not None and len(tags) > 0:
2574            fmt, args = pack_tags(tags)
2575            new_size = struct.calcsize(fmt)
2576            buffer = ctypes.create_string_buffer(new_size)
2577            struct.pack_into(fmt,
2578                             buffer,
2579                             0,
2580                             *args)
2581
2582
2583        # delete the old data and allocate new space.
2584        # If total_size == 0, the aux field will be
2585        # empty
2586        old_size = pysam_bam_get_l_aux(src)
2587        cdef bam1_t * retval = pysam_bam_update(src,
2588                                                old_size,
2589                                                new_size,
2590                                                pysam_bam_get_aux(src))
2591        if retval == NULL:
2592            raise MemoryError("could not allocated memory")
2593
2594        # copy data only if there is any
2595        if new_size > 0:
2596
2597            # get location of new data
2598            s = pysam_bam_get_aux(src)
2599
2600            # check if there is direct path from buffer.raw to tmp
2601            p = buffer.raw
2602            # create handle to make sure buffer stays alive long
2603            # enough for memcpy, see issue 129
2604            temp = p
2605            memcpy(s, temp, new_size)
2606
2607
2608    ########################################################
2609    # Compatibility Accessors
2610    # Functions, properties for compatibility with pysam < 0.8
2611    #
2612    # Several options
2613    #     change the factory functions according to API
2614    #         * requires code changes throughout, incl passing
2615    #           handles to factory functions
2616    #     subclass functions and add attributes at runtime
2617    #         e.g.: AlignedSegments.qname = AlignedSegments.query_name
2618    #         * will slow down the default interface
2619    #     explicit declaration of getters/setters
2620    ########################################################
2621    property qname:
2622        """deprecated, use query_name instead"""
2623        def __get__(self): return self.query_name
2624        def __set__(self, v): self.query_name = v
2625    property tid:
2626        """deprecated, use reference_id instead"""
2627        def __get__(self): return self.reference_id
2628        def __set__(self, v): self.reference_id = v
2629    property pos:
2630        """deprecated, use reference_start instead"""
2631        def __get__(self): return self.reference_start
2632        def __set__(self, v): self.reference_start = v
2633    property mapq:
2634        """deprecated, use mapping_quality instead"""
2635        def __get__(self): return self.mapping_quality
2636        def __set__(self, v): self.mapping_quality = v
2637    property rnext:
2638        """deprecated, use next_reference_id instead"""
2639        def __get__(self): return self.next_reference_id
2640        def __set__(self, v): self.next_reference_id = v
2641    property pnext:
2642        """deprecated, use next_reference_start instead"""
2643        def __get__(self):
2644            return self.next_reference_start
2645        def __set__(self, v):
2646            self.next_reference_start = v
2647    property cigar:
2648        """deprecated, use cigartuples instead"""
2649        def __get__(self):
2650            r = self.cigartuples
2651            if r is None:
2652                r = []
2653            return r
2654        def __set__(self, v): self.cigartuples = v
2655    property tlen:
2656        """deprecated, use template_length instead"""
2657        def __get__(self):
2658            return self.template_length
2659        def __set__(self, v):
2660            self.template_length = v
2661    property seq:
2662        """deprecated, use query_sequence instead"""
2663        def __get__(self):
2664            return self.query_sequence
2665        def __set__(self, v):
2666            self.query_sequence = v
2667    property qual:
2668        """deprecated, query_qualities instead"""
2669        def __get__(self):
2670            return array_to_qualitystring(self.query_qualities)
2671        def __set__(self, v):
2672            self.query_qualities = qualitystring_to_array(v)
2673    property alen:
2674        """deprecated, reference_length instead"""
2675        def __get__(self):
2676            return self.reference_length
2677        def __set__(self, v):
2678            self.reference_length = v
2679    property aend:
2680        """deprecated, reference_end instead"""
2681        def __get__(self):
2682            return self.reference_end
2683        def __set__(self, v):
2684            self.reference_end = v
2685    property rlen:
2686        """deprecated, query_length instead"""
2687        def __get__(self):
2688            return self.query_length
2689        def __set__(self, v):
2690            self.query_length = v
2691    property query:
2692        """deprecated, query_alignment_sequence instead"""
2693        def __get__(self):
2694            return self.query_alignment_sequence
2695        def __set__(self, v):
2696            self.query_alignment_sequence = v
2697    property qqual:
2698        """deprecated, query_alignment_qualities instead"""
2699        def __get__(self):
2700            return array_to_qualitystring(self.query_alignment_qualities)
2701        def __set__(self, v):
2702            self.query_alignment_qualities = qualitystring_to_array(v)
2703    property qstart:
2704        """deprecated, use query_alignment_start instead"""
2705        def __get__(self):
2706            return self.query_alignment_start
2707        def __set__(self, v):
2708            self.query_alignment_start = v
2709    property qend:
2710        """deprecated, use query_alignment_end instead"""
2711        def __get__(self):
2712            return self.query_alignment_end
2713        def __set__(self, v):
2714            self.query_alignment_end = v
2715    property qlen:
2716        """deprecated, use query_alignment_length instead"""
2717        def __get__(self):
2718            return self.query_alignment_length
2719        def __set__(self, v):
2720            self.query_alignment_length = v
2721    property mrnm:
2722        """deprecated, use next_reference_id instead"""
2723        def __get__(self):
2724            return self.next_reference_id
2725        def __set__(self, v):
2726            self.next_reference_id = v
2727    property mpos:
2728        """deprecated, use next_reference_start instead"""
2729        def __get__(self):
2730            return self.next_reference_start
2731        def __set__(self, v):
2732            self.next_reference_start = v
2733    property rname:
2734        """deprecated, use reference_id instead"""
2735        def __get__(self):
2736            return self.reference_id
2737        def __set__(self, v):
2738            self.reference_id = v
2739    property isize:
2740        """deprecated, use template_length instead"""
2741        def __get__(self):
2742            return self.template_length
2743        def __set__(self, v):
2744            self.template_length = v
2745    property blocks:
2746        """deprecated, use get_blocks() instead"""
2747        def __get__(self):
2748            return self.get_blocks()
2749    property aligned_pairs:
2750        """deprecated, use get_aligned_pairs() instead"""
2751        def __get__(self):
2752            return self.get_aligned_pairs()
2753    property inferred_length:
2754        """deprecated, use infer_query_length() instead"""
2755        def __get__(self):
2756            return self.infer_query_length()
2757    property positions:
2758        """deprecated, use get_reference_positions() instead"""
2759        def __get__(self):
2760            return self.get_reference_positions()
2761    property tags:
2762        """deprecated, use get_tags() instead"""
2763        def __get__(self):
2764            return self.get_tags()
2765        def __set__(self, tags):
2766            self.set_tags(tags)
2767    def overlap(self):
2768        """deprecated, use get_overlap() instead"""
2769        return self.get_overlap()
2770    def opt(self, tag):
2771        """deprecated, use get_tag() instead"""
2772        return self.get_tag(tag)
2773    def setTag(self, tag, value, value_type=None, replace=True):
2774        """deprecated, use set_tag() instead"""
2775        return self.set_tag(tag, value, value_type, replace)
2776
2777
2778cdef class PileupColumn:
2779    '''A pileup of reads at a particular reference sequence position
2780    (:term:`column`). A pileup column contains all the reads that map
2781    to a certain target base.
2782
2783    This class is a proxy for results returned by the samtools pileup
2784    engine.  If the underlying engine iterator advances, the results
2785    of this column will change.
2786    '''
2787    def __init__(self):
2788        raise TypeError("this class cannot be instantiated from Python")
2789
2790    def __str__(self):
2791        return "\t".join(map(str,
2792                              (self.reference_id,
2793                               self.reference_pos,
2794                               self.nsegments))) +\
2795            "\n" +\
2796            "\n".join(map(str, self.pileups))
2797
2798    def __dealloc__(self):
2799        free(self.buf.s)
2800
2801    def set_min_base_quality(self, min_base_quality):
2802        """set the minimum base quality for this pileup column.
2803        """
2804        self.min_base_quality = min_base_quality
2805
2806    def __len__(self):
2807        """return number of reads aligned to this column.
2808
2809        see :meth:`get_num_aligned`
2810        """
2811        return self.get_num_aligned()
2812
2813    property reference_id:
2814        '''the reference sequence number as defined in the header'''
2815        def __get__(self):
2816            return self.tid
2817
2818    property reference_name:
2819        """:term:`reference` name (None if no AlignmentFile is associated)"""
2820        def __get__(self):
2821            if self.header is not None:
2822                return self.header.get_reference_name(self.tid)
2823            return None
2824
2825    property nsegments:
2826        '''number of reads mapping to this column.
2827
2828        Note that this number ignores the base quality filter.'''
2829        def __get__(self):
2830            return self.n_pu
2831        def __set__(self, n):
2832            self.n_pu = n
2833
2834    property reference_pos:
2835        '''the position in the reference sequence (0-based).'''
2836        def __get__(self):
2837            return self.pos
2838
2839    property pileups:
2840        '''list of reads (:class:`pysam.PileupRead`) aligned to this column'''
2841        def __get__(self):
2842            if self.plp == NULL or self.plp[0] == NULL:
2843                raise ValueError("PileupColumn accessed after iterator finished")
2844
2845            cdef int x
2846            cdef const bam_pileup1_t * p = NULL
2847            pileups = []
2848
2849            # warning: there could be problems if self.n and self.buf are
2850            # out of sync.
2851            for x from 0 <= x < self.n_pu:
2852                p = &(self.plp[0][x])
2853                if p == NULL:
2854                    raise ValueError(
2855                        "pileup buffer out of sync - most likely use of iterator "
2856                        "outside loop")
2857                if pileup_base_qual_skip(p, self.min_base_quality):
2858                    continue
2859                pileups.append(makePileupRead(p, self.header))
2860            return pileups
2861
2862    ########################################################
2863    # Compatibility Accessors
2864    # Functions, properties for compatibility with pysam < 0.8
2865    ########################################################
2866    property pos:
2867        """deprecated: use reference_pos"""
2868        def __get__(self):
2869            return self.reference_pos
2870        def __set__(self, v):
2871            self.reference_pos = v
2872
2873    property tid:
2874        """deprecated: use reference_id"""
2875        def __get__(self):
2876            return self.reference_id
2877        def __set__(self, v):
2878            self.reference_id = v
2879
2880    property n:
2881        """deprecated: use nsegments"""
2882        def __get__(self):
2883            return self.nsegments
2884        def __set__(self, v):
2885            self.nsegments = v
2886
2887    def get_num_aligned(self):
2888        """return number of aligned bases at pileup column position.
2889
2890        This method applies a base quality filter and the number is
2891        equal to the size of :meth:`get_query_sequences`,
2892        :meth:`get_mapping_qualities`, etc.
2893
2894        """
2895        cdef uint32_t x = 0
2896        cdef uint32_t c = 0
2897        cdef uint32_t cnt = 0
2898        cdef const bam_pileup1_t * p = NULL
2899        if self.plp == NULL or self.plp[0] == NULL:
2900            raise ValueError("PileupColumn accessed after iterator finished")
2901
2902        for x from 0 <= x < self.n_pu:
2903            p = &(self.plp[0][x])
2904            if p == NULL:
2905                raise ValueError(
2906                    "pileup buffer out of sync - most likely use of iterator "
2907                    "outside loop")
2908            if pileup_base_qual_skip(p, self.min_base_quality):
2909                continue
2910            cnt += 1
2911        return cnt
2912
2913    def get_query_sequences(self, bint mark_matches=False, bint mark_ends=False, bint add_indels=False):
2914        """query bases/sequences at pileup column position.
2915
2916        Optionally, the bases/sequences can be annotated according to the samtools
2917        mpileup format. This is the format description from the samtools mpileup tool::
2918
2919           Information on match, mismatch, indel, strand, mapping
2920           quality and start and end of a read are all encoded at the
2921           read base column. At this column, a dot stands for a match
2922           to the reference base on the forward strand, a comma for a
2923           match on the reverse strand, a '>' or '<' for a reference
2924           skip, `ACGTN' for a mismatch on the forward strand and
2925           `acgtn' for a mismatch on the reverse strand. A pattern
2926           `\\+[0-9]+[ACGTNacgtn]+' indicates there is an insertion
2927           between this reference position and the next reference
2928           position. The length of the insertion is given by the
2929           integer in the pattern, followed by the inserted
2930           sequence. Similarly, a pattern `-[0-9]+[ACGTNacgtn]+'
2931           represents a deletion from the reference. The deleted bases
2932           will be presented as `*' in the following lines. Also at
2933           the read base column, a symbol `^' marks the start of a
2934           read. The ASCII of the character following `^' minus 33
2935           gives the mapping quality. A symbol `$' marks the end of a
2936           read segment
2937
2938        To reproduce samtools mpileup format, set all of mark_matches,
2939        mark_ends and add_indels to True.
2940
2941        Parameters
2942        ----------
2943
2944        mark_matches: bool
2945
2946          If True, output bases matching the reference as "." or ","
2947          for forward and reverse strand, respectively. This mark
2948          requires the reference sequence. If no reference is
2949          present, this option is ignored.
2950
2951        mark_ends : bool
2952
2953          If True, add markers "^" and "$" for read start and end, respectively.
2954
2955        add_indels : bool
2956
2957          If True, add bases for bases inserted into or skipped from the
2958          reference. The latter requires a reference sequence file to have
2959          been given, e.g. via `pileup(fastafile = ...)`. If no reference
2960          sequence is available, skipped bases are represented as 'N's.
2961
2962        Returns
2963        -------
2964
2965        list: a list of bases/sequences per read at pileup column position.
2966
2967        """
2968        cdef uint32_t x = 0
2969        cdef uint32_t j = 0
2970        cdef uint32_t c = 0
2971        cdef uint8_t cc = 0
2972        cdef uint8_t rb = 0
2973        cdef kstring_t * buf = &self.buf
2974        cdef const bam_pileup1_t * p = NULL
2975
2976        if self.plp == NULL or self.plp[0] == NULL:
2977            raise ValueError("PileupColumn accessed after iterator finished")
2978
2979        buf.l = 0
2980
2981        # todo: reference sequence to count matches/mismatches
2982        # todo: convert assertions to exceptions
2983        for x from 0 <= x < self.n_pu:
2984            p = &(self.plp[0][x])
2985            if p == NULL:
2986                raise ValueError(
2987                    "pileup buffer out of sync - most likely use of iterator "
2988                    "outside loop")
2989            if pileup_base_qual_skip(p, self.min_base_quality):
2990                continue
2991            # see samtools pileup_seq
2992            if mark_ends and p.is_head:
2993                kputc('^', buf)
2994
2995                if p.b.core.qual > 93:
2996                    kputc(126, buf)
2997                else:
2998                    kputc(p.b.core.qual + 33, buf)
2999            if not p.is_del:
3000                if p.qpos < p.b.core.l_qseq:
3001                    cc = <uint8_t>seq_nt16_str[bam_seqi(bam_get_seq(p.b), p.qpos)]
3002                else:
3003                    cc = 'N'
3004
3005                if mark_matches and self.reference_sequence != NULL:
3006                    rb = self.reference_sequence[self.reference_pos]
3007                    if seq_nt16_table[cc] == seq_nt16_table[rb]:
3008                        cc = "="
3009                kputc(strand_mark_char(cc, p.b), buf)
3010            elif add_indels:
3011                if p.is_refskip:
3012                    if bam_is_rev(p.b):
3013                        kputc('<', buf)
3014                    else:
3015                        kputc('>', buf)
3016                else:
3017                    kputc('*', buf)
3018            if add_indels:
3019                if p.indel > 0:
3020                    kputc('+', buf)
3021                    kputw(p.indel, buf)
3022                    for j from 1 <= j <= p.indel:
3023                        cc = seq_nt16_str[bam_seqi(bam_get_seq(p.b), p.qpos + j)]
3024                        kputc(strand_mark_char(cc, p.b), buf)
3025                elif p.indel < 0:
3026                    kputc('-', buf)
3027                    kputw(-p.indel, buf)
3028                    for j from 1 <= j <= -p.indel:
3029                        # TODO: out-of-range check here?
3030                        if self.reference_sequence == NULL:
3031                            cc = 'N'
3032                        else:
3033                            cc = self.reference_sequence[self.reference_pos + j]
3034                        kputc(strand_mark_char(cc, p.b), buf)
3035            if mark_ends and p.is_tail:
3036                kputc('$', buf)
3037
3038            kputc(':', buf)
3039
3040        if buf.l == 0:
3041            # could be zero if all qualities are too low
3042            return ""
3043        else:
3044            # quicker to ensemble all and split than to encode all separately.
3045            # ignore last ":"
3046            return force_str(PyBytes_FromStringAndSize(buf.s, buf.l-1)).split(":")
3047
3048    def get_query_qualities(self):
3049        """query base quality scores at pileup column position.
3050
3051        Returns
3052        -------
3053
3054        list: a list of quality scores
3055        """
3056        cdef uint32_t x = 0
3057        cdef const bam_pileup1_t * p = NULL
3058        cdef uint32_t c = 0
3059        result = []
3060        for x from 0 <= x < self.n_pu:
3061            p = &(self.plp[0][x])
3062            if p == NULL:
3063                raise ValueError(
3064                    "pileup buffer out of sync - most likely use of iterator "
3065                    "outside loop")
3066
3067            if p.qpos < p.b.core.l_qseq:
3068                c = bam_get_qual(p.b)[p.qpos]
3069            else:
3070                c = 0
3071            if c < self.min_base_quality:
3072                continue
3073            result.append(c)
3074        return result
3075
3076    def get_mapping_qualities(self):
3077        """query mapping quality scores at pileup column position.
3078
3079        Returns
3080        -------
3081
3082        list: a list of quality scores
3083        """
3084        if self.plp == NULL or self.plp[0] == NULL:
3085            raise ValueError("PileupColumn accessed after iterator finished")
3086
3087        cdef uint32_t x = 0
3088        cdef const bam_pileup1_t * p = NULL
3089        result = []
3090        for x from 0 <= x < self.n_pu:
3091            p = &(self.plp[0][x])
3092            if p == NULL:
3093                raise ValueError(
3094                    "pileup buffer out of sync - most likely use of iterator "
3095                    "outside loop")
3096
3097            if pileup_base_qual_skip(p, self.min_base_quality):
3098                continue
3099            result.append(p.b.core.qual)
3100        return result
3101
3102    def get_query_positions(self):
3103        """positions in read at pileup column position.
3104
3105        Returns
3106        -------
3107
3108        list: a list of read positions
3109        """
3110        if self.plp == NULL or self.plp[0] == NULL:
3111            raise ValueError("PileupColumn accessed after iterator finished")
3112
3113        cdef uint32_t x = 0
3114        cdef const bam_pileup1_t * p = NULL
3115        result = []
3116        for x from 0 <= x < self.n_pu:
3117            p = &(self.plp[0][x])
3118            if p == NULL:
3119                raise ValueError(
3120                    "pileup buffer out of sync - most likely use of iterator "
3121                    "outside loop")
3122
3123            if pileup_base_qual_skip(p, self.min_base_quality):
3124                continue
3125            result.append(p.qpos)
3126        return result
3127
3128    def get_query_names(self):
3129        """query/read names aligned at pileup column position.
3130
3131        Returns
3132        -------
3133
3134        list: a list of query names at pileup column position.
3135        """
3136        if self.plp == NULL or self.plp[0] == NULL:
3137            raise ValueError("PileupColumn accessed after iterator finished")
3138
3139        cdef uint32_t x = 0
3140        cdef const bam_pileup1_t * p = NULL
3141        result = []
3142        for x from 0 <= x < self.n_pu:
3143            p = &(self.plp[0][x])
3144            if p == NULL:
3145                raise ValueError(
3146                    "pileup buffer out of sync - most likely use of iterator "
3147                    "outside loop")
3148
3149            if pileup_base_qual_skip(p, self.min_base_quality):
3150                continue
3151            result.append(charptr_to_str(pysam_bam_get_qname(p.b)))
3152        return result
3153
3154
3155cdef class PileupRead:
3156    '''Representation of a read aligned to a particular position in the
3157    reference sequence.
3158
3159    '''
3160
3161    def __init__(self):
3162        raise TypeError(
3163            "this class cannot be instantiated from Python")
3164
3165    def __str__(self):
3166        return "\t".join(
3167            map(str,
3168                (self.alignment, self.query_position,
3169                 self.indel, self.level,
3170                 self.is_del, self.is_head,
3171                 self.is_tail, self.is_refskip)))
3172
3173    property alignment:
3174        """a :class:`pysam.AlignedSegment` object of the aligned read"""
3175        def __get__(self):
3176            return self._alignment
3177
3178    property query_position:
3179        """position of the read base at the pileup site, 0-based.
3180        None if is_del or is_refskip is set.
3181
3182        """
3183        def __get__(self):
3184            if self.is_del or self.is_refskip:
3185                return None
3186            else:
3187                return self._qpos
3188
3189    property query_position_or_next:
3190        """position of the read base at the pileup site, 0-based.
3191
3192        If the current position is a deletion, returns the next
3193        aligned base.
3194
3195        """
3196        def __get__(self):
3197            return self._qpos
3198
3199    property indel:
3200        """indel length for the position following the current pileup site.
3201
3202        This quantity peeks ahead to the next cigar operation in this
3203        alignment. If the next operation is an insertion, indel will
3204        be positive. If the next operation is a deletion, it will be
3205        negation. 0 if the next operation is not an indel.
3206
3207        """
3208        def __get__(self):
3209            return self._indel
3210
3211    property level:
3212        """the level of the read in the "viewer" mode. Note that this value
3213        is currently not computed."""
3214        def __get__(self):
3215            return self._level
3216
3217    property is_del:
3218        """1 iff the base on the padded read is a deletion"""
3219        def __get__(self):
3220            return self._is_del
3221
3222    property is_head:
3223        """1 iff the base on the padded read is the left-most base."""
3224        def __get__(self):
3225            return self._is_head
3226
3227    property is_tail:
3228        """1 iff the base on the padded read is the right-most base."""
3229        def __get__(self):
3230            return self._is_tail
3231
3232    property is_refskip:
3233        """1 iff the base on the padded read is part of CIGAR N op."""
3234        def __get__(self):
3235            return self._is_refskip
3236
3237
3238
3239cpdef enum CIGAR_OPS:
3240    CMATCH = 0
3241    CINS = 1
3242    CDEL = 2
3243    CREF_SKIP = 3
3244    CSOFT_CLIP = 4
3245    CHARD_CLIP = 5
3246    CPAD = 6
3247    CEQUAL = 7
3248    CDIFF = 8
3249    CBACK = 9
3250
3251
3252cpdef enum SAM_FLAGS:
3253    # the read is paired in sequencing, no matter whether it is mapped in a pair
3254    FPAIRED = 1
3255    # the read is mapped in a proper pair
3256    FPROPER_PAIR = 2
3257    # the read itself is unmapped; conflictive with FPROPER_PAIR
3258    FUNMAP = 4
3259    # the mate is unmapped
3260    FMUNMAP = 8
3261    # the read is mapped to the reverse strand
3262    FREVERSE = 16
3263    # the mate is mapped to the reverse strand
3264    FMREVERSE = 32
3265    # this is read1
3266    FREAD1 = 64
3267    # this is read2
3268    FREAD2 = 128
3269    # not primary alignment
3270    FSECONDARY = 256
3271    # QC failure
3272    FQCFAIL = 512
3273    # optical or PCR duplicate
3274    FDUP = 1024
3275    # supplementary alignment
3276    FSUPPLEMENTARY = 2048
3277
3278
3279__all__ = [
3280    "AlignedSegment",
3281    "PileupColumn",
3282    "PileupRead",
3283    "CMATCH",
3284    "CINS",
3285    "CDEL",
3286    "CREF_SKIP",
3287    "CSOFT_CLIP",
3288    "CHARD_CLIP",
3289    "CPAD",
3290    "CEQUAL",
3291    "CDIFF",
3292    "CBACK",
3293    "FPAIRED",
3294    "FPROPER_PAIR",
3295    "FUNMAP",
3296    "FMUNMAP",
3297    "FREVERSE",
3298    "FMREVERSE",
3299    "FREAD1",
3300    "FREAD2",
3301    "FSECONDARY",
3302    "FQCFAIL",
3303    "FDUP",
3304    "FSUPPLEMENTARY",
3305    "KEY_NAMES"]
3306