1# cython: embedsignature=True
2# cython: profile=True
3########################################################
4########################################################
5# Cython wrapper for SAM/BAM/CRAM files based on htslib
6########################################################
7# The principal classes defined in this module are:
8#
9# class AlignmentFile   read/write access to SAM/BAM/CRAM formatted files
10#
11# class AlignmentHeader manage SAM/BAM/CRAM header data
12#
13# class IndexedReads    index a SAM/BAM/CRAM file by query name while keeping
14#                       the original sort order intact
15#
16# Additionally this module defines numerous additional classes that
17# are part of the internal API. These are:
18#
19# Various iterator classes to iterate over alignments in sequential
20# (IteratorRow) or in a stacked fashion (IteratorColumn):
21#
22# class IteratorRow
23# class IteratorRowRegion
24# class IteratorRowHead
25# class IteratorRowAll
26# class IteratorRowAllRefs
27# class IteratorRowSelection
28# class IteratorColumn
29# class IteratorColumnRegion
30# class IteratorColumnAll
31# class IteratorColumnAllRefs
32#
33########################################################
34#
35# The MIT License
36#
37# Copyright (c) 2015 Andreas Heger
38#
39# Permission is hereby granted, free of charge, to any person obtaining a
40# copy of this software and associated documentation files (the "Software"),
41# to deal in the Software without restriction, including without limitation
42# the rights to use, copy, modify, merge, publish, distribute, sublicense,
43# and/or sell copies of the Software, and to permit persons to whom the
44# Software is furnished to do so, subject to the following conditions:
45#
46# The above copyright notice and this permission notice shall be included in
47# all copies or substantial portions of the Software.
48#
49# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
50# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
51# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
52# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
53# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
54# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
55# DEALINGS IN THE SOFTWARE.
56#
57########################################################
58import os
59import collections
60try:
61    from collections.abc import Sequence, Mapping  # noqa
62except ImportError:
63    from collections import Sequence, Mapping  # noqa
64import re
65import warnings
66import array
67from libc.errno  cimport errno, EPIPE
68from libc.string cimport strcmp, strpbrk, strerror
69from libc.stdint cimport INT32_MAX
70
71from cpython cimport array as c_array
72from cpython.version cimport PY_MAJOR_VERSION
73
74from pysam.libcutils cimport force_bytes, force_str, charptr_to_str
75from pysam.libcutils cimport encode_filename, from_string_and_size
76from pysam.libcalignedsegment cimport makeAlignedSegment, makePileupColumn
77from pysam.libchtslib cimport HTSFile, hisremote
78
79if PY_MAJOR_VERSION >= 3:
80    from io import StringIO
81else:
82    from StringIO import StringIO
83
84cimport cython
85
86
87__all__ = [
88    "AlignmentFile",
89    "AlignmentHeader",
90    "IteratorRow",
91    "IteratorColumn",
92    "IndexedReads"]
93
# per-contig index statistics as returned by AlignmentFile.get_index_statistics()
IndexStats = collections.namedtuple("IndexStats",
                                    ("contig",
                                     "mapped",
                                     "unmapped",
                                     "total"))

########################################################
## global variables
# maximum genomic coordinate (INT32_MAX, the largest position
# representable in htslib's 32-bit coordinates)
# NOTE(review): original comment said "using 'int' causes overflow" --
# presumably referring to a plain Python int literal; kept as cdef int.
cdef int MAX_POS = (1 << 31) - 1

# valid record types for SAM headers and their container type when
# parsed to a dictionary: 'HD' occurs at most once (a single mapping),
# all other record types may occur multiple times (a sequence of lines)
VALID_HEADER_TYPES = {"HD" : Mapping,
                      "SQ" : Sequence,
                      "RG" : Sequence,
                      "PG" : Sequence,
                      "CO" : Sequence}

# order of records within SAM headers
VALID_HEADERS = ("HD", "SQ", "RG", "PG", "CO")

# default type conversions for the tag values within SAM header
# records; any tag not listed here is kept as str
KNOWN_HEADER_FIELDS = {"HD" : {"VN" : str, "SO" : str, "GO" : str},
                       "SQ" : {"SN" : str, "LN" : int, "AS" : str,
                               "M5" : str, "SP" : str, "UR" : str,
                               "AH" : str,},
                       "RG" : {"ID" : str, "CN" : str, "DS" : str,
                               "DT" : str, "FO" : str, "KS" : str,
                               "LB" : str, "PG" : str, "PI" : str,
                               "PL" : str, "PM" : str, "PU" : str,
                               "SM" : str,},
                       "PG" : {"ID" : str, "PN" : str, "CL" : str,
                               "PP" : str, "DS" : str, "VN" : str,},}

# output order of fields within records. Ensure that CL is at
# the end as parsing a CL will ignore any subsequent records.
VALID_HEADER_ORDER = {"HD" : ("VN", "SO", "GO"),
                      "SQ" : ("SN", "LN", "AS", "M5",
                               "UR", "SP", "AH"),
                      "RG" : ("ID", "CN", "SM", "LB",
                              "PU", "PI", "DT", "DS",
                              "PL", "FO", "KS", "PG",
                              "PM"),
                      "PG" : ("PN", "ID", "VN", "PP",
                              "DS", "CL"),}
140
141
def build_header_line(fields, record):
    '''return a tab-separated SAM header line of type `record` built
    from the `fields` dictionary (or plain string for comment lines).'''

    # TODO: add checking for field and sort order
    parts = ["@%s" % record]
    if record == "CO":
        # comment lines carry free text rather than tag:value pairs
        parts.append(fields)
    elif record.islower():
        # user-defined (lower-case) record type: emit tags alphabetically
        parts.extend("%s:%s" % (tag, str(fields[tag]))
                     for tag in sorted(fields))
    else:
        # record type defined by the SAM specification: emit the
        # specified tags first, in canonical order, ...
        parts.extend("%s:%s" % (tag, str(fields[tag]))
                     for tag in VALID_HEADER_ORDER[record]
                     if tag in fields)
        # ... then any remaining user-defined tags
        parts.extend("%s:%s" % (tag, str(fields[tag]))
                     for tag in fields
                     if not tag.isupper())

    return "\t".join(parts)
166
167
cdef AlignmentHeader makeAlignmentHeader(bam_hdr_t *hdr):
    """C-level factory: wrap an existing bam_hdr_t in an AlignmentHeader.

    Bypasses __init__ (and thus bam_hdr_init), so no new header is
    allocated; the wrapper takes over the pointer passed in.
    Note that AlignmentHeader.__cinit__ still runs via __new__.
    """
    if hdr == NULL:
        raise ValueError('cannot create AlignmentHeader, received NULL pointer')

    cdef AlignmentHeader wrapper = AlignmentHeader.__new__(AlignmentHeader)
    wrapper.ptr = hdr
    return wrapper
177
def read_failure_reason(code):
    """translate a negative htslib read status `code` into a message."""
    return ('truncated file' if code == -2
            else "error {} while reading file".format(code))
183
184
# the following should be class-method for VariantHeader, but cdef @classmethods
# are not implemented in cython.
cdef int fill_AlignmentHeader_from_list(bam_hdr_t *dest,
                                        reference_names,
                                        reference_lengths,
                                        add_sq_text=True,
                                        text=None) except -1:
    """build header from list of reference names and lengths.

    NOTE(review): the body of this function is empty apart from this
    docstring -- it never touches `dest` and implicitly returns 0.
    It appears to be a stub or dead leftover; confirm whether it can
    be removed or still needs an implementation.
    """
194
cdef class AlignmentHeader(object):
    """header information for a :class:`AlignmentFile` object

    Parameters
    ----------
    header_dict : dict
        build header from a multi-level dictionary. The
        first level are the four types ('HD', 'SQ', ...). The second
        level are a list of lines, with each line being a list of
        tag-value pairs. The header is constructed first from all the
        defined fields, followed by user tags in alphabetical
        order. Alternatively, an :class:`~pysam.AlignmentHeader`
        object can be passed directly.

    text : string
        use the string provided as the header

    reference_names : list
        see reference_lengths

    reference_lengths : list
        build header from list of chromosome names and lengths.  By
        default, 'SQ' and 'LN' tags will be added to the header
        text. This option can be changed by unsetting the flag
        `add_sq_text`.

    add_sq_text : bool
        do not add 'SQ' and 'LN' tags to header. This option permits
        construction :term:`SAM` formatted files without a header.

    """

    # See makeAlignmentHeader for the C constructor
    def __cinit__(self):
        self.ptr = NULL

    # Python constructor
    def __init__(self):
        self.ptr = bam_hdr_init()
        if self.ptr is NULL:
            raise MemoryError("could not create header")

    @classmethod
    def _from_text_and_lengths(cls, text, reference_names, reference_lengths):
        """build a header from raw header `text` plus parallel lists of
        reference names and lengths.

        The text (if any) is copied into the C header structure and the
        reference names/lengths fill the binary target arrays.
        """
        cdef AlignmentHeader self = AlignmentHeader()
        cdef char *ctext
        cdef int l_text
        cdef int n, x
        if text is not None:
            btext = force_bytes(text)
            ctext = btext
            l_text = len(btext)
            self.ptr.text = <char*>calloc(l_text + 1, sizeof(char))
            if self.ptr.text == NULL:
                raise MemoryError("could not allocate {} bytes".format(l_text + 1))
            self.ptr.l_text = l_text
            # copy including the terminating NUL byte
            memcpy(self.ptr.text, ctext, l_text + 1)

        if reference_names and reference_lengths:
            reference_names = [force_bytes(ref) for ref in reference_names]

            self.ptr.n_targets = len(reference_names)

            # allocate one extra (zeroed) slot beyond n_targets
            n = len(reference_names) + 1
            self.ptr.target_name = <char**>calloc(n, sizeof(char*))
            if self.ptr.target_name == NULL:
                raise MemoryError("could not allocate {} bytes".format(n * sizeof(char*)))

            self.ptr.target_len = <uint32_t*>calloc(n, sizeof(uint32_t))
            if self.ptr.target_len == NULL:
                raise MemoryError("could not allocate {} bytes".format(n * sizeof(uint32_t)))

            for x from 0 <= x < self.ptr.n_targets:
                self.ptr.target_len[x] = reference_lengths[x]
                name = reference_names[x]
                self.ptr.target_name[x] = <char*>calloc(len(name) + 1, sizeof(char))
                if self.ptr.target_name[x] == NULL:
                    raise MemoryError("could not allocate {} bytes".format(len(name) + 1))
                strncpy(self.ptr.target_name[x], name, len(name))

        return self

    @classmethod
    def from_text(cls, text):
        """build an :class:`AlignmentHeader` from a string containing a
        full SAM header. Reference names and lengths are taken from the
        @SQ lines within the text."""
        reference_names, reference_lengths = [], []
        for line in text.splitlines():
            if line.startswith("@SQ"):
                fields = dict([x.split(":", 1) for x in line.split("\t")[1:]])
                try:
                    reference_names.append(fields["SN"])
                    reference_lengths.append(int(fields["LN"]))
                except KeyError:
                    raise KeyError("incomplete sequence information in '%s'" % str(fields))
                except ValueError:
                    raise ValueError("wrong sequence information in '%s'" % str(fields))

        return cls._from_text_and_lengths(text, reference_names, reference_lengths)

    @classmethod
    def from_dict(cls, header_dict):
        """build an :class:`AlignmentHeader` from a two-level dictionary
        as produced by :meth:`to_dict`."""
        cdef list lines = []
        # first: defined tags, in canonical record order
        for record in VALID_HEADERS:
            if record in header_dict:
                data = header_dict[record]
                if not isinstance(data, VALID_HEADER_TYPES[record]):
                    # note: %-formatting, the placeholders are %s
                    raise ValueError(
                        "invalid type for record %s: %s, expected %s" % (
                            record, type(data), VALID_HEADER_TYPES[record]))
                if isinstance(data, Mapping):
                    lines.append(build_header_line(data, record))
                else:
                    for fields in header_dict[record]:
                        lines.append(build_header_line(fields, record))

        # then: user tags (lower case), sorted alphabetically
        for record, data in sorted(header_dict.items()):
            if record in VALID_HEADERS:
                continue
            if isinstance(data, Mapping):
                lines.append(build_header_line(data, record))
            else:
                for fields in header_dict[record]:
                    lines.append(build_header_line(fields, record))

        text = "\n".join(lines) + "\n"

        reference_names, reference_lengths = [], []
        if "SQ" in header_dict:
            for fields in header_dict["SQ"]:
                try:
                    reference_names.append(fields["SN"])
                    reference_lengths.append(fields["LN"])
                except KeyError:
                    raise KeyError("incomplete sequence information in '%s'" % str(fields))

        return cls._from_text_and_lengths(text, reference_names, reference_lengths)

    @classmethod
    def from_references(cls, reference_names, reference_lengths, text=None, add_sq_text=True):
        """build an :class:`AlignmentHeader` from lists of reference
        names and lengths. If `text` is None and `add_sq_text` is set,
        a minimal @SQ-only header text is generated."""
        if len(reference_names) != len(reference_lengths):
            raise ValueError("number of reference names and lengths do not match")

        # optionally, if there is no text, add a SAM compatible header to output file.
        if text is None and add_sq_text:
            text = "".join(["@SQ\tSN:{}\tLN:{}\n".format(x, y) for x, y in zip(
                reference_names, reference_lengths)])

        return cls._from_text_and_lengths(text, reference_names, reference_lengths)

    def __dealloc__(self):
        bam_hdr_destroy(self.ptr)
        self.ptr = NULL

    def __bool__(self):
        return self.ptr != NULL

    def copy(self):
        """return a deep copy of this header."""
        return makeAlignmentHeader(bam_hdr_dup(self.ptr))

    property nreferences:
        """int with the number of :term:`reference` sequences in the file.

        This is a read-only attribute."""
        def __get__(self):
            return self.ptr.n_targets

    property references:
        """tuple with the names of :term:`reference` sequences. This is a
        read-only attribute"""
        def __get__(self):
            t = []
            cdef int x
            for x in range(self.ptr.n_targets):
                t.append(charptr_to_str(self.ptr.target_name[x]))
            return tuple(t)

    property lengths:
        """tuple of the lengths of the :term:`reference` sequences. This is a
        read-only attribute. The lengths are in the same order as
        :attr:`pysam.AlignmentFile.references`
        """
        def __get__(self):
            t = []
            cdef int x
            for x in range(self.ptr.n_targets):
                t.append(self.ptr.target_len[x])
            return tuple(t)

    def _build_sequence_section(self):
        """return sequence section of header.

        The sequence section is built from the list of reference names and
        lengths stored in the BAM-file and not from any @SQ entries that
        are part of the header's text section.
        """

        cdef int x
        text = []
        for x in range(self.ptr.n_targets):
            text.append("@SQ\tSN:{}\tLN:{}\n".format(
                force_str(self.ptr.target_name[x]),
                self.ptr.target_len[x]))
        return "".join(text)

    def to_dict(self):
        """return two-level dictionary with header information from the file.

        The first level contains the record (``HD``, ``SQ``, etc) and
        the second level contains the fields (``VN``, ``LN``, etc).

        The parser is validating and will raise an AssertionError if
        if encounters any record or field tags that are not part of
        the SAM specification. Use the
        :attr:`pysam.AlignmentFile.text` attribute to get the unparsed
        header.

        The parsing follows the SAM format specification with the
        exception of the ``CL`` field. This option will consume the
        rest of a header line irrespective of any additional fields.
        This behaviour has been added to accommodate command line
        options that contain characters that are not valid field
        separators.

        If no @SQ entries are within the text section of the header,
        this will be automatically added from the reference names and
        lengths stored in the binary part of the header.
        """
        result = collections.OrderedDict()

        # convert to python string
        t = self.__str__()
        for line in t.split("\n"):
            line = line.strip(' \0')
            if not line:
                continue
            assert line.startswith("@"), \
                "header line without '@': '%s'" % line
            fields = line[1:].split("\t")
            record = fields[0]
            assert record in VALID_HEADER_TYPES, \
                "header line with invalid type '%s': '%s'" % (record, line)

            # treat comments
            if record == "CO":
                if record not in result:
                    result[record] = []
                result[record].append("\t".join(fields[1:]))
                continue
            # the following is clumsy as generators do not work?
            x = {}

            for idx, field in enumerate(fields[1:]):
                if ":" not in field:
                    raise ValueError("malformatted header: no ':' in field" )
                key, value = field.split(":", 1)
                if key in ("CL",):
                    # special treatment for command line
                    # statements (CL). These might contain
                    # characters that are non-conformant with
                    # the valid field separators in the SAM
                    # header. Thus, in contravention to the
                    # SAM API, consume the rest of the line.
                    key, value = "\t".join(fields[idx+1:]).split(":", 1)
                    x[key] = KNOWN_HEADER_FIELDS[record][key](value)
                    break

                # interpret type of known header record tags, default to str
                x[key] = KNOWN_HEADER_FIELDS[record].get(key, str)(value)

            if VALID_HEADER_TYPES[record] == Mapping:
                if record in result:
                    raise ValueError(
                        "multiple '%s' lines are not permitted" % record)

                result[record] = x
            elif VALID_HEADER_TYPES[record] == Sequence:
                if record not in result:
                    result[record] = []
                result[record].append(x)

        # if there are no SQ lines in the header, add the
        # reference names from the information in the bam
        # file.
        #
        # Background: c-samtools keeps the textual part of the
        # header separate from the list of reference names and
        # lengths. Thus, if a header contains only SQ lines,
        # the SQ information is not part of the textual header
        # and thus are missing from the output. See issue 84.
        if "SQ" not in result:
            sq = []
            for ref, length in zip(self.references, self.lengths):
                sq.append({'LN': length, 'SN': ref })
            result["SQ"] = sq

        return result

    def as_dict(self):
        """deprecated: use :meth:`to_dict()`"""
        return self.to_dict()

    def get_reference_name(self, tid):
        """return the reference name corresponding to numerical :term:`tid`;
        returns None for the unmapped code (-1)."""
        if tid == -1:
            return None
        if not 0 <= tid < self.ptr.n_targets:
            raise ValueError("reference_id %i out of range 0<=tid<%i" %
                             (tid, self.ptr.n_targets))
        return charptr_to_str(self.ptr.target_name[tid])

    def get_reference_length(self, reference):
        """return the length of :term:`reference`; raises KeyError if the
        reference is not known."""
        cdef int tid = self.get_tid(reference)
        if tid < 0:
            raise KeyError("unknown reference {}".format(reference))
        else:
            return self.ptr.target_len[tid]

    def is_valid_tid(self, int tid):
        """
        return True if the numerical :term:`tid` is valid; False otherwise.

        Note that the unmapped tid code (-1) counts as an invalid.
        """
        return 0 <= tid < self.ptr.n_targets

    def get_tid(self, reference):
        """
        return the numerical :term:`tid` corresponding to
        :term:`reference`

        returns -1 if reference is not known.
        """
        reference = force_bytes(reference)
        tid = bam_name2id(self.ptr, reference)
        # bam_name2id returns values below -1 when the header
        # itself could not be parsed
        if tid < -1:
            raise ValueError('could not parse header')
        return tid

    def __str__(self):
        '''string with the full contents of the :term:`sam file` header as a
        string.

        If no @SQ entries are within the text section of the header,
        this will be automatically added from the reference names and
        lengths stored in the binary part of the header.

        See :attr:`pysam.AlignmentFile.header.to_dict()` to get a parsed
        representation of the header.
        '''
        text = from_string_and_size(self.ptr.text, self.ptr.l_text)
        if "@SQ" not in text:
            text += "\n" + self._build_sequence_section()
        return text

    # dictionary access methods, for backwards compatibility.
    def __setitem__(self, key, value):
        raise TypeError("AlignmentHeader does not support item assignment (use header.to_dict())")

    def __getitem__(self, key):
        return self.to_dict().__getitem__(key)

    def items(self):
        return self.to_dict().items()

    # PY2 compatibility
    def iteritems(self):
        return self.to_dict().items()

    def keys(self):
        return self.to_dict().keys()

    def values(self):
        return self.to_dict().values()

    def get(self, *args):
        return self.to_dict().get(*args)

    def __len__(self):
        return self.to_dict().__len__()

    def __contains__(self, key):
        return self.to_dict().__contains__(key)
580
581
582cdef class AlignmentFile(HTSFile):
583    """AlignmentFile(filepath_or_object, mode=None, template=None,
584    reference_names=None, reference_lengths=None, text=NULL,
585    header=None, add_sq_text=False, check_header=True, check_sq=True,
586    reference_filename=None, filename=None, index_filename=None,
587    filepath_index=None, require_index=False, duplicate_filehandle=True,
588    ignore_truncation=False, threads=1)
589
590    A :term:`SAM`/:term:`BAM`/:term:`CRAM` formatted file.
591
592    If `filepath_or_object` is a string, the file is automatically
593    opened. If `filepath_or_object` is a python File object, the
594    already opened file will be used.
595
596    If the file is opened for reading and an index exists (if file is BAM, a
597    .bai file or if CRAM a .crai file), it will be opened automatically.
598    `index_filename` may be specified explicitly. If the index is not named
599    in the standard manner, not located in the same directory as the
600    BAM/CRAM file, or is remote.  Without an index, random access via
601    :meth:`~pysam.AlignmentFile.fetch` and :meth:`~pysam.AlignmentFile.pileup`
602    is disabled.
603
604    For writing, the header of a :term:`SAM` file/:term:`BAM` file can
605    be constituted from several sources (see also the samtools format
606    specification):
607
608        1. If `template` is given, the header is copied from a another
609           `AlignmentFile` (`template` must be a
610           :class:`~pysam.AlignmentFile`).
611
612        2. If `header` is given, the header is built from a
613           multi-level dictionary.
614
615        3. If `text` is given, new header text is copied from raw
616           text.
617
618        4. The names (`reference_names`) and lengths
619           (`reference_lengths`) are supplied directly as lists.
620
621    When reading or writing a CRAM file, the filename of a FASTA-formatted
622    reference can be specified with `reference_filename`.
623
624    By default, if a file is opened in mode 'r', it is checked
625    for a valid header (`check_header` = True) and a definition of
626    chromosome names (`check_sq` = True).
627
628    Parameters
629    ----------
630    mode : string
631        `mode` should be ``r`` for reading or ``w`` for writing. The
632        default is text mode (:term:`SAM`). For binary (:term:`BAM`)
633        I/O you should append ``b`` for compressed or ``u`` for
634        uncompressed :term:`BAM` output.  Use ``h`` to output header
635        information in text (:term:`TAM`) mode. Use ``c`` for
636        :term:`CRAM` formatted files.
637
638        If ``b`` is present, it must immediately follow ``r`` or
639        ``w``.  Valid modes are ``r``, ``w``, ``wh``, ``rb``, ``wb``,
640        ``wbu``, ``wb0``, ``rc`` and ``wc``. For instance, to open a
641        :term:`BAM` formatted file for reading, type::
642
643           f = pysam.AlignmentFile('ex1.bam','rb')
644
645        If mode is not specified, the method will try to auto-detect
646        in the order 'rb', 'r', thus both the following should work::
647
648            f1 = pysam.AlignmentFile('ex1.bam')
649            f2 = pysam.AlignmentFile('ex1.sam')
650
651    template : AlignmentFile
652        when writing, copy header from file `template`.
653
654    header :  dict or AlignmentHeader
655        when writing, build header from a multi-level dictionary. The
656        first level are the four types ('HD', 'SQ', ...). The second
657        level are a list of lines, with each line being a list of
658        tag-value pairs. The header is constructed first from all the
659        defined fields, followed by user tags in alphabetical
660        order. Alternatively, an :class:`~pysam.AlignmentHeader`
661        object can be passed directly.
662
663    text : string
664        when writing, use the string provided as the header
665
666    reference_names : list
667        see reference_lengths
668
669    reference_lengths : list
670        when writing or opening a SAM file without header build header
671        from list of chromosome names and lengths.  By default, 'SQ'
672        and 'LN' tags will be added to the header text. This option
673        can be changed by unsetting the flag `add_sq_text`.
674
675    add_sq_text : bool
676        do not add 'SQ' and 'LN' tags to header. This option permits
677        construction :term:`SAM` formatted files without a header.
678
679    add_sam_header : bool
680        when outputting SAM the default is to output a header. This is
681        equivalent to opening the file in 'wh' mode. If this option is
682        set to False, no header will be output. To read such a file,
683        set `check_header=False`.
684
685    check_header : bool
686        obsolete: when reading a SAM file, check if header is present
687        (default=True)
688
689    check_sq : bool
690        when reading, check if SQ entries are present in header
691        (default=True)
692
693    reference_filename : string
694        Path to a FASTA-formatted reference file. Valid only for CRAM files.
695        When reading a CRAM file, this overrides both ``$REF_PATH`` and the URL
696        specified in the header (``UR`` tag), which are normally used to find
697        the reference.
698
699    index_filename : string
700        Explicit path to the index file.  Only needed if the index is not
701        named in the standard manner, not located in the same directory as
702        the BAM/CRAM file, or is remote.  An IOError is raised if the index
703        cannot be found or is invalid.
704
705    filepath_index : string
706        Alias for `index_filename`.
707
708    require_index : bool
709        When reading, require that an index file is present and is valid or
710        raise an IOError.  (default=False)
711
712    filename : string
713        Alternative to filepath_or_object. Filename of the file
714        to be opened.
715
716    duplicate_filehandle: bool
717        By default, file handles passed either directly or through
718        File-like objects will be duplicated before passing them to
719        htslib. The duplication prevents issues where the same stream
720        will be closed by htslib and through destruction of the
721        high-level python object. Set to False to turn off
722        duplication.
723
724    ignore_truncation: bool
725        Issue a warning, instead of raising an error if the current file
726        appears to be truncated due to a missing EOF marker.  Only applies
727        to bgzipped formats. (Default=False)
728
729    format_options: list
730        A list of key=value strings, as accepted by --input-fmt-option and
731        --output-fmt-option in samtools.
732    threads: integer
733        Number of threads to use for compressing/decompressing BAM/CRAM files.
734        Setting threads to > 1 cannot be combined with `ignore_truncation`.
735        (Default=1)
736    """
737
738    def __cinit__(self, *args, **kwargs):
739        self.htsfile = NULL
740        self.filename = None
741        self.mode = None
742        self.threads = 1
743        self.is_stream = False
744        self.is_remote = False
745        self.index = NULL
746
747        if "filename" in kwargs:
748            args = [kwargs["filename"]]
749            del kwargs["filename"]
750
751        self._open(*args, **kwargs)
752
753        # allocate memory for iterator
754        self.b = <bam1_t*>calloc(1, sizeof(bam1_t))
755        if self.b == NULL:
756            raise MemoryError("could not allocate memory of size {}".format(sizeof(bam1_t)))
757
758    def has_index(self):
759        """return true if htsfile has an existing (and opened) index.
760        """
761        return self.index != NULL
762
763    def check_index(self):
764        """return True if index is present.
765
766        Raises
767        ------
768
769        AttributeError
770            if htsfile is :term:`SAM` formatted and thus has no index.
771
772        ValueError
773            if htsfile is closed or index could not be opened.
774        """
775
776        if not self.is_open:
777            raise ValueError("I/O operation on closed file")
778        if not self.is_bam and not self.is_cram:
779            raise AttributeError(
780                "AlignmentFile.mapped only available in bam files")
781        if self.index == NULL:
782            raise ValueError(
783                "mapping information not recorded in index "
784                "or index not available")
785        return True
786
787    def _open(self,
788              filepath_or_object,
789              mode=None,
790              AlignmentFile template=None,
791              reference_names=None,
792              reference_lengths=None,
793              reference_filename=None,
794              text=None,
795              header=None,
796              port=None,
797              add_sq_text=True,
798              add_sam_header=True,
799              check_header=True,
800              check_sq=True,
801              index_filename=None,
802              filepath_index=None,
803              require_index=False,
804              referencenames=None,
805              referencelengths=None,
806              duplicate_filehandle=True,
807              ignore_truncation=False,
808              format_options=None,
809              threads=1):
810        '''open a sam, bam or cram formatted file.
811
812        If _open is called on an existing file, the current file
813        will be closed and a new file will be opened.
814
815        '''
816        cdef char *cfilename = NULL
817        cdef char *creference_filename = NULL
818        cdef char *cindexname = NULL
819        cdef char *cmode = NULL
820        cdef bam_hdr_t * hdr = NULL
821
822        if threads > 1 and ignore_truncation:
823           # This won't raise errors if reaching a truncated alignment,
824           # because bgzf_mt_reader in htslib does not deal with
825           # bgzf_mt_read_block returning non-zero values, contrary
826           # to bgzf_read (https://github.com/samtools/htslib/blob/1.7/bgzf.c#L888)
827           # Better to avoid this (for now) than to produce seemingly correct results.
828           raise ValueError('Cannot add extra threads when "ignore_truncation" is True')
829        self.threads = threads
830
831        # for backwards compatibility:
832        if referencenames is not None:
833            reference_names = referencenames
834        if referencelengths is not None:
835            reference_lengths = referencelengths
836
837        # close a previously opened file
838        if self.is_open:
839            self.close()
840
841        # autodetection for read
842        if mode is None:
843            mode = "r"
844
845        if add_sam_header and mode == "w":
846            mode = "wh"
847
848        assert mode in ("r", "w", "rb", "wb", "wh",
849                        "wbu", "rU", "wb0",
850                        "rc", "wc"), \
851            "invalid file opening mode `%s`" % mode
852
853        self.duplicate_filehandle = duplicate_filehandle
854
855        # StringIO not supported
856        if isinstance(filepath_or_object, StringIO):
857            raise NotImplementedError(
858                "access from StringIO objects not supported")
859        # reading from a file descriptor
860        elif isinstance(filepath_or_object, int):
861            self.filename = filepath_or_object
862            filename = None
863            self.is_remote = False
864            self.is_stream = True
865        # reading from a File object or other object with fileno
866        elif hasattr(filepath_or_object, "fileno"):
867            if filepath_or_object.closed:
868                raise ValueError('I/O operation on closed file')
869            self.filename = filepath_or_object
870            # .name can be TextIOWrapper
871            try:
872                filename = encode_filename(str(filepath_or_object.name))
873                cfilename = filename
874            except AttributeError:
875                filename = None
876            self.is_remote = False
877            self.is_stream = True
878        # what remains is a filename
879        else:
880            self.filename = filename = encode_filename(filepath_or_object)
881            cfilename = filename
882            self.is_remote = hisremote(cfilename)
883            self.is_stream = self.filename == b'-'
884
885        # for htslib, wbu seems to not work
886        if mode == "wbu":
887            mode = "wb0"
888
889        self.mode = force_bytes(mode)
890        self.reference_filename = reference_filename = encode_filename(
891            reference_filename)
892
893        if mode[0] == 'w':
894            # open file for writing
895
896            if not (template or header or text or (reference_names and reference_lengths)):
897                raise ValueError(
898                    "either supply options `template`, `header`, `text` or  both `reference_names` "
899                    "and `reference_lengths` for writing")
900
901            if template:
902                # header is copied, though at the moment not strictly
903                # necessary as AlignmentHeader is immutable.
904                self.header = template.header.copy()
905            elif isinstance(header, AlignmentHeader):
906                self.header = header.copy()
907            elif isinstance(header, Mapping):
908                self.header = AlignmentHeader.from_dict(header)
909            elif reference_names and reference_lengths:
910                self.header = AlignmentHeader.from_references(
911                    reference_names,
912                    reference_lengths,
913                    add_sq_text=add_sq_text,
914                    text=text)
915            elif text:
916                self.header = AlignmentHeader.from_text(text)
917            else:
918                raise ValueError("not enough information to construct header. Please provide template, "
919                                 "header, text or reference_names/reference_lengths")
920            self.htsfile = self._open_htsfile()
921
922            if self.htsfile == NULL:
923                if errno:
924                    raise IOError(errno, "could not open alignment file `{}`: {}".format(
925                        force_str(filename),
926                        force_str(strerror(errno))))
927                else:
928                    raise ValueError("could not open alignment file `{}`".format(force_str(filename)))
929            if format_options and len(format_options):
930                self.add_hts_options(format_options)
931            # set filename with reference sequences. If no filename
932            # is given, the CRAM reference arrays will be built from
933            # the @SQ header in the header
934            if "c" in mode and reference_filename:
935                if (hts_set_fai_filename(self.htsfile, self.reference_filename) != 0):
936                    raise ValueError("failure when setting reference filename")
937
938            # write header to htsfile
939            if "b" in mode or "c" in mode or "h" in mode:
940                hdr = self.header.ptr
941                with nogil:
942                    sam_hdr_write(self.htsfile, hdr)
943
944        elif mode[0] == "r":
945            # open file for reading
946            self.htsfile = self._open_htsfile()
947
948            if self.htsfile == NULL:
949                if errno:
950                    raise IOError(errno, "could not open alignment file `{}`: {}".format(force_str(filename),
951                                  force_str(strerror(errno))))
952                else:
953                    raise ValueError("could not open alignment file `{}`".format(force_str(filename)))
954
955            if self.htsfile.format.category != sequence_data:
956                raise ValueError("file does not contain alignment data")
957
958            if format_options and len(format_options):
959                self.add_hts_options(format_options)
960
961            self.check_truncation(ignore_truncation)
962
963            # bam/cram files require a valid header
964            if self.is_bam or self.is_cram:
965                with nogil:
966                    hdr = sam_hdr_read(self.htsfile)
967                if hdr == NULL:
968                    raise ValueError(
969                        "file does not have a valid header (mode='%s') "
970                        "- is it BAM/CRAM format?" % mode)
971                self.header = makeAlignmentHeader(hdr)
972            else:
973                # in sam files a header is optional. If not given,
974                # user may provide reference names and lengths to built
975                # an on-the-fly header.
976                if reference_names and reference_lengths:
977                    # build header from a target names and lengths
978                    self.header = AlignmentHeader.from_references(
979                        reference_names=reference_names,
980                        reference_lengths=reference_lengths,
981                        add_sq_text=add_sq_text,
982                        text=text)
983                else:
984                    with nogil:
985                        hdr = sam_hdr_read(self.htsfile)
986                    if hdr == NULL:
987                        raise ValueError(
988                            "SAM? file does not have a valid header (mode='%s'), "
989                            "please provide reference_names and reference_lengths")
990                    self.header = makeAlignmentHeader(hdr)
991
992            # set filename with reference sequences
993            if self.is_cram and reference_filename:
994                creference_filename = self.reference_filename
995                hts_set_opt(self.htsfile,
996                            CRAM_OPT_REFERENCE,
997                            creference_filename)
998
999            if check_sq and self.header.nreferences == 0:
1000                raise ValueError(
1001                    ("file has no sequences defined (mode='%s') - "
1002                     "is it SAM/BAM format? Consider opening with "
1003                     "check_sq=False") % mode)
1004
1005            if self.is_bam or self.is_cram:
1006                self.index_filename = index_filename or filepath_index
1007                if self.index_filename:
1008                    cindexname = bfile_name = encode_filename(self.index_filename)
1009
1010                if cfilename or cindexname:
1011                    with nogil:
1012                        self.index = sam_index_load2(self.htsfile, cfilename, cindexname)
1013
1014                    if not self.index and (cindexname or require_index):
1015                        if errno:
1016                            raise IOError(errno, force_str(strerror(errno)))
1017                        else:
1018                            raise IOError('unable to open index file `%s`' % self.index_filename)
1019
1020                elif require_index:
1021                    raise IOError('unable to open index file')
1022
1023                # save start of data section
1024                if not self.is_stream:
1025                    self.start_offset = self.tell()
1026
    def fetch(self,
              contig=None,
              start=None,
              stop=None,
              region=None,
              tid=None,
              until_eof=False,
              multiple_iterators=False,
              reference=None,
              end=None):
        """fetch reads aligned in a :term:`region`.

        See :meth:`~pysam.HTSFile.parse_region` for more information
        on how genomic regions can be specified. :term:`reference` and
        `end` are also accepted for backward compatibility as synonyms
        for :term:`contig` and `stop`, respectively.

        Without a `contig` or `region` all mapped reads in the file
        will be fetched. The reads will be returned ordered by reference
        sequence, which will not necessarily be the order within the
        file. This mode of iteration still requires an index. If there is
        no index, use `until_eof=True`.

        If only `contig` is set, all reads aligned to `contig`
        will be fetched.

        A :term:`SAM` file does not allow random access. If `region`
        or `contig` are given, an exception is raised.

        Parameters
        ----------

        until_eof : bool

           If `until_eof` is True, all reads from the current file
           position will be returned in order as they are within the
           file. Using this option will also fetch unmapped reads.

        multiple_iterators : bool

           If `multiple_iterators` is True, multiple
           iterators on the same file can be used at the same time. The
           iterator returned will receive its own copy of a filehandle to
           the file effectively re-opening the file. Re-opening a file
           creates some overhead, so beware.

        Returns
        -------

        An iterator over a collection of reads.

        Raises
        ------

        ValueError
            if the genomic coordinates are out of range or invalid or the
            file does not permit random access to genomic coordinates.

        """
        cdef int rtid, rstart, rstop, has_coord

        if not self.is_open:
            raise ValueError( "I/O operation on closed file" )

        # normalize the various ways of specifying a region into
        # (has_coord, tid, start, stop)
        has_coord, rtid, rstart, rstop = self.parse_region(
            contig, start, stop, region, tid,
            end=end, reference=reference)

        # Turn off re-opening if htsfile is a stream: a stream cannot
        # be re-opened for an independent iterator
        if self.is_stream:
            multiple_iterators = False

        if self.is_bam or self.is_cram:
            # random access requires an index; sequential (until_eof)
            # and remote access do not
            if not until_eof and not self.is_remote:
                if not self.has_index():
                    raise ValueError(
                        "fetch called on bamfile without index")

            if has_coord:
                return IteratorRowRegion(
                    self, rtid, rstart, rstop,
                    multiple_iterators=multiple_iterators)
            else:
                if until_eof:
                    return IteratorRowAll(
                        self,
                        multiple_iterators=multiple_iterators)
                else:
                    # AH: check - reason why no multiple_iterators for
                    # AllRefs?
                    return IteratorRowAllRefs(
                        self,
                        multiple_iterators=multiple_iterators)
        else:
            # SAM: only sequential iteration over a single file handle
            # is possible
            if has_coord:
                raise ValueError(
                    "fetching by region is not available for SAM files")

            if multiple_iterators == True:
                raise ValueError(
                    "multiple iterators not implemented for SAM files")

            return IteratorRowAll(self,
                                  multiple_iterators=multiple_iterators)
1131
1132    def head(self, n, multiple_iterators=True):
1133        '''return an iterator over the first n alignments.
1134
1135        This iterator is is useful for inspecting the bam-file.
1136
1137        Parameters
1138        ----------
1139
1140        multiple_iterators : bool
1141
1142            is set to True by default in order to
1143            avoid changing the current file position.
1144
1145        Returns
1146        -------
1147
1148        an iterator over a collection of reads
1149
1150        '''
1151        return IteratorRowHead(self, n,
1152                               multiple_iterators=multiple_iterators)
1153
    def mate(self, AlignedSegment read):
        '''return the mate of :class:`~pysam.AlignedSegment` `read`.

        .. note::

            Calling this method will change the file position.
            This might interfere with any iterators that have
            not re-opened the file.

        .. note::

           This method is too slow for high-throughput processing.
           If a read needs to be processed with its mate, work
           from a read name sorted file or, better, cache reads.

        Returns
        -------

        :class:`~pysam.AlignedSegment` : the mate

        Raises
        ------

        ValueError
            if the read is unpaired or the mate is unmapped

        '''
        cdef uint32_t flag = read._delegate.core.flag

        # the read must be paired and its mate mapped, otherwise there
        # is nothing that can be fetched
        if flag & BAM_FPAIRED == 0:
            raise ValueError("read %s: is unpaired" %
                             (read.query_name))
        if flag & BAM_FMUNMAP != 0:
            raise ValueError("mate %s: is unmapped" %
                             (read.query_name))

        # xor flags to get the other mate
        cdef int x = BAM_FREAD1 + BAM_FREAD2
        flag = (flag ^ x) & x

        # Make sure to use a separate file to jump around
        # to mate as otherwise the original file position
        # will be lost
        # The following code is not using the C API and
        # could thus be made much quicker, for example
        # by using tell and seek.
        # NOTE(review): the two positional arguments bind to fetch()'s
        # `contig` and `start` parameters, i.e. contig receives an
        # integer position while tid= is supplied separately -
        # presumably tid takes precedence in parse_region; confirm.
        for mate in self.fetch(
                read._delegate.core.mpos,
                read._delegate.core.mpos + 1,
                tid=read._delegate.core.mtid,
                multiple_iterators=True):
            if mate.flag & flag != 0 and \
               mate.query_name == read.query_name:
                break
        else:
            # for/else: loop completed without break, i.e. no match
            raise ValueError("mate not found")

        return mate
1212
    def pileup(self,
               contig=None,
               start=None,
               stop=None,
               region=None,
               reference=None,
               end=None,
               **kwargs):
        """perform a :term:`pileup` within a :term:`region`. The region is
        specified by :term:`contig`, `start` and `stop` (using
        0-based indexing).  :term:`reference` and `end` are also accepted for
        backward compatibility as synonyms for :term:`contig` and `stop`,
        respectively.  Alternatively, a samtools 'region' string
        can be supplied.

        Without 'contig' or 'region' all reads will be used for the
        pileup. The reads will be returned ordered by
        :term:`contig` sequence, which will not necessarily be the
        order within the file.

        Note that :term:`SAM` formatted files do not allow random
        access.  In these files, if a 'region' or 'contig' are
        given an exception is raised.

        .. note::

            'all' reads which overlap the region are returned. The
            first base returned will be the first base of the first
            read 'not' necessarily the first base of the region used
            in the query.

        Parameters
        ----------

        truncate : bool

           By default, the samtools pileup engine outputs all reads
           overlapping a region. If truncate is True and a region is
           given, only columns in the exact region specified are
           returned.

        max_depth : int
           Maximum read depth permitted. The default limit is '8000'.

        stepper : string
           The stepper controls how the iterator advances.
           Possible options for the stepper are

           ``all``
              skip reads in which any of the following flags are set:
              BAM_FUNMAP, BAM_FSECONDARY, BAM_FQCFAIL, BAM_FDUP

           ``nofilter``
              uses every single read turning off any filtering.

           ``samtools``
              same filter and read processing as in :term:`csamtools`
              pileup. For full compatibility, this requires a
              'fastafile' to be given. The following options all pertain
              to filtering of the ``samtools`` stepper.

        fastafile : :class:`~pysam.FastaFile` object.

           This is required for some of the steppers.

        ignore_overlaps: bool

           If set to True, detect if read pairs overlap and only take
           the higher quality base. This is the default.

        flag_filter : int

           ignore reads where any of the bits in the flag are set. The default is
           BAM_FUNMAP | BAM_FSECONDARY | BAM_FQCFAIL | BAM_FDUP.

        flag_require : int

           only use reads where certain flags are set. The default is 0.

        ignore_orphans: bool

            ignore orphans (paired reads that are not in a proper pair).
            The default is to ignore orphans.

        min_base_quality: int

           Minimum base quality. Bases below the minimum quality will
           not be output. The default is 13.

        adjust_capq_threshold: int

           adjust mapping quality. The default is 0 for no
           adjustment. The recommended value for adjustment is 50.

        min_mapping_quality : int

           only use reads above a minimum mapping quality. The default is 0.

        compute_baq: bool

           re-alignment computing per-Base Alignment Qualities (BAQ). The
           default is to do re-alignment. Realignment requires a reference
           sequence. If none is present, no realignment will be performed.

        redo_baq: bool

           recompute per-Base Alignment Quality on the fly ignoring
           existing base qualities. The default is False (use existing
           base qualities).

        Returns
        -------

        an iterator over genomic positions.

        """
        cdef int rtid, has_coord
        cdef int32_t rstart, rstop

        if not self.is_open:
            raise ValueError("I/O operation on closed file")

        # normalize the various ways of specifying a region
        has_coord, rtid, rstart, rstop = self.parse_region(
            contig, start, stop, region, reference=reference, end=end)

        if has_coord:
            # region-restricted pileup requires random access
            if not self.has_index():
                raise ValueError("no index available for pileup")

            return IteratorColumnRegion(self,
                                        tid=rtid,
                                        start=rstart,
                                        stop=rstop,
                                        **kwargs)
        else:
            # whole-file pileup: iterate per reference if an index is
            # available, otherwise sequentially over the whole file
            if self.has_index():
                return IteratorColumnAllRefs(self, **kwargs)
            else:
                return IteratorColumnAll(self, **kwargs)
1352
    def count(self,
              contig=None,
              start=None,
              stop=None,
              region=None,
              until_eof=False,
              read_callback="nofilter",
              reference=None,
              end=None):
        '''count the number of reads in :term:`region`

        The region is specified by :term:`contig`, `start` and `stop`.
        :term:`reference` and `end` are also accepted for backward
        compatibility as synonyms for :term:`contig` and `stop`,
        respectively.  Alternatively, a :term:`samtools` :term:`region`
        string can be supplied.

        A :term:`SAM` file does not allow random access and if
        `region` or `contig` are given, an exception is raised.

        Parameters
        ----------

        contig : string
            reference_name of the genomic region (chromosome)

        start : int
            start of the genomic region (0-based inclusive)

        stop : int
            end of the genomic region (0-based exclusive)

        region : string
            a region string in samtools format.

        until_eof : bool
            count until the end of the file, possibly including
            unmapped reads as well.

        read_callback: string or function

            select a call-back to ignore reads when counting. It can
            be either a string with the following values:

            ``all``
                skip reads in which any of the following
                flags are set: BAM_FUNMAP, BAM_FSECONDARY, BAM_FQCFAIL,
                BAM_FDUP

            ``nofilter``
                uses every single read

            Alternatively, `read_callback` can be a function
            ``check_read(read)`` that should return True only for
            those reads that shall be included in the counting.

        reference : string
            backward compatible synonym for `contig`

        end : int
            backward compatible synonym for `stop`

        Raises
        ------

        ValueError
            if the genomic coordinates are out of range or invalid.

        '''
        cdef AlignedSegment read
        cdef long counter = 0

        if not self.is_open:
            raise ValueError("I/O operation on closed file")

        # encode the callback once before the loop:
        # 1 = "all", 2 = "nofilter", 0 = user-supplied callable
        # NOTE(review): any other string falls through to the callable
        # branch and will fail there - confirm intended.
        cdef int filter_method = 0
        if read_callback == "all":
            filter_method = 1
        elif read_callback == "nofilter":
            filter_method = 2

        for read in self.fetch(contig=contig,
                               start=start,
                               stop=stop,
                               reference=reference,
                               end=end,
                               region=region,
                               until_eof=until_eof):
            # apply filter
            if filter_method == 1:
                # filter = "all"
                # 0x4 FUNMAP, 0x100 FSECONDARY, 0x200 FQCFAIL, 0x400 FDUP
                if (read.flag & (0x4 | 0x100 | 0x200 | 0x400)):
                    continue
            elif filter_method == 2:
                # filter = "nofilter"
                pass
            else:
                if not read_callback(read):
                    continue
            counter += 1

        return counter
1455
    @cython.boundscheck(False)  # we do manual bounds checking
    def count_coverage(self,
                       contig,
                       start=None,
                       stop=None,
                       region=None,
                       quality_threshold=15,
                       read_callback='all',
                       reference=None,
                       end=None):
        """count the coverage of genomic positions by reads in :term:`region`.

        The region is specified by :term:`contig`, `start` and `stop`.
        :term:`reference` and `end` are also accepted for backward
        compatibility as synonyms for :term:`contig` and `stop`,
        respectively.  Alternatively, a :term:`samtools` :term:`region`
        string can be supplied.  The coverage is computed per-base [ACGT].

        Parameters
        ----------

        contig : string
            reference_name of the genomic region (chromosome)

        start : int
            start of the genomic region (0-based inclusive). If not
            given, count from the start of the chromosome.

        stop : int
            end of the genomic region (0-based exclusive). If not given,
            count to the end of the chromosome.

        region : string
            a region string.

        quality_threshold : int
            quality_threshold is the minimum quality score (in phred) a
            base has to reach to be counted.

        read_callback: string or function

            select a call-back to ignore reads when counting. It can
            be either a string with the following values:

            ``all``
                skip reads in which any of the following
                flags are set: BAM_FUNMAP, BAM_FSECONDARY, BAM_FQCFAIL,
                BAM_FDUP

            ``nofilter``
                uses every single read

            Alternatively, `read_callback` can be a function
            ``check_read(read)`` that should return True only for
            those reads that shall be included in the counting.

        reference : string
            backward compatible synonym for `contig`

        end : int
            backward compatible synonym for `stop`

        Raises
        ------

        ValueError
            if the genomic coordinates are out of range or invalid.

        Returns
        -------

        four array.arrays of the same length in order A C G T : tuple

        """

        cdef uint32_t contig_length = self.get_reference_length(contig)
        cdef int _start = start if start is not None else 0
        # clamp stop to the contig length
        cdef int _stop = stop if stop is not None else contig_length
        _stop = _stop if _stop < contig_length else contig_length

        if _stop == _start:
            raise ValueError("interval of size 0")
        if _stop < _start:
            raise ValueError("interval of size less than 0")

        # one unsigned-long counter per position and base
        cdef int length = _stop - _start
        cdef c_array.array int_array_template = array.array('L', [])
        cdef c_array.array count_a
        cdef c_array.array count_c
        cdef c_array.array count_g
        cdef c_array.array count_t
        count_a = c_array.clone(int_array_template, length, zero=True)
        count_c = c_array.clone(int_array_template, length, zero=True)
        count_g = c_array.clone(int_array_template, length, zero=True)
        count_t = c_array.clone(int_array_template, length, zero=True)

        cdef AlignedSegment read
        cdef cython.str seq
        cdef c_array.array quality
        cdef int qpos
        cdef int refpos
        cdef int c = 0
        # 1 = "all", 2 = "nofilter", 0 = user-supplied callable
        cdef int filter_method = 0


        if read_callback == "all":
            filter_method = 1
        elif read_callback == "nofilter":
            filter_method = 2

        cdef int _threshold = quality_threshold or 0
        for read in self.fetch(contig=contig,
                               reference=reference,
                               start=start,
                               stop=stop,
                               end=end,
                               region=region):
            # apply filter
            if filter_method == 1:
                # filter = "all"
                # 0x4 FUNMAP, 0x100 FSECONDARY, 0x200 FQCFAIL, 0x400 FDUP
                if (read.flag & (0x4 | 0x100 | 0x200 | 0x400)):
                    continue
            elif filter_method == 2:
                # filter = "nofilter"
                pass
            else:
                if not read_callback(read):
                    continue

            # count
            seq = read.seq
            if seq is None:
                continue
            quality = read.query_qualities

            # matches_only=True: only aligned (non-None) pairs are returned
            for qpos, refpos in read.get_aligned_pairs(True):
                if qpos is not None and refpos is not None and \
                   _start <= refpos < _stop:

                    # only check base quality if _threshold > 0
                    # NOTE(review): when _threshold > 0 and the read has no
                    # qualities (quality is falsy), the base is skipped
                    # entirely - confirm this is the intended behaviour
                    if (_threshold and quality and quality[qpos] >= _threshold) or not _threshold:
                        if seq[qpos] == 'A':
                            count_a.data.as_ulongs[refpos - _start] += 1
                        if seq[qpos] == 'C':
                            count_c.data.as_ulongs[refpos - _start] += 1
                        if seq[qpos] == 'G':
                            count_g.data.as_ulongs[refpos - _start] += 1
                        if seq[qpos] == 'T':
                            count_t.data.as_ulongs[refpos - _start] += 1

        return count_a, count_c, count_g, count_t
1607
1608    def find_introns_slow(self, read_iterator):
1609        """Return a dictionary {(start, stop): count}
1610        Listing the intronic sites in the reads (identified by 'N' in the cigar strings),
1611        and their support ( = number of reads ).
1612
1613        read_iterator can be the result of a .fetch(...) call.
1614        Or it can be a generator filtering such reads. Example
1615        samfile.find_introns((read for read in samfile.fetch(...) if read.is_reverse)
1616        """
1617        res = collections.Counter()
1618        for r in read_iterator:
1619            if 'N' in r.cigarstring:
1620                last_read_pos = False
1621                for read_loc, genome_loc in r.get_aligned_pairs():
1622                    if read_loc is None and last_read_pos:
1623                        start = genome_loc
1624                    elif read_loc and last_read_pos is None:
1625                        stop = genome_loc  # we are right exclusive ,so this is correct
1626                        res[(start, stop)] += 1
1627                        del start
1628                        del stop
1629                    last_read_pos = read_loc
1630        return res
1631
1632    def find_introns(self, read_iterator):
1633        """Return a dictionary {(start, stop): count}
1634        Listing the intronic sites in the reads (identified by 'N' in the cigar strings),
1635        and their support ( = number of reads ).
1636
1637        read_iterator can be the result of a .fetch(...) call.
1638        Or it can be a generator filtering such reads. Example
1639        samfile.find_introns((read for read in samfile.fetch(...) if read.is_reverse)
1640        """
1641        cdef:
1642            uint32_t base_position, junc_start, nt
1643            int op
1644            AlignedSegment r
1645            int BAM_CREF_SKIP = 3 #BAM_CREF_SKIP
1646
1647        res = collections.Counter()
1648
1649        match_or_deletion = {0, 2, 7, 8} # only M/=/X (0/7/8) and D (2) are related to genome position
1650        for r in read_iterator:
1651            base_position = r.pos
1652
1653            for op, nt in r.cigartuples:
1654                if op in match_or_deletion:
1655                    base_position += nt
1656                elif op == BAM_CREF_SKIP:
1657                    junc_start = base_position
1658                    base_position += nt
1659                    res[(junc_start, base_position)] += 1
1660        return res
1661
1662
    def close(self):
        '''closes the :class:`pysam.AlignmentFile`.'''

        # already closed (or never opened) - nothing to do
        if self.htsfile == NULL:
            return

        # close the htslib handle first and keep its return code so that
        # write errors surfacing at flush time are not lost
        cdef int ret = hts_close(self.htsfile)
        self.htsfile = NULL

        if self.index != NULL:
            hts_idx_destroy(self.index)
            self.index = NULL

        # drop the header reference; header-dependent properties will now
        # raise "header not available in closed files"
        self.header = None

        if ret < 0:
            global errno
            if errno == EPIPE:
                # a broken pipe (e.g. the consumer of a piped stream exited
                # early) is not worth raising on close; reset errno
                errno = 0
            else:
                raise IOError(errno, force_str(strerror(errno)))
1684
    def __dealloc__(self):
        # mirrors close(): release htslib resources if the object is
        # garbage-collected while still open
        cdef int ret = 0

        if self.htsfile != NULL:
            ret = hts_close(self.htsfile)
            self.htsfile = NULL

        if self.index != NULL:
            hts_idx_destroy(self.index)
            self.index = NULL

        self.header = None

        # free the alignment buffer used by the file-level iterator
        if self.b:
            bam_destroy1(self.b)
            self.b = NULL

        if ret < 0:
            global errno
            if errno == EPIPE:
                # broken pipe during teardown is expected when a downstream
                # reader exits early; not an error
                errno = 0
            else:
                raise IOError(errno, force_str(strerror(errno)))
1708
    cpdef int write(self, AlignedSegment read) except -1:
        '''
        write a single :class:`pysam.AlignedSegment` to disk.

        Raises:
            ValueError
                if the read refers to a reference number not present
                in the header
            IOError
                if the writing failed

        Returns:
            int :
                the number of bytes written. If the file is closed,
                this will be 0.
        '''
        if not self.is_open:
            return 0

        # reject reads whose tid points beyond the references declared in
        # this file's header - htslib would otherwise write a broken record
        if self.header.ptr.n_targets <= read._delegate.core.tid:
            raise ValueError(
                "AlignedSegment refers to reference number {} that "
                "is larger than the number of references ({}) in the header".format(
                    read._delegate.core.tid, self.header.ptr.n_targets))

        cdef int ret
        with nogil:
            ret = sam_write1(self.htsfile,
                             self.header.ptr,
                             read._delegate)

        # kbj: Still need to raise an exception with except -1. Otherwise
        #      when ret == -1 we get a "SystemError: error return without
        #      exception set".
        if ret < 0:
            raise IOError(
            "sam_write1 failed with error code {}".format(ret))

        return ret
1745
1746    # context manager interface
    def __enter__(self):
        '''enter the runtime context (``with`` statement); returns self.'''
        return self
1749
    def __exit__(self, exc_type, exc_value, traceback):
        '''exit the runtime context, closing the file.

        Returns False so that any exception raised inside the ``with``
        block is propagated to the caller.
        '''
        self.close()
        return False
1753
1754    ###############################################################
1755    ###############################################################
1756    ###############################################################
1757    ## properties
1758    ###############################################################
    property mapped:
        """int with total number of mapped alignments according to the
        statistics recorded in the index. This is a read-only
        attribute.
        (This will be 0 for a CRAM file indexed by a .crai index, as that
        index format does not record these statistics.)
        """
        def __get__(self):
            # raises if no index is available
            self.check_index()
            cdef int tid
            cdef uint64_t total = 0
            cdef uint64_t mapped, unmapped
            # sum the per-reference mapped counts stored in the index
            # (legacy Cython "for ... from" loop syntax)
            for tid from 0 <= tid < self.header.nreferences:
                with nogil:
                    hts_idx_get_stat(self.index, tid, &mapped, &unmapped)
                total += mapped
            return total
1776
    property unmapped:
        """int with total number of unmapped reads according to the statistics
        recorded in the index. This number of reads includes the number of reads
        without coordinates. This is a read-only attribute.
        (This will be 0 for a CRAM file indexed by a .crai index, as that
        index format does not record these statistics.)
        """
        def __get__(self):
            # raises if no index is available
            self.check_index()
            cdef int tid
            # start with the reads that have no coordinates at all ("*")
            cdef uint64_t total = hts_idx_get_n_no_coor(self.index)
            cdef uint64_t mapped, unmapped
            # then add the per-reference unmapped counts from the index
            for tid from 0 <= tid < self.header.nreferences:
                with nogil:
                    hts_idx_get_stat(self.index, tid, &mapped, &unmapped)
                total += unmapped
            return total
1794
    property nocoordinate:
        """int with total number of reads without coordinates according to the
        statistics recorded in the index, i.e., the statistic printed for "*"
        by the ``samtools idxstats`` command. This is a read-only attribute.
        (This will be 0 for a CRAM file indexed by a .crai index, as that
        index format does not record these statistics.)
        """
        def __get__(self):
            # raises if no index is available
            self.check_index()
            cdef uint64_t n
            with nogil:
                n = hts_idx_get_n_no_coor(self.index)
            return n
1808
    def get_index_statistics(self):
        """return statistics about mapped/unmapped reads per chromosome as
        they are stored in the index, similarly to the statistics printed
        by the ``samtools idxstats`` command.

        CRAI indexes do not record these statistics, so for a CRAM file
        with a .crai index the returned statistics will all be 0.

        Returns:
            list :
                a list of records for each chromosome. Each record has the
                attributes 'contig', 'mapped', 'unmapped' and 'total'.

        Raises:
            ValueError
                if no index is available (via :meth:`check_index`).
        """

        self.check_index()
        cdef int tid
        cdef uint64_t mapped, unmapped
        results = []
        # TODO: use header
        # query the index once per reference; mapped/unmapped are filled
        # in by htslib
        for tid from 0 <= tid < self.nreferences:
            with nogil:
                hts_idx_get_stat(self.index, tid, &mapped, &unmapped)
            results.append(
                IndexStats._make((
                    self.get_reference_name(tid),
                    mapped,
                    unmapped,
                    mapped + unmapped)))

        return results
1839
1840    ###############################################################
1841    ## file-object like iterator access
1842    ## note: concurrent access will cause errors (see IteratorRow
1843    ## and multiple_iterators)
1844    ## Possible solutions: deprecate or open new file handle
    def __iter__(self):
        '''file-object like iteration over all alignments in the file.'''
        if not self.is_open:
            raise ValueError("I/O operation on closed file")

        # a SAM text file without reference information in the header
        # cannot be iterated
        if not self.is_bam and self.header.nreferences == 0:
            raise NotImplementedError(
                "can not iterate over samfile without header")
        return self
1853
    cdef bam1_t * getCurrent(self):
        # return the internal alignment buffer (borrowed pointer; its
        # contents are overwritten by the next call to cnext())
        return self.b
1856
    cdef int cnext(self):
        '''
        cversion of iterator. Used by :class:`pysam.AlignmentFile.IteratorColumn`.

        Reads the next record into self.b. Returns >= 0 on success, -1
        at end of input and < -1 on error (see __next__ for how the
        codes are interpreted).
        '''
        cdef int ret
        cdef bam_hdr_t * hdr = self.header.ptr
        with nogil:
            ret = sam_read1(self.htsfile,
                            hdr,
                            self.b)
        return ret
1868
    def __next__(self):
        cdef int ret = self.cnext()
        if ret >= 0:
            # wrap the record just read into a new AlignedSegment
            return makeAlignedSegment(self.b, self.header)
        elif ret == -1:
            # end of input
            raise StopIteration
        else:
            # read/parse error - translate the htslib error code
            raise IOError(read_failure_reason(ret))
1877
1878    ###########################################
1879    # methods/properties referencing the header
1880    def is_valid_tid(self, int tid):
1881        """
1882        return True if the numerical :term:`tid` is valid; False otherwise.
1883
1884        Note that the unmapped tid code (-1) counts as an invalid.
1885        """
1886        if self.header is None:
1887            raise ValueError("header not available in closed files")
1888        return self.header.is_valid_tid(tid)
1889
1890    def get_tid(self, reference):
1891        """
1892        return the numerical :term:`tid` corresponding to
1893        :term:`reference`
1894
1895        returns -1 if reference is not known.
1896        """
1897        if self.header is None:
1898            raise ValueError("header not available in closed files")
1899        return self.header.get_tid(reference)
1900
1901    def get_reference_name(self, tid):
1902        """
1903        return :term:`reference` name corresponding to numerical :term:`tid`
1904        """
1905        if self.header is None:
1906            raise ValueError("header not available in closed files")
1907        return self.header.get_reference_name(tid)
1908
1909    def get_reference_length(self, reference):
1910        """
1911        return :term:`reference` length corresponding to numerical :term:`tid`
1912        """
1913        if self.header is None:
1914            raise ValueError("header not available in closed files")
1915        return self.header.get_reference_length(reference)
1916
    property nreferences:
        """int with the number of :term:`reference` sequences in the file.
        This is a read-only attribute."""
        def __get__(self):
            # self.header is set to None once the file is closed (see close())
            if self.header:
                return self.header.nreferences
            else:
                raise ValueError("header not available in closed files")
1925
    property references:
        """tuple with the names of :term:`reference` sequences. This is a
        read-only attribute"""
        def __get__(self):
            # self.header is set to None once the file is closed (see close())
            if self.header:
                return self.header.references
            else:
                raise ValueError("header not available in closed files")
1934
    property lengths:
        """tuple of the lengths of the :term:`reference` sequences. This is a
        read-only attribute. The lengths are in the same order as
        :attr:`pysam.AlignmentFile.references`

        """
        def __get__(self):
            # self.header is set to None once the file is closed (see close())
            if self.header:
                return self.header.lengths
            else:
                raise ValueError("header not available in closed files")
1946
1947    # Compatibility functions for pysam < 0.14
    property text:
        """deprecated, use .header directly"""
        def __get__(self):
            # self.header is set to None once the file is closed (see close())
            if self.header:
                # render the header in SAM text form
                return self.header.__str__()
            else:
                raise ValueError("header not available in closed files")
1955
1956    # Compatibility functions for pysam < 0.8.3
    def gettid(self, reference):
        """deprecated, use get_tid() instead"""
        # thin backwards-compatibility alias; delegates unchanged
        return self.get_tid(reference)
1960
    def getrname(self, tid):
        """deprecated, use get_reference_name() instead"""
        # thin backwards-compatibility alias; delegates unchanged
        return self.get_reference_name(tid)
1964
1965
cdef class IteratorRow:
    '''abstract base class for iterators over mapped reads.

    Various iterators implement different behaviours for wrapping around
    contig boundaries. Examples include:

    :class:`pysam.IteratorRowRegion`
        iterate within a single contig and a defined region.

    :class:`pysam.IteratorRowAll`
        iterate until EOF. This iterator will also include unmapped reads.

    :class:`pysam.IteratorRowAllRefs`
        iterate over all reads in all reference sequences.

    The method :meth:`AlignmentFile.fetch` returns an IteratorRow.

    .. note::

        It is usually not necessary to create an object of this class
        explicitly. It is returned as a result of call to a
        :meth:`AlignmentFile.fetch`.

    '''

    def __init__(self, AlignmentFile samfile, int multiple_iterators=False):
        # Parameters:
        #   samfile - open AlignmentFile to iterate over
        #   multiple_iterators - if true, re-open the file so that several
        #       iterators can be active on the same file simultaneously
        cdef char *cfilename
        cdef char *creference_filename
        cdef char *cindexname = NULL

        if not samfile.is_open:
            raise ValueError("I/O operation on closed file")

        # makes sure that samfile stays alive as long as the
        # iterator is alive
        self.samfile = samfile

        # reopen the file - note that this makes the iterator
        # slow and causes pileup to slow down significantly.
        if multiple_iterators:

            cfilename = samfile.filename
            with nogil:
                self.htsfile = hts_open(cfilename, 'r')
            assert self.htsfile != NULL

            if samfile.has_index():
                if samfile.index_filename:
                    # keep a reference (bindex_filename) so the encoded
                    # bytes stay alive while htslib uses the char pointer
                    cindexname = bindex_filename = encode_filename(samfile.index_filename)
                with nogil:
                    self.index = sam_index_load2(self.htsfile, cfilename, cindexname)
            else:
                self.index = NULL

            # need to advance in newly opened file to position after header
            # better: use seek/tell?
            with nogil:
                hdr = sam_hdr_read(self.htsfile)
            if hdr is NULL:
                raise IOError("unable to read header information")
            self.header = makeAlignmentHeader(hdr)

            # this iterator owns the re-opened handle and must close it
            self.owns_samfile = True

            # options specific to CRAM files
            if samfile.is_cram and samfile.reference_filename:
                creference_filename = samfile.reference_filename
                hts_set_opt(self.htsfile,
                            CRAM_OPT_REFERENCE,
                            creference_filename)

        else:
            # share the caller's file handle, index and header
            self.htsfile = samfile.htsfile
            self.index = samfile.index
            self.owns_samfile = False
            self.header = samfile.header

        self.retval = 0

        # alignment buffer reused by every iteration step
        self.b = bam_init1()

    def __dealloc__(self):
        bam_destroy1(self.b)
        # only close/destroy handles if this iterator re-opened the file
        if self.owns_samfile:
            hts_close(self.htsfile)
            hts_idx_destroy(self.index)
2052
2053
cdef class IteratorRowRegion(IteratorRow):
    """*(AlignmentFile samfile, int tid, int beg, int stop,
    int multiple_iterators=False)*

    iterate over mapped reads in a region.

    .. note::

        It is usually not necessary to create an object of this class
        explicitly. It is returned as a result of call to a
        :meth:`AlignmentFile.fetch`.

    """

    def __init__(self, AlignmentFile samfile,
                 int tid, int beg, int stop,
                 int multiple_iterators=False):
        # region iteration requires an index to seek to (tid, beg)

        if not samfile.has_index():
            raise ValueError("no index available for iteration")

        IteratorRow.__init__(self, samfile,
                             multiple_iterators=multiple_iterators)
        # set up the htslib region iterator over [beg, stop) on tid
        with nogil:
            self.iter = sam_itr_queryi(
                self.index,
                tid,
                beg,
                stop)

    def __iter__(self):
        return self

    cdef bam1_t * getCurrent(self):
        # borrowed pointer to the internal buffer; overwritten by cnext()
        return self.b

    cdef int cnext(self):
        '''cversion of iterator. Used by IteratorColumn'''
        # NOTE(review): declared as returning int but has no return
        # statement; callers inspect self.retval instead - confirm this
        # is intentional
        with nogil:
            self.retval = hts_itr_next(hts_get_bgzfp(self.htsfile),
                                       self.iter,
                                       self.b,
                                       self.htsfile)

    def __next__(self):
        self.cnext()
        if self.retval >= 0:
            return makeAlignedSegment(self.b, self.header)
        elif self.retval == -1:
            # iterator exhausted
            raise StopIteration
        elif self.retval == -2:
            # Note: it is currently not the case that hts_iter_next
            # returns -2 for a truncated file.
            # See https://github.com/pysam-developers/pysam/pull/50#issuecomment-64928625
            raise IOError('truncated file')
        else:
            raise IOError("error while reading file {}: {}".format(self.samfile.filename, self.retval))

    def __dealloc__(self):
        hts_itr_destroy(self.iter)
2114
2115
cdef class IteratorRowHead(IteratorRow):
    """*(AlignmentFile samfile, n, int multiple_iterators=False)*

    iterate over first n reads in `samfile`

    .. note::
        It is usually not necessary to create an object of this class
        explicitly. It is returned as a result of call to a
        :meth:`AlignmentFile.head`.

    """

    def __init__(self,
                 AlignmentFile samfile,
                 int n,
                 int multiple_iterators=False):

        IteratorRow.__init__(self, samfile,
                             multiple_iterators=multiple_iterators)

        # stop after returning max_rows reads; current_row counts how
        # many have been returned so far
        self.max_rows = n
        self.current_row = 0

    def __iter__(self):
        return self

    cdef bam1_t * getCurrent(self):
        # borrowed pointer to the internal buffer; overwritten by cnext()
        return self.b

    cdef int cnext(self):
        '''cversion of iterator. Used by IteratorColumn'''
        # >= 0 on success, -1 at end of input, < -1 on error
        cdef int ret
        cdef bam_hdr_t * hdr = self.header.ptr
        with nogil:
            ret = sam_read1(self.htsfile,
                            hdr,
                            self.b)
        return ret

    def __next__(self):
        # honour the head(n) limit first
        if self.current_row >= self.max_rows:
            raise StopIteration

        cdef int ret = self.cnext()
        if ret >= 0:
            self.current_row += 1
            return makeAlignedSegment(self.b, self.header)
        elif ret == -1:
            # end of file before n reads were seen
            raise StopIteration
        else:
            raise IOError(read_failure_reason(ret))
2167
2168
cdef class IteratorRowAll(IteratorRow):
    """*(AlignmentFile samfile, int multiple_iterators=False)*

    iterate over all reads in `samfile`

    .. note::

        It is usually not necessary to create an object of this class
        explicitly. It is returned as a result of call to a
        :meth:`AlignmentFile.fetch`.

    """

    def __init__(self, AlignmentFile samfile,
                 int multiple_iterators=False):

        IteratorRow.__init__(self, samfile,
                             multiple_iterators=multiple_iterators)

    def __iter__(self):
        return self

    cdef bam1_t * getCurrent(self):
        # borrowed pointer to the internal buffer; overwritten by cnext()
        return self.b

    cdef int cnext(self):
        '''cversion of iterator. Used by IteratorColumn'''
        # sequential read; >= 0 on success, -1 at EOF, < -1 on error
        cdef int ret
        cdef bam_hdr_t * hdr = self.header.ptr
        with nogil:
            ret = sam_read1(self.htsfile,
                            hdr,
                            self.b)
        return ret

    def __next__(self):
        cdef int ret = self.cnext()
        if ret >= 0:
            return makeAlignedSegment(self.b, self.header)
        elif ret == -1:
            # end of input
            raise StopIteration
        else:
            raise IOError(read_failure_reason(ret))
2212
2213
cdef class IteratorRowAllRefs(IteratorRow):
    """iterates over all mapped reads by chaining iterators over each
    reference

    .. note::
        It is usually not necessary to create an object of this class
        explicitly. It is returned as a result of call to a
        :meth:`AlignmentFile.fetch`.

    """

    def __init__(self, AlignmentFile samfile,
                 multiple_iterators=False):

        IteratorRow.__init__(self, samfile,
                             multiple_iterators=multiple_iterators)

        if not samfile.has_index():
            raise ValueError("no index available for fetch")

        # tid == -1 marks the "not started yet" state (see __next__)
        self.tid = -1

    def nextiter(self):
        # get a new iterator for a chromosome. The file
        # will not be re-opened.
        self.rowiter = IteratorRowRegion(self.samfile,
                                         self.tid,
                                         0,
                                         MAX_POS)
        # set htsfile and header of the rowiter
        # to the values in this iterator to reflect multiple_iterators
        self.rowiter.htsfile = self.htsfile
        self.rowiter.header = self.header

        # make sure the iterator understand that IteratorRowAllRefs
        # has ownership
        self.rowiter.owns_samfile = False

    def __iter__(self):
        return self

    def __next__(self):
        # Create an initial iterator
        if self.tid == -1:
            if not self.samfile.nreferences:
                raise StopIteration
            self.tid = 0
            self.nextiter()

        while 1:
            self.rowiter.cnext()

            # If current iterator is not exhausted, return aligned read
            # NOTE(review): tests retval > 0, not >= 0; confirm that
            # hts_itr_next never signals a valid record with 0
            if self.rowiter.retval > 0:
                return makeAlignedSegment(self.rowiter.b, self.header)

            # current reference exhausted; advance to the next one
            self.tid += 1

            # Otherwise, proceed to next reference or stop
            if self.tid < self.samfile.nreferences:
                self.nextiter()
            else:
                raise StopIteration
2277
2278
cdef class IteratorRowSelection(IteratorRow):
    """*(AlignmentFile samfile)*

    iterate over reads in `samfile` at a given list of file positions.

    .. note::
        It is usually not necessary to create an object of this class
        explicitly. It is returned as a result of call to a :meth:`AlignmentFile.fetch`.
    """

    def __init__(self, AlignmentFile samfile, positions, int multiple_iterators=True):
        # positions: sequence of virtual file offsets to read records from

        IteratorRow.__init__(self, samfile, multiple_iterators=multiple_iterators)

        self.positions = positions
        # index of the next position to visit
        self.current_pos = 0

    def __iter__(self):
        return self

    cdef bam1_t * getCurrent(self):
        # borrowed pointer to the internal buffer; overwritten by cnext()
        return self.b

    cdef int cnext(self):
        '''cversion of iterator'''
        # end iteration if out of positions
        if self.current_pos >= len(self.positions): return -1

        # seek the underlying bgzf stream to the next stored offset,
        # then read a single record from there
        cdef uint64_t pos = self.positions[self.current_pos]
        with nogil:
            bgzf_seek(hts_get_bgzfp(self.htsfile),
                      pos,
                      0)
        self.current_pos += 1

        cdef int ret
        cdef bam_hdr_t * hdr = self.header.ptr
        with nogil:
            ret = sam_read1(self.htsfile,
                            hdr,
                            self.b)
        return ret

    def __next__(self):
        cdef int ret = self.cnext()
        if ret >= 0:
            return makeAlignedSegment(self.b, self.header)
        elif ret == -1:
            # positions exhausted or end of file
            raise StopIteration
        else:
            raise IOError(read_failure_reason(ret))
2330
2331
cdef int __advance_nofilter(void *data, bam1_t *b):
    '''advance without any read filtering.

    Pileup callback: fetches the next read from the region iterator
    into b and returns the sam_itr_next return code unchanged.
    '''
    cdef __iterdata * d = <__iterdata*>data
    cdef int ret
    with nogil:
        ret = sam_itr_next(d.htsfile, d.iter, b)
    return ret
2340
2341
cdef int __advance_raw_nofilter(void *data, bam1_t *b):
    '''advance (without iterator) without any read filtering.

    Pileup callback for whole-file pileups: reads the next record
    sequentially with sam_read1 and returns its return code unchanged.
    '''
    cdef __iterdata * d = <__iterdata*>data
    cdef int ret
    with nogil:
        ret = sam_read1(d.htsfile, d.header, b)
    return ret
2350
2351
cdef int __advance_all(void *data, bam1_t *b):
    '''only use reads for pileup passing basic filters such as

    BAM_FUNMAP, BAM_FSECONDARY, BAM_FQCFAIL, BAM_FDUP

    Pileup callback: keeps fetching reads from the region iterator
    until one passes the configured flag filter (d.flag_filter) or the
    iterator is exhausted. Returns the last sam_itr_next return code.
    '''

    cdef __iterdata * d = <__iterdata*>data
    # note: the filter mask applied below is d.flag_filter, which is
    # configured by IteratorColumn (default FUNMAP|FSECONDARY|FQCFAIL|FDUP)
    cdef int ret
    while 1:
        with nogil:
            ret = sam_itr_next(d.htsfile, d.iter, b)
        if ret < 0:
            # end of iteration or read error - propagate to the caller
            break
        if b.core.flag & d.flag_filter:
            # read fails the flag filter - skip it
            continue
        break
    return ret
2370
2371
cdef int __advance_raw_all(void *data, bam1_t *b):
    '''only use reads for pileup passing basic filters such as

    BAM_FUNMAP, BAM_FSECONDARY, BAM_FQCFAIL, BAM_FDUP

    Like __advance_all, but reads sequentially (no region iterator).
    '''

    cdef __iterdata * d = <__iterdata*>data
    cdef int ret
    while 1:
        with nogil:
            ret = sam_read1(d.htsfile, d.header, b)
        if ret < 0:
            # end of file or read error - propagate to the caller
            break
        if b.core.flag & d.flag_filter:
            # read fails the configured flag filter - skip it
            continue
        break
    return ret
2389
2390
cdef int __advance_samtools(void * data, bam1_t * b):
    '''advance using same filter and read processing as in
    the samtools pileup.

    Applies, in order: flag filtering (flag_filter / flag_require),
    optional BAQ re-alignment against the reference sequence, optional
    mapping-quality capping, a minimum mapping-quality cutoff and
    optional orphan-read removal. Returns the htslib read return code.
    '''
    cdef __iterdata * d = <__iterdata*>data
    cdef int ret
    cdef int q

    while 1:
        # use the region iterator when one is set, else read sequentially
        with nogil:
            ret = sam_itr_next(d.htsfile, d.iter, b) if d.iter else sam_read1(d.htsfile, d.header, b)
        if ret < 0:
            break
        if b.core.flag & d.flag_filter:
            continue
        if d.flag_require and not (b.core.flag & d.flag_require):
            continue

        # reload sequence
        # (the reference for the read's contig is cached in d.seq and
        # only re-fetched when the read is on a different tid)
        if d.fastafile != NULL and b.core.tid != d.tid:
            if d.seq != NULL:
                free(d.seq)
            d.tid = b.core.tid
            with nogil:
                d.seq = faidx_fetch_seq(
                    d.fastafile,
                    d.header.target_name[d.tid],
                    0, MAX_POS,
                    &d.seq_len)

            if d.seq == NULL:
                raise ValueError(
                    "reference sequence for '{}' (tid={}) not found".format(
                        d.header.target_name[d.tid], d.tid))

        # realign read - changes base qualities
        if d.seq != NULL and d.compute_baq:
            # 4th option to realign is flag:
            # apply_baq = flag&1, extend_baq = flag&2, redo_baq = flag&4
            if d.redo_baq:
                sam_prob_realn(b, d.seq, d.seq_len, 7)
            else:
                sam_prob_realn(b, d.seq, d.seq_len, 3)

        # cap mapping quality based on the reference, as samtools does
        if d.seq != NULL and d.adjust_capq_threshold > 10:
            q = sam_cap_mapq(b, d.seq, d.seq_len, d.adjust_capq_threshold)
            if q < 0:
                continue
            elif b.core.qual > q:
                b.core.qual = q

        if b.core.qual < d.min_mapping_quality:
            continue
        # drop paired reads that are not in a proper pair ("orphans")
        if d.ignore_orphans and b.core.flag & BAM_FPAIRED and not (b.core.flag & BAM_FPROPER_PAIR):
            continue

        break

    return ret
2450
2451
cdef class IteratorColumn:
    '''abstract base class for iterators over columns.

    IteratorColumn objects wrap the pileup functionality of samtools.

    For reasons of efficiency, the iterator points to the current
    pileup buffer. The pileup buffer is updated at every iteration.
    This might cause some unexpected behaviour. For example,
    consider the conversion to a list::

       f = AlignmentFile("file.bam", "rb")
       result = list(f.pileup())

    Here, ``result`` will contain ``n`` objects of type
    :class:`~pysam.PileupColumn` for ``n`` columns, but each object in
    ``result`` will contain the same information.

    The desired behaviour can be achieved by list comprehension::

       result = [x.pileups() for x in f.pileup()]

    ``result`` will be a list of ``n`` lists of objects of type
    :class:`~pysam.PileupRead`.

    If the iterator is associated with a :class:`~pysam.Fastafile`
    using the :meth:`add_reference` method, then the iterator will
    export the current sequence via the methods :meth:`get_sequence`
    and :meth:`seq_len`.

    See :class:`~AlignmentFile.pileup` for kwargs to the iterator.
    '''

    def __cinit__(self, AlignmentFile samfile, **kwargs):
        # Python-level configuration of the pileup.
        self.samfile = samfile
        self.fastafile = kwargs.get("fastafile", None)
        self.stepper = kwargs.get("stepper", "samtools")
        self.max_depth = kwargs.get("max_depth", 8000)
        self.ignore_overlaps = kwargs.get("ignore_overlaps", True)
        self.min_base_quality = kwargs.get("min_base_quality", 13)
        # C-level per-iterator state passed to the __advance_* callbacks.
        self.iterdata.seq = NULL
        self.iterdata.min_mapping_quality = kwargs.get("min_mapping_quality", 0)
        self.iterdata.flag_require = kwargs.get("flag_require", 0)
        self.iterdata.flag_filter = kwargs.get("flag_filter", BAM_FUNMAP | BAM_FSECONDARY | BAM_FQCFAIL | BAM_FDUP)
        self.iterdata.adjust_capq_threshold = kwargs.get("adjust_capq_threshold", 0)
        self.iterdata.compute_baq = kwargs.get("compute_baq", True)
        self.iterdata.redo_baq = kwargs.get("redo_baq", False)
        self.iterdata.ignore_orphans = kwargs.get("ignore_orphans", True)

        self.tid = 0
        self.pos = 0
        self.n_plp = 0
        self.plp = NULL
        self.pileup_iter = <bam_mplp_t>NULL

    def __iter__(self):
        return self

    cdef int cnext(self):
        '''perform next iteration.

        Returns the status from bam_mplp_auto: > 0 if a column was
        produced, 0 at end of input, < 0 on error. Updates
        ``self.tid``, ``self.pos``, ``self.n_plp`` and ``self.plp``
        in place.
        '''
        # do not release gil here because of call-backs
        cdef int ret = bam_mplp_auto(self.pileup_iter,
                                     &self.tid,
                                     &self.pos,
                                     &self.n_plp,
                                     &self.plp)
        return ret

    cdef char * get_sequence(self):
        '''return current reference sequence underlying the iterator.
        '''
        return self.iterdata.seq

    property seq_len:
        '''current sequence length.'''
        def __get__(self):
            return self.iterdata.seq_len

    def add_reference(self, FastaFile fastafile):
        '''
        add reference sequences in `fastafile` to iterator.'''
        self.fastafile = fastafile
        if self.iterdata.seq != NULL:
            free(self.iterdata.seq)
            # Reset the pointer after freeing, otherwise a second call
            # to add_reference (or a later reset()/__dealloc__) would
            # free the same memory again via the dangling pointer.
            self.iterdata.seq = NULL
        # invalidate the cached sequence so it is re-fetched lazily
        self.iterdata.tid = -1
        self.iterdata.fastafile = self.fastafile.fastafile

    def has_reference(self):
        '''
        return true if iterator is associated with a reference'''
        return self.fastafile

    cdef _setup_iterator(self,
                         int tid,
                         int start,
                         int stop,
                         int multiple_iterators=0):
        '''setup the iterator structure for iteration over a region.

        Creates an index-based row iterator over ``tid:start-stop``
        and initializes the multi-pileup engine with the stepper
        selected at construction time.
        '''

        self.iter = IteratorRowRegion(self.samfile, tid, start, stop, multiple_iterators)
        self.iterdata.htsfile = self.samfile.htsfile
        self.iterdata.iter = self.iter.iter
        self.iterdata.seq = NULL
        self.iterdata.tid = -1
        self.iterdata.header = self.samfile.header.ptr

        if self.fastafile is not None:
            self.iterdata.fastafile = self.fastafile.fastafile
        else:
            self.iterdata.fastafile = NULL

        # Free any previously allocated memory before reassigning
        # pileup_iter
        self._free_pileup_iter()

        cdef void * data[1]
        data[0] = <void*>&self.iterdata

        if self.stepper is None or self.stepper == "all":
            with nogil:
                self.pileup_iter = bam_mplp_init(1,
                                                 <bam_plp_auto_f>&__advance_all,
                                                 data)
        elif self.stepper == "nofilter":
            with nogil:
                self.pileup_iter = bam_mplp_init(1,
                                                 <bam_plp_auto_f>&__advance_nofilter,
                                                 data)
        elif self.stepper == "samtools":
            with nogil:
                self.pileup_iter = bam_mplp_init(1,
                                                 <bam_plp_auto_f>&__advance_samtools,
                                                 data)
        else:
            raise ValueError(
                "unknown stepper option `%s` in IteratorColumn" % self.stepper)

        if self.max_depth:
            with nogil:
                bam_mplp_set_maxcnt(self.pileup_iter, self.max_depth)

        if self.ignore_overlaps:
            with nogil:
                bam_mplp_init_overlaps(self.pileup_iter)

    cdef _setup_raw_rest_iterator(self):
        '''set up an "iterator" that just uses sam_read1(), similar to HTS_IDX_REST

        Unlike :meth:`_setup_iterator` this does not require an index;
        the raw ``__advance_raw_*`` callbacks read records sequentially
        from the current file position.
        '''

        self.iter = None
        self.iterdata.iter = NULL
        self.iterdata.htsfile = self.samfile.htsfile
        self.iterdata.seq = NULL
        self.iterdata.tid = -1
        self.iterdata.header = self.samfile.header.ptr

        if self.fastafile is not None:
            self.iterdata.fastafile = self.fastafile.fastafile
        else:
            self.iterdata.fastafile = NULL

        # Free any previously allocated memory before reassigning
        # pileup_iter
        self._free_pileup_iter()

        cdef void * data[1]
        data[0] = <void*>&self.iterdata

        if self.stepper is None or self.stepper == "all":
            with nogil:
                self.pileup_iter = bam_mplp_init(1,
                                                 <bam_plp_auto_f>&__advance_raw_all,
                                                 data)
        elif self.stepper == "nofilter":
            with nogil:
                self.pileup_iter = bam_mplp_init(1,
                                                 <bam_plp_auto_f>&__advance_raw_nofilter,
                                                 data)
        elif self.stepper == "samtools":
            with nogil:
                self.pileup_iter = bam_mplp_init(1,
                                                 <bam_plp_auto_f>&__advance_samtools,
                                                 data)
        else:
            raise ValueError(
                "unknown stepper option `%s` in IteratorColumn" % self.stepper)

        if self.max_depth:
            with nogil:
                bam_mplp_set_maxcnt(self.pileup_iter, self.max_depth)

        if self.ignore_overlaps:
            with nogil:
                bam_mplp_init_overlaps(self.pileup_iter)

    cdef reset(self, tid, start, stop):
        '''reset iterator position.

        This permits using the iterator multiple times without
        having to incur the full set-up costs.
        '''
        if self.iter is None:
            raise TypeError("Raw iterator set up without region cannot be reset")

        self.iter = IteratorRowRegion(self.samfile, tid, start, stop, multiple_iterators=0)
        self.iterdata.iter = self.iter.iter

        # invalidate sequence if different tid
        if self.tid != tid:
            if self.iterdata.seq != NULL:
                free(self.iterdata.seq)
            self.iterdata.seq = NULL
            self.iterdata.tid = -1

        with nogil:
            bam_mplp_reset(self.pileup_iter)

    cdef _free_pileup_iter(self):
        '''free the memory alloc'd by bam_plp_init.

        This is needed before setup_iterator allocates another
        pileup_iter, or else memory will be lost.  '''
        if self.pileup_iter != <bam_mplp_t>NULL:
            with nogil:
                bam_mplp_reset(self.pileup_iter)
                bam_mplp_destroy(self.pileup_iter)
                self.pileup_iter = <bam_mplp_t>NULL

    def __dealloc__(self):
        # reset in order to avoid memory leak messages for iterators
        # that have not been fully consumed
        self._free_pileup_iter()
        self.plp = <const bam_pileup1_t*>NULL

        if self.iterdata.seq != NULL:
            free(self.iterdata.seq)
            self.iterdata.seq = NULL

    # backwards compatibility

    def hasReference(self):
        return self.has_reference()
    cdef char * getSequence(self):
        return self.get_sequence()
    def addReference(self, FastaFile fastafile):
        return self.add_reference(fastafile)
2700
2701
cdef class IteratorColumnRegion(IteratorColumn):
    '''iterates over a region only.
    '''
    def __cinit__(self,
                  AlignmentFile samfile,
                  int tid = 0,
                  int start = 0,
                  int stop = MAX_POS,
                  int truncate = False,
                  int multiple_iterators = True,
                  **kwargs ):

        # CRAM files do not support multiple iterators; silently fall
        # back to a single iterator with a warning.
        if multiple_iterators and samfile.is_cram:
            warnings.warn("multiple_iterators not implemented for CRAM")
            multiple_iterators = False

        self._setup_iterator(tid, start, stop, multiple_iterators)
        self.start = start
        self.stop = stop
        self.truncate = truncate

    def __next__(self):
        '''return the next pileup column within the region.'''
        cdef int status

        while True:
            status = self.cnext()
            if status < 0:
                raise ValueError("error during iteration" )
            if status == 0:
                raise StopIteration

            # with truncate, only report columns strictly inside
            # [start, stop); columns before start are skipped, a column
            # at or past stop ends the iteration.
            if self.truncate:
                if self.start > self.pos:
                    continue
                if self.pos >= self.stop:
                    raise StopIteration

            return makePileupColumn(&self.plp,
                                    self.tid,
                                    self.pos,
                                    self.n_plp,
                                    self.min_base_quality,
                                    self.iterdata.seq,
                                    self.samfile.header)
2750
2751
cdef class IteratorColumnAllRefs(IteratorColumn):
    """iterates over all columns by chaining iterators over each reference
    """

    def __cinit__(self,
                  AlignmentFile samfile,
                  **kwargs):

        # refuse iteration over files without any reference sequences
        if not samfile.nreferences:
            raise StopIteration

        # start with the first reference (self.tid is 0 after base init)
        self._setup_iterator(self.tid, 0, MAX_POS, 1)

    def __next__(self):
        '''return the next pileup column, moving across references.'''
        cdef int status

        while True:
            status = self.cnext()
            if status < 0:
                raise ValueError("error during iteration")

            if status > 0:
                # a column on the current reference
                return makePileupColumn(&self.plp,
                                        self.tid,
                                        self.pos,
                                        self.n_plp,
                                        self.min_base_quality,
                                        self.iterdata.seq,
                                        self.samfile.header)

            # current reference is exhausted: chain to the next one,
            # or stop when all references have been consumed.
            self.tid += 1
            if self.tid >= self.samfile.nreferences:
                raise StopIteration
            self._setup_iterator(self.tid, 0, MAX_POS, 0)
2792
2793
cdef class IteratorColumnAll(IteratorColumn):
    """iterates over all columns, without using an index
    """

    def __cinit__(self,
                  AlignmentFile samfile,
                  **kwargs):
        # sequential sam_read1()-based iteration; no index required
        self._setup_raw_rest_iterator()

    def __next__(self):
        '''return the next pileup column.'''
        cdef int status = self.cnext()

        if status < 0:
            raise ValueError("error during iteration")
        if status == 0:
            raise StopIteration

        return makePileupColumn(&self.plp,
                                self.tid,
                                self.pos,
                                self.n_plp,
                                self.min_base_quality,
                                self.iterdata.seq,
                                self.samfile.header)
2821
2822
cdef class SNPCall:
    '''the results of a SNP call.'''
    # raw call data, filled in by the caller that constructs the object
    cdef int _tid
    cdef int _pos
    cdef char _reference_base
    cdef char _genotype
    cdef int _consensus_quality
    cdef int _snp_quality
    cdef int _rms_mapping_quality
    cdef int _coverage

    property tid:
        '''the chromosome ID as is defined in the header'''
        def __get__(self):
            return self._tid

    property pos:
        '''nucleotide position of SNP.'''
        def __get__(self):
            return self._pos

    property reference_base:
        '''reference base at pos. ``N`` if no reference sequence supplied.'''
        def __get__(self):
            return from_string_and_size(&self._reference_base, 1)

    property genotype:
        '''the genotype called.'''
        def __get__(self):
            return from_string_and_size(&self._genotype, 1)

    property consensus_quality:
        '''the genotype quality (Phred-scaled).'''
        def __get__(self):
            return self._consensus_quality

    property snp_quality:
        '''the snp quality (Phred scaled) - probability of consensus being
        identical to reference sequence.'''
        def __get__(self):
            return self._snp_quality

    property mapping_quality:
        '''the root mean square (rms) of the mapping quality of all reads
        involved in the call.'''
        def __get__(self):
            return self._rms_mapping_quality

    property coverage:
        '''coverage or read depth - the number of reads involved in the call.'''
        def __get__(self):
            return self._coverage

    def __str__(self):
        # tab-separated rendering of all call fields
        fields = (self.tid,
                  self.pos,
                  self.reference_base,
                  self.genotype,
                  self.consensus_quality,
                  self.snp_quality,
                  self.mapping_quality,
                  self.coverage)
        return "\t".join(str(field) for field in fields)
2880
2881
cdef class IndexedReads:
    """Index a Sam/BAM-file by query name while keeping the
    original sort order intact.

    The index is kept in memory and can be substantial.

    By default, the file is re-opened to avoid conflicts if multiple
    operators work on the same file. Set `multiple_iterators` = False
    to not re-open `samfile`.

    Parameters
    ----------

    samfile : AlignmentFile
        File to be indexed.

    multiple_iterators : bool
        Flag indicating whether the file should be reopened. Reopening prevents
        existing iterators being affected by the indexing.

    """

    def __init__(self, AlignmentFile samfile, int multiple_iterators=True):
        cdef char *cfilename

        # makes sure that samfile stays alive as long as this
        # object is alive.
        self.samfile = samfile
        cdef bam_hdr_t * hdr = NULL
        assert samfile.is_bam, "can only apply IndexReads on bam files"

        # multiple_iterators the file - note that this makes the iterator
        # slow and causes pileup to slow down significantly.
        if multiple_iterators:
            cfilename = samfile.filename
            with nogil:
                self.htsfile = hts_open(cfilename, 'r')
            if self.htsfile == NULL:
                raise OSError("unable to reopen htsfile")

            # need to advance in newly opened file to position after header
            # better: use seek/tell?
            with nogil:
                hdr = sam_hdr_read(self.htsfile)
            if hdr == NULL:
                raise OSError("unable to read header information")
            self.header = makeAlignmentHeader(hdr)
            self.owns_samfile = True
        else:
            # share the already-open handle; the caller's file position
            # will be moved by build()
            self.htsfile = self.samfile.htsfile
            self.header = samfile.header
            self.owns_samfile = False

    def build(self):
        '''build the index.

        Maps each query name to the list of file offsets of its
        alignment records. Indexing starts from the current file
        position.
        '''

        self.index = collections.defaultdict(list)

        cdef int ret = 1
        cdef bam1_t * b = <bam1_t*>calloc(1, sizeof( bam1_t))
        if b == NULL:
            raise MemoryError("could not allocate {} bytes".format(sizeof(bam1_t)))

        cdef uint64_t pos
        cdef bam_hdr_t * hdr = self.header.ptr

        try:
            while ret > 0:
                with nogil:
                    # record the offset *before* reading so it can be
                    # used later to seek back to this alignment
                    pos = bgzf_tell(hts_get_bgzfp(self.htsfile))
                    ret = sam_read1(self.htsfile,
                                    hdr,
                                    b)

                if ret > 0:
                    qname = charptr_to_str(pysam_bam_get_qname(b))
                    self.index[qname].append(pos)
        finally:
            # always release the record buffer, even if decoding the
            # query name or growing the index raised an exception
            bam_destroy1(b)

    def find(self, query_name):
        '''find `query_name` in index.

        Returns
        -------

        IteratorRowSelection
            Returns an iterator over all reads with query_name.

        Raises
        ------

        KeyError
            if the `query_name` is not in the index.

        '''
        # NOTE: self.index is a defaultdict, so a membership test must
        # be used here - subscripting a missing key would silently
        # insert an empty entry instead of failing.
        if query_name in self.index:
            return IteratorRowSelection(
                self.samfile,
                self.index[query_name],
                multiple_iterators = False)
        else:
            raise KeyError("read %s not found" % query_name)

    def __dealloc__(self):
        # only close the handle if it was (re)opened by this object
        if self.owns_samfile:
            hts_close(self.htsfile)
2989