1# cython: embedsignature=True 2# cython: profile=True 3######################################################## 4######################################################## 5# Cython wrapper for SAM/BAM/CRAM files based on htslib 6######################################################## 7# The principal classes defined in this module are: 8# 9# class AlignmentFile read/write access to SAM/BAM/CRAM formatted files 10# 11# class AlignmentHeader manage SAM/BAM/CRAM header data 12# 13# class IndexedReads index a SAM/BAM/CRAM file by query name while keeping 14# the original sort order intact 15# 16# Additionally this module defines numerous additional classes that 17# are part of the internal API. These are: 18# 19# Various iterator classes to iterate over alignments in sequential 20# (IteratorRow) or in a stacked fashion (IteratorColumn): 21# 22# class IteratorRow 23# class IteratorRowRegion 24# class IteratorRowHead 25# class IteratorRowAll 26# class IteratorRowAllRefs 27# class IteratorRowSelection 28# class IteratorColumn 29# class IteratorColumnRegion 30# class IteratorColumnAll 31# class IteratorColumnAllRefs 32# 33######################################################## 34# 35# The MIT License 36# 37# Copyright (c) 2015 Andreas Heger 38# 39# Permission is hereby granted, free of charge, to any person obtaining a 40# copy of this software and associated documentation files (the "Software"), 41# to deal in the Software without restriction, including without limitation 42# the rights to use, copy, modify, merge, publish, distribute, sublicense, 43# and/or sell copies of the Software, and to permit persons to whom the 44# Software is furnished to do so, subject to the following conditions: 45# 46# The above copyright notice and this permission notice shall be included in 47# all copies or substantial portions of the Software. 
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
# DEALINGS IN THE SOFTWARE.
#
########################################################
import os
import collections
try:
    from collections.abc import Sequence, Mapping  # noqa
except ImportError:
    from collections import Sequence, Mapping  # noqa
import re
import warnings
import array
from libc.errno cimport errno, EPIPE
from libc.string cimport strcmp, strpbrk, strerror
from libc.stdint cimport INT32_MAX

from cpython cimport array as c_array
from cpython.version cimport PY_MAJOR_VERSION

from pysam.libcutils cimport force_bytes, force_str, charptr_to_str
from pysam.libcutils cimport encode_filename, from_string_and_size
from pysam.libcalignedsegment cimport makeAlignedSegment, makePileupColumn
from pysam.libchtslib cimport HTSFile, hisremote

if PY_MAJOR_VERSION >= 3:
    from io import StringIO
else:
    from StringIO import StringIO

cimport cython


__all__ = [
    "AlignmentFile",
    "AlignmentHeader",
    "IteratorRow",
    "IteratorColumn",
    "IndexedReads"]

# per-contig mapping statistics as returned by the index
IndexStats = collections.namedtuple("IndexStats",
                                    ("contig",
                                     "mapped",
                                     "unmapped",
                                     "total"))

########################################################
## global variables
# maximum genomic coordinate
# for some reason, using 'int' causes overflow
cdef int MAX_POS = (1 << 31) - 1

# valid types for SAM headers
VALID_HEADER_TYPES = {"HD" : Mapping,
                      "SQ" : Sequence,
                      "RG" : Sequence,
                      "PG" : Sequence,
                      "CO" : Sequence}

# order of
records within SAM headers
VALID_HEADERS = ("HD", "SQ", "RG", "PG", "CO")

# default type conversions within SAM header records
KNOWN_HEADER_FIELDS = {"HD" : {"VN" : str, "SO" : str, "GO" : str},
                       "SQ" : {"SN" : str, "LN" : int, "AS" : str,
                               "M5" : str, "SP" : str, "UR" : str,
                               "AH" : str,},
                       "RG" : {"ID" : str, "CN" : str, "DS" : str,
                               "DT" : str, "FO" : str, "KS" : str,
                               "LB" : str, "PG" : str, "PI" : str,
                               "PL" : str, "PM" : str, "PU" : str,
                               "SM" : str,},
                       "PG" : {"ID" : str, "PN" : str, "CL" : str,
                               "PP" : str, "DS" : str, "VN" : str,},}

# output order of fields within records. Ensure that CL is at
# the end as parsing a CL will ignore any subsequent records.
VALID_HEADER_ORDER = {"HD" : ("VN", "SO", "GO"),
                      "SQ" : ("SN", "LN", "AS", "M5",
                              "UR", "SP", "AH"),
                      "RG" : ("ID", "CN", "SM", "LB",
                              "PU", "PI", "DT", "DS",
                              "PL", "FO", "KS", "PG",
                              "PM"),
                      "PG" : ("PN", "ID", "VN", "PP",
                              "DS", "CL"),}


def build_header_line(fields, record):
    """Assemble one tab-separated header line for record type `record`.

    For "CO" records `fields` is the free-text comment itself; otherwise
    it is a mapping of tag -> value.  Spec-defined record types emit
    their tags in `VALID_HEADER_ORDER` followed by any non-uppercase
    (user) tags; lower-case (user-defined) record types emit all tags
    alphabetically.  Values are converted with str().
    """
    # TODO: add checking for field and sort order
    parts = ["@%s" % record]
    if record == "CO":
        # comments carry a free-text payload, not tag:value pairs
        parts.append(fields)
    elif record.islower():
        # user-defined record type: tags in alphabetical order
        parts.extend("%s:%s" % (tag, str(fields[tag]))
                     for tag in sorted(fields))
    else:
        # spec-defined record type: canonical tag order first ...
        parts.extend("%s:%s" % (tag, str(fields[tag]))
                     for tag in VALID_HEADER_ORDER[record] if tag in fields)
        # ... then any user tags (anything not fully upper-case)
        parts.extend("%s:%s" % (tag, str(fields[tag]))
                     for tag in fields if not tag.isupper())

    return "\t".join(parts)


cdef AlignmentHeader makeAlignmentHeader(bam_hdr_t *hdr):
    if not hdr:
        raise ValueError('cannot create AlignmentHeader, received NULL pointer')

    # check: is AlignmentHeader.__cinit__ called?
173 cdef AlignmentHeader header = AlignmentHeader.__new__(AlignmentHeader) 174 header.ptr = hdr 175 176 return header 177 178def read_failure_reason(code): 179 if code == -2: 180 return 'truncated file' 181 else: 182 return "error {} while reading file".format(code) 183 184 185# the following should be class-method for VariantHeader, but cdef @classmethods 186# are not implemented in cython. 187cdef int fill_AlignmentHeader_from_list(bam_hdr_t *dest, 188 reference_names, 189 reference_lengths, 190 add_sq_text=True, 191 text=None) except -1: 192 """build header from list of reference names and lengths. 193 """ 194 195cdef class AlignmentHeader(object): 196 """header information for a :class:`AlignmentFile` object 197 198 Parameters 199 ---------- 200 header_dict : dict 201 build header from a multi-level dictionary. The 202 first level are the four types ('HD', 'SQ', ...). The second 203 level are a list of lines, with each line being a list of 204 tag-value pairs. The header is constructed first from all the 205 defined fields, followed by user tags in alphabetical 206 order. Alternatively, an :class:`~pysam.AlignmentHeader` 207 object can be passed directly. 208 209 text : string 210 use the string provided as the header 211 212 reference_names : list 213 see reference_lengths 214 215 reference_lengths : list 216 build header from list of chromosome names and lengths. By 217 default, 'SQ' and 'LN' tags will be added to the header 218 text. This option can be changed by unsetting the flag 219 `add_sq_text`. 220 221 add_sq_text : bool 222 do not add 'SQ' and 'LN' tags to header. This option permits 223 construction :term:`SAM` formatted files without a header. 
224 225 """ 226 227 # See makeVariantHeader for C constructor 228 def __cinit__(self): 229 self.ptr = NULL 230 231 # Python constructor 232 def __init__(self): 233 self.ptr = bam_hdr_init() 234 if self.ptr is NULL: 235 raise MemoryError("could not create header") 236 237 @classmethod 238 def _from_text_and_lengths(cls, text, reference_names, reference_lengths): 239 240 cdef AlignmentHeader self = AlignmentHeader() 241 cdef char *ctext 242 cdef int l_text 243 cdef int n, x 244 if text is not None: 245 btext = force_bytes(text) 246 ctext = btext 247 l_text = len(btext) 248 self.ptr.text = <char*>calloc(l_text + 1, sizeof(char)) 249 if self.ptr.text == NULL: 250 raise MemoryError("could not allocate {} bytes".format(l_text + 1), sizeof(char)) 251 self.ptr.l_text = l_text 252 memcpy(self.ptr.text, ctext, l_text + 1) 253 254 if reference_names and reference_lengths: 255 reference_names = [force_bytes(ref) for ref in reference_names] 256 257 self.ptr.n_targets = len(reference_names) 258 259 n = sum([len(reference_names) + 1]) 260 self.ptr.target_name = <char**>calloc(n, sizeof(char*)) 261 if self.ptr.target_name == NULL: 262 raise MemoryError("could not allocate {} bytes".format(n, sizeof(char *))) 263 264 self.ptr.target_len = <uint32_t*>calloc(n, sizeof(uint32_t)) 265 if self.ptr.target_len == NULL: 266 raise MemoryError("could not allocate {} bytes".format(n, sizeof(uint32_t))) 267 268 for x from 0 <= x < self.ptr.n_targets: 269 self.ptr.target_len[x] = reference_lengths[x] 270 name = reference_names[x] 271 self.ptr.target_name[x] = <char*>calloc(len(name) + 1, sizeof(char)) 272 if self.ptr.target_name[x] == NULL: 273 raise MemoryError("could not allocate {} bytes".format(len(name) + 1, sizeof(char))) 274 strncpy(self.ptr.target_name[x], name, len(name)) 275 276 return self 277 278 @classmethod 279 def from_text(cls, text): 280 281 reference_names, reference_lengths = [], [] 282 for line in text.splitlines(): 283 if line.startswith("@SQ"): 284 fields = 
dict([x.split(":", 1) for x in line.split("\t")[1:]]) 285 try: 286 reference_names.append(fields["SN"]) 287 reference_lengths.append(int(fields["LN"])) 288 except KeyError: 289 raise KeyError("incomplete sequence information in '%s'" % str(fields)) 290 except ValueError: 291 raise ValueError("wrong sequence information in '%s'" % str(fields)) 292 293 return cls._from_text_and_lengths(text, reference_names, reference_lengths) 294 295 @classmethod 296 def from_dict(cls, header_dict): 297 298 cdef list lines = [] 299 # first: defined tags 300 for record in VALID_HEADERS: 301 if record in header_dict: 302 data = header_dict[record] 303 if not isinstance(data, VALID_HEADER_TYPES[record]): 304 raise ValueError( 305 "invalid type for record %s: %s, expected %s".format( 306 record, type(data), VALID_HEADER_TYPES[record])) 307 if isinstance(data, Mapping): 308 lines.append(build_header_line(data, record)) 309 else: 310 for fields in header_dict[record]: 311 lines.append(build_header_line(fields, record)) 312 313 # then: user tags (lower case), sorted alphabetically 314 for record, data in sorted(header_dict.items()): 315 if record in VALID_HEADERS: 316 continue 317 if isinstance(data, Mapping): 318 lines.append(build_header_line(data, record)) 319 else: 320 for fields in header_dict[record]: 321 lines.append(build_header_line(fields, record)) 322 323 text = "\n".join(lines) + "\n" 324 325 reference_names, reference_lengths = [], [] 326 if "SQ" in header_dict: 327 for fields in header_dict["SQ"]: 328 try: 329 reference_names.append(fields["SN"]) 330 reference_lengths.append(fields["LN"]) 331 except KeyError: 332 raise KeyError("incomplete sequence information in '%s'" % str(fields)) 333 334 return cls._from_text_and_lengths(text, reference_names, reference_lengths) 335 336 @classmethod 337 def from_references(cls, reference_names, reference_lengths, text=None, add_sq_text=True): 338 339 if len(reference_names) != len(reference_lengths): 340 raise ValueError("number of 
reference names and lengths do not match") 341 342 # optionally, if there is no text, add a SAM compatible header to output file. 343 if text is None and add_sq_text: 344 text = "".join(["@SQ\tSN:{}\tLN:{}\n".format(x, y) for x, y in zip( 345 reference_names, reference_lengths)]) 346 347 return cls._from_text_and_lengths(text, reference_names, reference_lengths) 348 349 def __dealloc__(self): 350 bam_hdr_destroy(self.ptr) 351 self.ptr = NULL 352 353 def __bool__(self): 354 return self.ptr != NULL 355 356 def copy(self): 357 return makeAlignmentHeader(bam_hdr_dup(self.ptr)) 358 359 property nreferences: 360 """int with the number of :term:`reference` sequences in the file. 361 362 This is a read-only attribute.""" 363 def __get__(self): 364 return self.ptr.n_targets 365 366 property references: 367 """tuple with the names of :term:`reference` sequences. This is a 368 read-only attribute""" 369 def __get__(self): 370 t = [] 371 cdef int x 372 for x in range(self.ptr.n_targets): 373 t.append(charptr_to_str(self.ptr.target_name[x])) 374 return tuple(t) 375 376 property lengths: 377 """tuple of the lengths of the :term:`reference` sequences. This is a 378 read-only attribute. The lengths are in the same order as 379 :attr:`pysam.AlignmentFile.references` 380 """ 381 def __get__(self): 382 t = [] 383 cdef int x 384 for x in range(self.ptr.n_targets): 385 t.append(self.ptr.target_len[x]) 386 return tuple(t) 387 388 def _build_sequence_section(self): 389 """return sequence section of header. 390 391 The sequence section is built from the list of reference names and 392 lengths stored in the BAM-file and not from any @SQ entries that 393 are part of the header's text section. 
394 """ 395 396 cdef int x 397 text = [] 398 for x in range(self.ptr.n_targets): 399 text.append("@SQ\tSN:{}\tLN:{}\n".format( 400 force_str(self.ptr.target_name[x]), 401 self.ptr.target_len[x])) 402 return "".join(text) 403 404 def to_dict(self): 405 """return two-level dictionary with header information from the file. 406 407 The first level contains the record (``HD``, ``SQ``, etc) and 408 the second level contains the fields (``VN``, ``LN``, etc). 409 410 The parser is validating and will raise an AssertionError if 411 if encounters any record or field tags that are not part of 412 the SAM specification. Use the 413 :attr:`pysam.AlignmentFile.text` attribute to get the unparsed 414 header. 415 416 The parsing follows the SAM format specification with the 417 exception of the ``CL`` field. This option will consume the 418 rest of a header line irrespective of any additional fields. 419 This behaviour has been added to accommodate command line 420 options that contain characters that are not valid field 421 separators. 422 423 If no @SQ entries are within the text section of the header, 424 this will be automatically added from the reference names and 425 lengths stored in the binary part of the header. 426 """ 427 result = collections.OrderedDict() 428 429 # convert to python string 430 t = self.__str__() 431 for line in t.split("\n"): 432 line = line.strip(' \0') 433 if not line: 434 continue 435 assert line.startswith("@"), \ 436 "header line without '@': '%s'" % line 437 fields = line[1:].split("\t") 438 record = fields[0] 439 assert record in VALID_HEADER_TYPES, \ 440 "header line with invalid type '%s': '%s'" % (record, line) 441 442 # treat comments 443 if record == "CO": 444 if record not in result: 445 result[record] = [] 446 result[record].append("\t".join( fields[1:])) 447 continue 448 # the following is clumsy as generators do not work? 
449 x = {} 450 451 for idx, field in enumerate(fields[1:]): 452 if ":" not in field: 453 raise ValueError("malformatted header: no ':' in field" ) 454 key, value = field.split(":", 1) 455 if key in ("CL",): 456 # special treatment for command line 457 # statements (CL). These might contain 458 # characters that are non-conformant with 459 # the valid field separators in the SAM 460 # header. Thus, in contravention to the 461 # SAM API, consume the rest of the line. 462 key, value = "\t".join(fields[idx+1:]).split(":", 1) 463 x[key] = KNOWN_HEADER_FIELDS[record][key](value) 464 break 465 466 # interpret type of known header record tags, default to str 467 x[key] = KNOWN_HEADER_FIELDS[record].get(key, str)(value) 468 469 if VALID_HEADER_TYPES[record] == Mapping: 470 if record in result: 471 raise ValueError( 472 "multiple '%s' lines are not permitted" % record) 473 474 result[record] = x 475 elif VALID_HEADER_TYPES[record] == Sequence: 476 if record not in result: result[record] = [] 477 result[record].append(x) 478 479 # if there are no SQ lines in the header, add the 480 # reference names from the information in the bam 481 # file. 482 # 483 # Background: c-samtools keeps the textual part of the 484 # header separate from the list of reference names and 485 # lengths. Thus, if a header contains only SQ lines, 486 # the SQ information is not part of the textual header 487 # and thus are missing from the output. See issue 84. 
488 if "SQ" not in result: 489 sq = [] 490 for ref, length in zip(self.references, self.lengths): 491 sq.append({'LN': length, 'SN': ref }) 492 result["SQ"] = sq 493 494 return result 495 496 def as_dict(self): 497 """deprecated: use :meth:`to_dict()`""" 498 return self.to_dict() 499 500 def get_reference_name(self, tid): 501 if tid == -1: 502 return None 503 if not 0 <= tid < self.ptr.n_targets: 504 raise ValueError("reference_id %i out of range 0<=tid<%i" % 505 (tid, self.ptr.n_targets)) 506 return charptr_to_str(self.ptr.target_name[tid]) 507 508 def get_reference_length(self, reference): 509 cdef int tid = self.get_tid(reference) 510 if tid < 0: 511 raise KeyError("unknown reference {}".format(reference)) 512 else: 513 return self.ptr.target_len[tid] 514 515 def is_valid_tid(self, int tid): 516 """ 517 return True if the numerical :term:`tid` is valid; False otherwise. 518 519 Note that the unmapped tid code (-1) counts as an invalid. 520 """ 521 return 0 <= tid < self.ptr.n_targets 522 523 def get_tid(self, reference): 524 """ 525 return the numerical :term:`tid` corresponding to 526 :term:`reference` 527 528 returns -1 if reference is not known. 529 """ 530 reference = force_bytes(reference) 531 tid = bam_name2id(self.ptr, reference) 532 if tid < -1: 533 raise ValueError('could not parse header') 534 return tid 535 536 def __str__(self): 537 '''string with the full contents of the :term:`sam file` header as a 538 string. 539 540 If no @SQ entries are within the text section of the header, 541 this will be automatically added from the reference names and 542 lengths stored in the binary part of the header. 543 544 See :attr:`pysam.AlignmentFile.header.to_dict()` to get a parsed 545 representation of the header. 546 ''' 547 text = from_string_and_size(self.ptr.text, self.ptr.l_text) 548 if "@SQ" not in text: 549 text += "\n" + self._build_sequence_section() 550 return text 551 552 # dictionary access methods, for backwards compatibility. 
553 def __setitem__(self, key, value): 554 raise TypeError("AlignmentHeader does not support item assignment (use header.to_dict()") 555 556 def __getitem__(self, key): 557 return self.to_dict().__getitem__(key) 558 559 def items(self): 560 return self.to_dict().items() 561 562 # PY2 compatibility 563 def iteritems(self): 564 return self.to_dict().items() 565 566 def keys(self): 567 return self.to_dict().keys() 568 569 def values(self): 570 return self.to_dict().values() 571 572 def get(self, *args): 573 return self.to_dict().get(*args) 574 575 def __len__(self): 576 return self.to_dict().__len__() 577 578 def __contains__(self, key): 579 return self.to_dict().__contains__(key) 580 581 582cdef class AlignmentFile(HTSFile): 583 """AlignmentFile(filepath_or_object, mode=None, template=None, 584 reference_names=None, reference_lengths=None, text=NULL, 585 header=None, add_sq_text=False, check_header=True, check_sq=True, 586 reference_filename=None, filename=None, index_filename=None, 587 filepath_index=None, require_index=False, duplicate_filehandle=True, 588 ignore_truncation=False, threads=1) 589 590 A :term:`SAM`/:term:`BAM`/:term:`CRAM` formatted file. 591 592 If `filepath_or_object` is a string, the file is automatically 593 opened. If `filepath_or_object` is a python File object, the 594 already opened file will be used. 595 596 If the file is opened for reading and an index exists (if file is BAM, a 597 .bai file or if CRAM a .crai file), it will be opened automatically. 598 `index_filename` may be specified explicitly. If the index is not named 599 in the standard manner, not located in the same directory as the 600 BAM/CRAM file, or is remote. Without an index, random access via 601 :meth:`~pysam.AlignmentFile.fetch` and :meth:`~pysam.AlignmentFile.pileup` 602 is disabled. 603 604 For writing, the header of a :term:`SAM` file/:term:`BAM` file can 605 be constituted from several sources (see also the samtools format 606 specification): 607 608 1. 
If `template` is given, the header is copied from a another 609 `AlignmentFile` (`template` must be a 610 :class:`~pysam.AlignmentFile`). 611 612 2. If `header` is given, the header is built from a 613 multi-level dictionary. 614 615 3. If `text` is given, new header text is copied from raw 616 text. 617 618 4. The names (`reference_names`) and lengths 619 (`reference_lengths`) are supplied directly as lists. 620 621 When reading or writing a CRAM file, the filename of a FASTA-formatted 622 reference can be specified with `reference_filename`. 623 624 By default, if a file is opened in mode 'r', it is checked 625 for a valid header (`check_header` = True) and a definition of 626 chromosome names (`check_sq` = True). 627 628 Parameters 629 ---------- 630 mode : string 631 `mode` should be ``r`` for reading or ``w`` for writing. The 632 default is text mode (:term:`SAM`). For binary (:term:`BAM`) 633 I/O you should append ``b`` for compressed or ``u`` for 634 uncompressed :term:`BAM` output. Use ``h`` to output header 635 information in text (:term:`TAM`) mode. Use ``c`` for 636 :term:`CRAM` formatted files. 637 638 If ``b`` is present, it must immediately follow ``r`` or 639 ``w``. Valid modes are ``r``, ``w``, ``wh``, ``rb``, ``wb``, 640 ``wbu``, ``wb0``, ``rc`` and ``wc``. For instance, to open a 641 :term:`BAM` formatted file for reading, type:: 642 643 f = pysam.AlignmentFile('ex1.bam','rb') 644 645 If mode is not specified, the method will try to auto-detect 646 in the order 'rb', 'r', thus both the following should work:: 647 648 f1 = pysam.AlignmentFile('ex1.bam') 649 f2 = pysam.AlignmentFile('ex1.sam') 650 651 template : AlignmentFile 652 when writing, copy header from file `template`. 653 654 header : dict or AlignmentHeader 655 when writing, build header from a multi-level dictionary. The 656 first level are the four types ('HD', 'SQ', ...). The second 657 level are a list of lines, with each line being a list of 658 tag-value pairs. 
The header is constructed first from all the 659 defined fields, followed by user tags in alphabetical 660 order. Alternatively, an :class:`~pysam.AlignmentHeader` 661 object can be passed directly. 662 663 text : string 664 when writing, use the string provided as the header 665 666 reference_names : list 667 see reference_lengths 668 669 reference_lengths : list 670 when writing or opening a SAM file without header build header 671 from list of chromosome names and lengths. By default, 'SQ' 672 and 'LN' tags will be added to the header text. This option 673 can be changed by unsetting the flag `add_sq_text`. 674 675 add_sq_text : bool 676 do not add 'SQ' and 'LN' tags to header. This option permits 677 construction :term:`SAM` formatted files without a header. 678 679 add_sam_header : bool 680 when outputting SAM the default is to output a header. This is 681 equivalent to opening the file in 'wh' mode. If this option is 682 set to False, no header will be output. To read such a file, 683 set `check_header=False`. 684 685 check_header : bool 686 obsolete: when reading a SAM file, check if header is present 687 (default=True) 688 689 check_sq : bool 690 when reading, check if SQ entries are present in header 691 (default=True) 692 693 reference_filename : string 694 Path to a FASTA-formatted reference file. Valid only for CRAM files. 695 When reading a CRAM file, this overrides both ``$REF_PATH`` and the URL 696 specified in the header (``UR`` tag), which are normally used to find 697 the reference. 698 699 index_filename : string 700 Explicit path to the index file. Only needed if the index is not 701 named in the standard manner, not located in the same directory as 702 the BAM/CRAM file, or is remote. An IOError is raised if the index 703 cannot be found or is invalid. 704 705 filepath_index : string 706 Alias for `index_filename`. 707 708 require_index : bool 709 When reading, require that an index file is present and is valid or 710 raise an IOError. 
(default=False) 711 712 filename : string 713 Alternative to filepath_or_object. Filename of the file 714 to be opened. 715 716 duplicate_filehandle: bool 717 By default, file handles passed either directly or through 718 File-like objects will be duplicated before passing them to 719 htslib. The duplication prevents issues where the same stream 720 will be closed by htslib and through destruction of the 721 high-level python object. Set to False to turn off 722 duplication. 723 724 ignore_truncation: bool 725 Issue a warning, instead of raising an error if the current file 726 appears to be truncated due to a missing EOF marker. Only applies 727 to bgzipped formats. (Default=False) 728 729 format_options: list 730 A list of key=value strings, as accepted by --input-fmt-option and 731 --output-fmt-option in samtools. 732 threads: integer 733 Number of threads to use for compressing/decompressing BAM/CRAM files. 734 Setting threads to > 1 cannot be combined with `ignore_truncation`. 735 (Default=1) 736 """ 737 738 def __cinit__(self, *args, **kwargs): 739 self.htsfile = NULL 740 self.filename = None 741 self.mode = None 742 self.threads = 1 743 self.is_stream = False 744 self.is_remote = False 745 self.index = NULL 746 747 if "filename" in kwargs: 748 args = [kwargs["filename"]] 749 del kwargs["filename"] 750 751 self._open(*args, **kwargs) 752 753 # allocate memory for iterator 754 self.b = <bam1_t*>calloc(1, sizeof(bam1_t)) 755 if self.b == NULL: 756 raise MemoryError("could not allocate memory of size {}".format(sizeof(bam1_t))) 757 758 def has_index(self): 759 """return true if htsfile has an existing (and opened) index. 760 """ 761 return self.index != NULL 762 763 def check_index(self): 764 """return True if index is present. 765 766 Raises 767 ------ 768 769 AttributeError 770 if htsfile is :term:`SAM` formatted and thus has no index. 771 772 ValueError 773 if htsfile is closed or index could not be opened. 
774 """ 775 776 if not self.is_open: 777 raise ValueError("I/O operation on closed file") 778 if not self.is_bam and not self.is_cram: 779 raise AttributeError( 780 "AlignmentFile.mapped only available in bam files") 781 if self.index == NULL: 782 raise ValueError( 783 "mapping information not recorded in index " 784 "or index not available") 785 return True 786 787 def _open(self, 788 filepath_or_object, 789 mode=None, 790 AlignmentFile template=None, 791 reference_names=None, 792 reference_lengths=None, 793 reference_filename=None, 794 text=None, 795 header=None, 796 port=None, 797 add_sq_text=True, 798 add_sam_header=True, 799 check_header=True, 800 check_sq=True, 801 index_filename=None, 802 filepath_index=None, 803 require_index=False, 804 referencenames=None, 805 referencelengths=None, 806 duplicate_filehandle=True, 807 ignore_truncation=False, 808 format_options=None, 809 threads=1): 810 '''open a sam, bam or cram formatted file. 811 812 If _open is called on an existing file, the current file 813 will be closed and a new file will be opened. 814 815 ''' 816 cdef char *cfilename = NULL 817 cdef char *creference_filename = NULL 818 cdef char *cindexname = NULL 819 cdef char *cmode = NULL 820 cdef bam_hdr_t * hdr = NULL 821 822 if threads > 1 and ignore_truncation: 823 # This won't raise errors if reaching a truncated alignment, 824 # because bgzf_mt_reader in htslib does not deal with 825 # bgzf_mt_read_block returning non-zero values, contrary 826 # to bgzf_read (https://github.com/samtools/htslib/blob/1.7/bgzf.c#L888) 827 # Better to avoid this (for now) than to produce seemingly correct results. 
828 raise ValueError('Cannot add extra threads when "ignore_truncation" is True') 829 self.threads = threads 830 831 # for backwards compatibility: 832 if referencenames is not None: 833 reference_names = referencenames 834 if referencelengths is not None: 835 reference_lengths = referencelengths 836 837 # close a previously opened file 838 if self.is_open: 839 self.close() 840 841 # autodetection for read 842 if mode is None: 843 mode = "r" 844 845 if add_sam_header and mode == "w": 846 mode = "wh" 847 848 assert mode in ("r", "w", "rb", "wb", "wh", 849 "wbu", "rU", "wb0", 850 "rc", "wc"), \ 851 "invalid file opening mode `%s`" % mode 852 853 self.duplicate_filehandle = duplicate_filehandle 854 855 # StringIO not supported 856 if isinstance(filepath_or_object, StringIO): 857 raise NotImplementedError( 858 "access from StringIO objects not supported") 859 # reading from a file descriptor 860 elif isinstance(filepath_or_object, int): 861 self.filename = filepath_or_object 862 filename = None 863 self.is_remote = False 864 self.is_stream = True 865 # reading from a File object or other object with fileno 866 elif hasattr(filepath_or_object, "fileno"): 867 if filepath_or_object.closed: 868 raise ValueError('I/O operation on closed file') 869 self.filename = filepath_or_object 870 # .name can be TextIOWrapper 871 try: 872 filename = encode_filename(str(filepath_or_object.name)) 873 cfilename = filename 874 except AttributeError: 875 filename = None 876 self.is_remote = False 877 self.is_stream = True 878 # what remains is a filename 879 else: 880 self.filename = filename = encode_filename(filepath_or_object) 881 cfilename = filename 882 self.is_remote = hisremote(cfilename) 883 self.is_stream = self.filename == b'-' 884 885 # for htslib, wbu seems to not work 886 if mode == "wbu": 887 mode = "wb0" 888 889 self.mode = force_bytes(mode) 890 self.reference_filename = reference_filename = encode_filename( 891 reference_filename) 892 893 if mode[0] == 'w': 894 # open file 
for writing 895 896 if not (template or header or text or (reference_names and reference_lengths)): 897 raise ValueError( 898 "either supply options `template`, `header`, `text` or both `reference_names` " 899 "and `reference_lengths` for writing") 900 901 if template: 902 # header is copied, though at the moment not strictly 903 # necessary as AlignmentHeader is immutable. 904 self.header = template.header.copy() 905 elif isinstance(header, AlignmentHeader): 906 self.header = header.copy() 907 elif isinstance(header, Mapping): 908 self.header = AlignmentHeader.from_dict(header) 909 elif reference_names and reference_lengths: 910 self.header = AlignmentHeader.from_references( 911 reference_names, 912 reference_lengths, 913 add_sq_text=add_sq_text, 914 text=text) 915 elif text: 916 self.header = AlignmentHeader.from_text(text) 917 else: 918 raise ValueError("not enough information to construct header. Please provide template, " 919 "header, text or reference_names/reference_lengths") 920 self.htsfile = self._open_htsfile() 921 922 if self.htsfile == NULL: 923 if errno: 924 raise IOError(errno, "could not open alignment file `{}`: {}".format( 925 force_str(filename), 926 force_str(strerror(errno)))) 927 else: 928 raise ValueError("could not open alignment file `{}`".format(force_str(filename))) 929 if format_options and len(format_options): 930 self.add_hts_options(format_options) 931 # set filename with reference sequences. 
            # If no filename is given, the CRAM reference arrays will be
            # built from the @SQ header in the header.
            if "c" in mode and reference_filename:
                if (hts_set_fai_filename(self.htsfile, self.reference_filename) != 0):
                    raise ValueError("failure when setting reference filename")

            # write header to htsfile
            if "b" in mode or "c" in mode or "h" in mode:
                hdr = self.header.ptr
                with nogil:
                    sam_hdr_write(self.htsfile, hdr)

        elif mode[0] == "r":
            # open file for reading
            self.htsfile = self._open_htsfile()

            if self.htsfile == NULL:
                if errno:
                    raise IOError(errno, "could not open alignment file `{}`: {}".format(force_str(filename),
                                                                                         force_str(strerror(errno))))
                else:
                    raise ValueError("could not open alignment file `{}`".format(force_str(filename)))

            if self.htsfile.format.category != sequence_data:
                raise ValueError("file does not contain alignment data")

            if format_options and len(format_options):
                self.add_hts_options(format_options)

            self.check_truncation(ignore_truncation)

            # bam/cram files require a valid header
            if self.is_bam or self.is_cram:
                with nogil:
                    hdr = sam_hdr_read(self.htsfile)
                if hdr == NULL:
                    raise ValueError(
                        "file does not have a valid header (mode='%s') "
                        "- is it BAM/CRAM format?" % mode)
                self.header = makeAlignmentHeader(hdr)
            else:
                # in sam files a header is optional. If not given,
                # user may provide reference names and lengths to build
                # an on-the-fly header.
                if reference_names and reference_lengths:
                    # build header from a target names and lengths
                    self.header = AlignmentHeader.from_references(
                        reference_names=reference_names,
                        reference_lengths=reference_lengths,
                        add_sq_text=add_sq_text,
                        text=text)
                else:
                    with nogil:
                        hdr = sam_hdr_read(self.htsfile)
                    if hdr == NULL:
                        # NOTE(review): the '%s' placeholder is never
                        # interpolated with mode here - message prints literally.
                        raise ValueError(
                            "SAM? file does not have a valid header (mode='%s'), "
                            "please provide reference_names and reference_lengths")
                    self.header = makeAlignmentHeader(hdr)

        # set filename with reference sequences
        if self.is_cram and reference_filename:
            creference_filename = self.reference_filename
            hts_set_opt(self.htsfile,
                        CRAM_OPT_REFERENCE,
                        creference_filename)

        if check_sq and self.header.nreferences == 0:
            raise ValueError(
                ("file has no sequences defined (mode='%s') - "
                 "is it SAM/BAM format? Consider opening with "
                 "check_sq=False") % mode)

        # load an index for random access, if available/required
        if self.is_bam or self.is_cram:
            self.index_filename = index_filename or filepath_index
            if self.index_filename:
                cindexname = bfile_name = encode_filename(self.index_filename)

            if cfilename or cindexname:
                with nogil:
                    self.index = sam_index_load2(self.htsfile, cfilename, cindexname)

                if not self.index and (cindexname or require_index):
                    if errno:
                        raise IOError(errno, force_str(strerror(errno)))
                    else:
                        raise IOError('unable to open index file `%s`' % self.index_filename)

            elif require_index:
                raise IOError('unable to open index file')

        # save start of data section
        if not self.is_stream:
            self.start_offset = self.tell()

    def fetch(self,
              contig=None,
              start=None,
              stop=None,
              region=None,
              tid=None,
              until_eof=False,
              multiple_iterators=False,
              reference=None,
              end=None):
        """fetch reads aligned in a :term:`region`.

        See :meth:`~pysam.HTSFile.parse_region` for more information
        on how genomic regions can be specified. :term:`reference` and
        `end` are also accepted for backward compatibility as synonyms
        for :term:`contig` and `stop`, respectively.

        Without a `contig` or `region` all mapped reads in the file
        will be fetched.
        """
        # (docstring continued) The reads will be returned ordered by
        # reference sequence, which will not necessarily be the order
        # within the file. This mode of iteration still requires an index.
        # If there is no index, use until_eof=True.
        #
        # If only `contig` is set, all reads aligned to `contig` will be
        # fetched. A SAM file does not allow random access: if `region` or
        # `contig` are given, an exception is raised.
        #
        # Parameters:
        #   until_eof : bool
        #       If True, all reads from the current file position will be
        #       returned in order as they are within the file. This also
        #       fetches unmapped reads.
        #   multiple_iterators : bool
        #       If True, multiple iterators on the same file can be used
        #       at the same time. The iterator returned receives its own
        #       copy of a filehandle to the file, effectively re-opening
        #       the file. Re-opening a file creates some overhead, so beware.
        #
        # Returns: an iterator over a collection of reads.
        # Raises: ValueError if the genomic coordinates are out of range
        #   or invalid, or the file does not permit random access.
        cdef int rtid, rstart, rstop, has_coord

        if not self.is_open:
            raise ValueError("I/O operation on closed file")

        has_coord, rtid, rstart, rstop = self.parse_region(
            contig, start, stop, region, tid,
            end=end, reference=reference)

        # Turn off re-opening if htsfile is a stream
        if self.is_stream:
            multiple_iterators = False

        if self.is_bam or self.is_cram:
            if not until_eof and not self.is_remote:
                if not self.has_index():
                    raise ValueError(
                        "fetch called on bamfile without index")

            if has_coord:
                return IteratorRowRegion(
                    self, rtid, rstart, rstop,
                    multiple_iterators=multiple_iterators)
            else:
                if until_eof:
                    return IteratorRowAll(
                        self,
                        multiple_iterators=multiple_iterators)
                else:
                    # AH: check - reason why no multiple_iterators for
                    # AllRefs?
                    return IteratorRowAllRefs(
                        self,
                        multiple_iterators=multiple_iterators)
        else:
            # SAM: sequential access only
            if has_coord:
                raise ValueError(
                    "fetching by region is not available for SAM files")

            if multiple_iterators == True:
                raise ValueError(
                    "multiple iterators not implemented for SAM files")

            return IteratorRowAll(self,
                                  multiple_iterators=multiple_iterators)

    def head(self, n, multiple_iterators=True):
        '''return an iterator over the first n alignments.

        This iterator is useful for inspecting the bam-file.

        Parameters
        ----------

        n : int
            number of alignments to iterate over.

        multiple_iterators : bool

            is set to True by default in order to
            avoid changing the current file position.

        Returns
        -------

        an iterator over a collection of reads

        '''
        return IteratorRowHead(self, n,
                               multiple_iterators=multiple_iterators)

    def mate(self, AlignedSegment read):
        '''return the mate of :class:`~pysam.AlignedSegment` `read`.
        '''
        # (docstring continued)
        # NOTE: Calling this method will change the file position.
        # This might interfere with any iterators that have not
        # re-opened the file.
        #
        # NOTE: This method is too slow for high-throughput processing.
        # If a read needs to be processed with its mate, work from a
        # read name sorted file or, better, cache reads.
        #
        # Returns :class:`~pysam.AlignedSegment`: the mate.
        # Raises ValueError if the read is unpaired or the mate is unmapped.
        cdef uint32_t flag = read._delegate.core.flag

        if flag & BAM_FPAIRED == 0:
            raise ValueError("read %s: is unpaired" %
                             (read.query_name))
        if flag & BAM_FMUNMAP != 0:
            raise ValueError("mate %s: is unmapped" %
                             (read.query_name))

        # xor flags to get the other mate
        cdef int x = BAM_FREAD1 + BAM_FREAD2
        flag = (flag ^ x) & x

        # Make sure to use a separate file to jump around
        # to mate as otherwise the original file position
        # will be lost
        # The following code is not using the C API and
        # could thus be made much quicker, for example
        # by using tell and seek.
        for mate in self.fetch(
                read._delegate.core.mpos,
                read._delegate.core.mpos + 1,
                tid=read._delegate.core.mtid,
                multiple_iterators=True):
            if mate.flag & flag != 0 and \
               mate.query_name == read.query_name:
                break
        else:
            raise ValueError("mate not found")

        return mate

    def pileup(self,
               contig=None,
               start=None,
               stop=None,
               region=None,
               reference=None,
               end=None,
               **kwargs):
        """perform a :term:`pileup` within a :term:`region`. The region is
        specified by :term:`contig`, `start` and `stop` (using
        0-based indexing). :term:`reference` and `end` are also accepted for
        backward compatibility as synonyms for :term:`contig` and `stop`,
        respectively. Alternatively, a samtools 'region' string
        can be supplied.

        Without 'contig' or 'region' all reads will be used for the
        pileup. The reads will be returned ordered by
        :term:`contig` sequence, which will not necessarily be the
        order within the file.

        Note that :term:`SAM` formatted files do not allow random
        access. In these files, if a 'region' or 'contig' are
        given an exception is raised.

        .. note::

            'all' reads which overlap the region are returned. The
            first base returned will be the first base of the first
            read 'not' necessarily the first base of the region used
            in the query.

        Parameters
        ----------

        truncate : bool

           By default, the samtools pileup engine outputs all reads
           overlapping a region. If truncate is True and a region is
           given, only columns in the exact region specified are
           returned.

        max_depth : int
           Maximum read depth permitted. The default limit is '8000'.

        stepper : string
           The stepper controls how the iterator advances.
           Possible options for the stepper are

           ``all``
              skip reads in which any of the following flags are set:
              BAM_FUNMAP, BAM_FSECONDARY, BAM_FQCFAIL, BAM_FDUP

           ``nofilter``
              uses every single read turning off any filtering.

           ``samtools``
              same filter and read processing as in :term:`csamtools`
              pileup. For full compatibility, this requires a
              'fastafile' to be given. The following options all pertain
              to filtering of the ``samtools`` stepper.

        fastafile : :class:`~pysam.FastaFile` object.

           This is required for some of the steppers.

        ignore_overlaps: bool

           If set to True, detect if read pairs overlap and only take
           the higher quality base. This is the default.

        flag_filter : int

           ignore reads where any of the bits in the flag are set. The default is
           BAM_FUNMAP | BAM_FSECONDARY | BAM_FQCFAIL | BAM_FDUP.

        flag_require : int

           only use reads where certain flags are set. The default is 0.

        ignore_orphans: bool

            ignore orphans (paired reads that are not in a proper pair).
            The default is to ignore orphans.

        min_base_quality: int

           Minimum base quality. Bases below the minimum quality will
           not be output. The default is 13.

        adjust_capq_threshold: int

           adjust mapping quality. The default is 0 for no
           adjustment. The recommended value for adjustment is 50.

        min_mapping_quality : int

           only use reads above a minimum mapping quality. The default is 0.

        compute_baq: bool

           re-alignment computing per-Base Alignment Qualities (BAQ). The
           default is to do re-alignment. Realignment requires a reference
           sequence. If none is present, no realignment will be performed.

        redo_baq: bool

           recompute per-Base Alignment Quality on the fly ignoring
           existing base qualities. The default is False (use existing
           base qualities).

        Returns
        -------

        an iterator over genomic positions.
        """
        cdef int rtid, has_coord
        cdef int32_t rstart, rstop

        if not self.is_open:
            raise ValueError("I/O operation on closed file")

        has_coord, rtid, rstart, rstop = self.parse_region(
            contig, start, stop, region, reference=reference, end=end)

        if has_coord:
            # region pileup needs random access, hence an index
            if not self.has_index():
                raise ValueError("no index available for pileup")

            return IteratorColumnRegion(self,
                                        tid=rtid,
                                        start=rstart,
                                        stop=rstop,
                                        **kwargs)
        else:
            if self.has_index():
                return IteratorColumnAllRefs(self, **kwargs)
            else:
                return IteratorColumnAll(self, **kwargs)

    def count(self,
              contig=None,
              start=None,
              stop=None,
              region=None,
              until_eof=False,
              read_callback="nofilter",
              reference=None,
              end=None):
        '''count the number of reads in :term:`region`

        The region is specified by :term:`contig`, `start` and `stop`.
        :term:`reference` and `end` are also accepted for backward
        compatibility as synonyms for :term:`contig` and `stop`,
        respectively. Alternatively, a :term:`samtools` :term:`region`
        string can be supplied.

        A :term:`SAM` file does not allow random access and if
        `region` or `contig` are given, an exception is raised.

        Parameters
        ----------

        contig : string
            reference_name of the genomic region (chromosome)

        start : int
            start of the genomic region (0-based inclusive)

        stop : int
            end of the genomic region (0-based exclusive)

        region : string
            a region string in samtools format.

        until_eof : bool
            count until the end of the file, possibly including
            unmapped reads as well.

        read_callback: string or function

            select a call-back to ignore reads when counting. It can
            be either a string with the following values:

            ``all``
                skip reads in which any of the following
                flags are set: BAM_FUNMAP, BAM_FSECONDARY, BAM_FQCFAIL,
                BAM_FDUP

            ``nofilter``
                uses every single read

            Alternatively, `read_callback` can be a function
            ``check_read(read)`` that should return True only for
            those reads that shall be included in the counting.

        reference : string
            backward compatible synonym for `contig`

        end : int
            backward compatible synonym for `stop`

        Raises
        ------

        ValueError
            if the genomic coordinates are out of range or invalid.

        '''
        cdef AlignedSegment read
        cdef long counter = 0

        if not self.is_open:
            raise ValueError("I/O operation on closed file")

        # map the string callbacks onto integer codes for the hot loop
        cdef int filter_method = 0
        if read_callback == "all":
            filter_method = 1
        elif read_callback == "nofilter":
            filter_method = 2

        for read in self.fetch(contig=contig,
                               start=start,
                               stop=stop,
                               reference=reference,
                               end=end,
                               region=region,
                               until_eof=until_eof):
            # apply filter
            if filter_method == 1:
                # filter = "all": drop unmapped/secondary/qcfail/duplicate
                if (read.flag & (0x4 | 0x100 | 0x200 | 0x400)):
                    continue
            elif filter_method == 2:
                # filter = "nofilter"
                pass
            else:
                if not read_callback(read):
                    continue
            counter += 1

        return counter

    @cython.boundscheck(False)  # we do manual bounds checking
    def count_coverage(self,
                       contig,
                       start=None,
                       stop=None,
                       region=None,
                       quality_threshold=15,
                       read_callback='all',
                       reference=None,
                       end=None):
        """count the coverage of genomic positions by reads in :term:`region`.

        The region is specified by :term:`contig`, `start` and `stop`.
        :term:`reference` and `end` are also accepted for backward
        compatibility as synonyms for :term:`contig` and `stop`,
        respectively. Alternatively, a :term:`samtools` :term:`region`
        string can be supplied. The coverage is computed per-base [ACGT].

        Parameters
        ----------

        contig : string
            reference_name of the genomic region (chromosome)

        start : int
            start of the genomic region (0-based inclusive). If not
            given, count from the start of the chromosome.

        stop : int
            end of the genomic region (0-based exclusive). If not given,
            count to the end of the chromosome.

        region : string
            a region string.

        quality_threshold : int
            quality_threshold is the minimum quality score (in phred) a
            base has to reach to be counted.

        read_callback: string or function

            select a call-back to ignore reads when counting. It can
            be either a string with the following values:

            ``all``
                skip reads in which any of the following
                flags are set: BAM_FUNMAP, BAM_FSECONDARY, BAM_FQCFAIL,
                BAM_FDUP

            ``nofilter``
                uses every single read

            Alternatively, `read_callback` can be a function
            ``check_read(read)`` that should return True only for
            those reads that shall be included in the counting.

        reference : string
            backward compatible synonym for `contig`

        end : int
            backward compatible synonym for `stop`

        Raises
        ------

        ValueError
            if the genomic coordinates are out of range or invalid.
        """
        # Returns: four array.arrays of the same length, in order A C G T.
        cdef uint32_t contig_length = self.get_reference_length(contig)
        cdef int _start = start if start is not None else 0
        cdef int _stop = stop if stop is not None else contig_length
        # clamp stop to the chromosome end
        _stop = _stop if _stop < contig_length else contig_length

        if _stop == _start:
            raise ValueError("interval of size 0")
        if _stop < _start:
            raise ValueError("interval of size less than 0")

        cdef int length = _stop - _start
        # 'L' = unsigned long; matches .data.as_ulongs accesses below
        cdef c_array.array int_array_template = array.array('L', [])
        cdef c_array.array count_a
        cdef c_array.array count_c
        cdef c_array.array count_g
        cdef c_array.array count_t
        count_a = c_array.clone(int_array_template, length, zero=True)
        count_c = c_array.clone(int_array_template, length, zero=True)
        count_g = c_array.clone(int_array_template, length, zero=True)
        count_t = c_array.clone(int_array_template, length, zero=True)

        cdef AlignedSegment read
        cdef cython.str seq
        cdef c_array.array quality
        cdef int qpos
        cdef int refpos
        cdef int c = 0
        cdef int filter_method = 0

        if read_callback == "all":
            filter_method = 1
        elif read_callback == "nofilter":
            filter_method = 2

        cdef int _threshold = quality_threshold or 0
        for read in self.fetch(contig=contig,
                               reference=reference,
                               start=start,
                               stop=stop,
                               end=end,
                               region=region):
            # apply filter
            if filter_method == 1:
                # filter = "all": drop unmapped/secondary/qcfail/duplicate
                if (read.flag & (0x4 | 0x100 | 0x200 | 0x400)):
                    continue
            elif filter_method == 2:
                # filter = "nofilter"
                pass
            else:
                if not read_callback(read):
                    continue

            # count
            seq = read.seq
            if seq is None:
                continue
            quality = read.query_qualities

            for qpos, refpos in read.get_aligned_pairs(True):
                if qpos is not None and refpos is not None and \
                   _start <= refpos < _stop:

                    # only check base quality if _threshold > 0
                    if (_threshold and quality and quality[qpos] >= _threshold) or not _threshold:
                        if seq[qpos] == 'A':
                            count_a.data.as_ulongs[refpos - _start] += 1
                        if seq[qpos] == 'C':
                            count_c.data.as_ulongs[refpos - _start] += 1
                        if seq[qpos] == 'G':
                            count_g.data.as_ulongs[refpos - _start] += 1
                        if seq[qpos] == 'T':
                            count_t.data.as_ulongs[refpos - _start] += 1

        return count_a, count_c, count_g, count_t

    def find_introns_slow(self, read_iterator):
        """Return a dictionary {(start, stop): count}
        Listing the intronic sites in the reads (identified by 'N' in the cigar strings),
        and their support ( = number of reads ).

        read_iterator can be the result of a .fetch(...) call.
        Or it can be a generator filtering such reads. Example
        samfile.find_introns((read for read in samfile.fetch(...) if read.is_reverse)
        """
        res = collections.Counter()
        for r in read_iterator:
            if 'N' in r.cigarstring:
                last_read_pos = False
                for read_loc, genome_loc in r.get_aligned_pairs():
                    if read_loc is None and last_read_pos:
                        start = genome_loc
                    elif read_loc and last_read_pos is None:
                        stop = genome_loc  # we are right exclusive ,so this is correct
                        res[(start, stop)] += 1
                        del start
                        del stop
                    last_read_pos = read_loc
        return res

    def find_introns(self, read_iterator):
        """Return a dictionary {(start, stop): count}
        Listing the intronic sites in the reads (identified by 'N' in the cigar strings),
        and their support ( = number of reads ).

        read_iterator can be the result of a .fetch(...) call.
        Or it can be a generator filtering such reads. Example:
        samfile.find_introns((read for read in samfile.fetch(...)
        """
        cdef:
            uint32_t base_position, junc_start, nt
            int op
            AlignedSegment r
            int BAM_CREF_SKIP = 3  # BAM_CREF_SKIP

        res = collections.Counter()

        match_or_deletion = {0, 2, 7, 8}  # only M/=/X (0/7/8) and D (2) are related to genome position
        for r in read_iterator:
            base_position = r.pos

            for op, nt in r.cigartuples:
                if op in match_or_deletion:
                    base_position += nt
                elif op == BAM_CREF_SKIP:
                    # an N operation: record the skipped interval as a junction
                    junc_start = base_position
                    base_position += nt
                    res[(junc_start, base_position)] += 1
        return res


    def close(self):
        '''closes the :class:`pysam.AlignmentFile`.'''

        if self.htsfile == NULL:
            return

        cdef int ret = hts_close(self.htsfile)
        self.htsfile = NULL

        if self.index != NULL:
            hts_idx_destroy(self.index)
            self.index = NULL

        self.header = None

        if ret < 0:
            global errno
            # a closed pipe on the reading end is not an error
            if errno == EPIPE:
                errno = 0
            else:
                raise IOError(errno, force_str(strerror(errno)))

    def __dealloc__(self):
        # release C-level resources; mirrors close() but also frees
        # the per-file bam1_t buffer
        cdef int ret = 0

        if self.htsfile != NULL:
            ret = hts_close(self.htsfile)
            self.htsfile = NULL

        if self.index != NULL:
            hts_idx_destroy(self.index)
            self.index = NULL

        self.header = None

        if self.b:
            bam_destroy1(self.b)
            self.b = NULL

        if ret < 0:
            global errno
            if errno == EPIPE:
                errno = 0
            else:
                raise IOError(errno, force_str(strerror(errno)))

    cpdef int write(self, AlignedSegment read) except -1:
        '''
        write a single :class:`pysam.AlignedSegment` to disk.

        Raises:
            ValueError
                if the writing failed

        Returns:
            int :
                the number of bytes written. If the file is closed,
                this will be 0.
        '''
        if not self.is_open:
            return 0

        if self.header.ptr.n_targets <= read._delegate.core.tid:
            raise ValueError(
                "AlignedSegment refers to reference number {} that "
                "is larger than the number of references ({}) in the header".format(
                    read._delegate.core.tid, self.header.ptr.n_targets))

        cdef int ret
        with nogil:
            ret = sam_write1(self.htsfile,
                             self.header.ptr,
                             read._delegate)

        # kbj: Still need to raise an exception with except -1. Otherwise
        # when ret == -1 we get a "SystemError: error return without
        # exception set".
        if ret < 0:
            raise IOError(
                "sam_write1 failed with error code {}".format(ret))

        return ret

    # context manager interface
    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        self.close()
        return False

    ###############################################################
    ###############################################################
    ###############################################################
    ## properties
    ###############################################################
    property mapped:
        """int with total number of mapped alignments according to the
        statistics recorded in the index. This is a read-only
        attribute.
        (This will be 0 for a CRAM file indexed by a .crai index, as that
        index format does not record these statistics.)
        """
        def __get__(self):
            self.check_index()
            cdef int tid
            cdef uint64_t total = 0
            cdef uint64_t mapped, unmapped
            for tid from 0 <= tid < self.header.nreferences:
                with nogil:
                    hts_idx_get_stat(self.index, tid, &mapped, &unmapped)
                total += mapped
            return total

    property unmapped:
        """int with total number of unmapped reads according to the statistics
        recorded in the index.
        """
        # (docstring continued) This number of reads includes the number
        # of reads without coordinates. This is a read-only attribute.
        # (This will be 0 for a CRAM file indexed by a .crai index, as
        # that index format does not record these statistics.)
        def __get__(self):
            self.check_index()
            cdef int tid
            # start with the reads that have no coordinates at all
            cdef uint64_t total = hts_idx_get_n_no_coor(self.index)
            cdef uint64_t mapped, unmapped
            for tid from 0 <= tid < self.header.nreferences:
                with nogil:
                    hts_idx_get_stat(self.index, tid, &mapped, &unmapped)
                total += unmapped
            return total

    property nocoordinate:
        """int with total number of reads without coordinates according to the
        statistics recorded in the index, i.e., the statistic printed for "*"
        by the ``samtools idxstats`` command. This is a read-only attribute.
        (This will be 0 for a CRAM file indexed by a .crai index, as that
        index format does not record these statistics.)
        """
        def __get__(self):
            self.check_index()
            cdef uint64_t n
            with nogil:
                n = hts_idx_get_n_no_coor(self.index)
            return n

    def get_index_statistics(self):
        """return statistics about mapped/unmapped reads per chromosome as
        they are stored in the index, similarly to the statistics printed
        by the ``samtools idxstats`` command.

        CRAI indexes do not record these statistics, so for a CRAM file
        with a .crai index the returned statistics will all be 0.

        Returns:
            list :
                a list of records for each chromosome. Each record has the
                attributes 'contig', 'mapped', 'unmapped' and 'total'.
        """

        self.check_index()
        cdef int tid
        cdef uint64_t mapped, unmapped
        results = []
        # TODO: use header
        for tid from 0 <= tid < self.nreferences:
            with nogil:
                hts_idx_get_stat(self.index, tid, &mapped, &unmapped)
            results.append(
                IndexStats._make((
                    self.get_reference_name(tid),
                    mapped,
                    unmapped,
                    mapped + unmapped)))

        return results

    ###############################################################
    ## file-object like iterator access
    ## note: concurrent access will cause errors (see IteratorRow
    ## and multiple_iterators)
    ## Possible solutions: deprecate or open new file handle
    def __iter__(self):
        if not self.is_open:
            raise ValueError("I/O operation on closed file")

        if not self.is_bam and self.header.nreferences == 0:
            raise NotImplementedError(
                "can not iterate over samfile without header")
        return self

    cdef bam1_t * getCurrent(self):
        # expose the current raw record to C-level callers
        return self.b

    cdef int cnext(self):
        '''
        cversion of iterator. Used by :class:`pysam.AlignmentFile.IteratorColumn`.
        '''
        cdef int ret
        cdef bam_hdr_t * hdr = self.header.ptr
        with nogil:
            ret = sam_read1(self.htsfile,
                            hdr,
                            self.b)
        return ret

    def __next__(self):
        cdef int ret = self.cnext()
        if ret >= 0:
            return makeAlignedSegment(self.b, self.header)
        elif ret == -1:
            # normal end of file
            raise StopIteration
        else:
            raise IOError(read_failure_reason(ret))

    ###########################################
    # methods/properties referencing the header
    def is_valid_tid(self, int tid):
        """
        return True if the numerical :term:`tid` is valid; False otherwise.

        Note that the unmapped tid code (-1) counts as an invalid.
        """
        if self.header is None:
            raise ValueError("header not available in closed files")
        return self.header.is_valid_tid(tid)

    def get_tid(self, reference):
        """
        return the numerical :term:`tid` corresponding to
        :term:`reference`

        returns -1 if reference is not known.
        """
        if self.header is None:
            raise ValueError("header not available in closed files")
        return self.header.get_tid(reference)

    def get_reference_name(self, tid):
        """
        return :term:`reference` name corresponding to numerical :term:`tid`
        """
        if self.header is None:
            raise ValueError("header not available in closed files")
        return self.header.get_reference_name(tid)

    def get_reference_length(self, reference):
        """
        return :term:`reference` length corresponding to numerical :term:`tid`
        """
        if self.header is None:
            raise ValueError("header not available in closed files")
        return self.header.get_reference_length(reference)

    property nreferences:
        """int with the number of :term:`reference` sequences in the file.
        This is a read-only attribute."""
        def __get__(self):
            if self.header:
                return self.header.nreferences
            else:
                raise ValueError("header not available in closed files")

    property references:
        """tuple with the names of :term:`reference` sequences. This is a
        read-only attribute"""
        def __get__(self):
            if self.header:
                return self.header.references
            else:
                raise ValueError("header not available in closed files")

    property lengths:
        """tuple of the lengths of the :term:`reference` sequences. This is a
        read-only attribute. The lengths are in the same order as
        :attr:`pysam.AlignmentFile.references`

        """
        def __get__(self):
            if self.header:
                return self.header.lengths
            else:
                raise ValueError("header not available in closed files")

    # Compatibility functions for pysam < 0.14
    property text:
        """deprecated, use .header directly"""
        def __get__(self):
            if self.header:
                return self.header.__str__()
            else:
                raise ValueError("header not available in closed files")

    # Compatibility functions for pysam < 0.8.3
    def gettid(self, reference):
        """deprecated, use get_tid() instead"""
        return self.get_tid(reference)

    def getrname(self, tid):
        """deprecated, use get_reference_name() instead"""
        return self.get_reference_name(tid)


cdef class IteratorRow:
    '''abstract base class for iterators over mapped reads.

    Various iterators implement different behaviours for wrapping around
    contig boundaries. Examples include:

    :class:`pysam.IteratorRowRegion`
        iterate within a single contig and a defined region.

    :class:`pysam.IteratorRowAll`
        iterate until EOF. This iterator will also include unmapped reads.

    :class:`pysam.IteratorRowAllRefs`
        iterate over all reads in all reference sequences.

    The method :meth:`AlignmentFile.fetch` returns an IteratorRow.

    .. note::

        It is usually not necessary to create an object of this class
        explicitly. It is returned as a result of call to a
        :meth:`AlignmentFile.fetch`.

    '''

    def __init__(self, AlignmentFile samfile, int multiple_iterators=False):
        cdef char *cfilename
        cdef char *creference_filename
        cdef char *cindexname = NULL

        if not samfile.is_open:
            raise ValueError("I/O operation on closed file")

        # makes sure that samfile stays alive as long as the
        # iterator is alive
        self.samfile = samfile

        # reopen the file - note that this makes the iterator
        # slow and causes pileup to slow down significantly.
        if multiple_iterators:

            cfilename = samfile.filename
            with nogil:
                self.htsfile = hts_open(cfilename, 'r')
            assert self.htsfile != NULL

            if samfile.has_index():
                if samfile.index_filename:
                    cindexname = bindex_filename = encode_filename(samfile.index_filename)
                with nogil:
                    self.index = sam_index_load2(self.htsfile, cfilename, cindexname)
            else:
                self.index = NULL

            # need to advance in newly opened file to position after header
            # better: use seek/tell?
2022 with nogil: 2023 hdr = sam_hdr_read(self.htsfile) 2024 if hdr is NULL: 2025 raise IOError("unable to read header information") 2026 self.header = makeAlignmentHeader(hdr) 2027 2028 self.owns_samfile = True 2029 2030 # options specific to CRAM files 2031 if samfile.is_cram and samfile.reference_filename: 2032 creference_filename = samfile.reference_filename 2033 hts_set_opt(self.htsfile, 2034 CRAM_OPT_REFERENCE, 2035 creference_filename) 2036 2037 else: 2038 self.htsfile = samfile.htsfile 2039 self.index = samfile.index 2040 self.owns_samfile = False 2041 self.header = samfile.header 2042 2043 self.retval = 0 2044 2045 self.b = bam_init1() 2046 2047 def __dealloc__(self): 2048 bam_destroy1(self.b) 2049 if self.owns_samfile: 2050 hts_close(self.htsfile) 2051 hts_idx_destroy(self.index) 2052 2053 2054cdef class IteratorRowRegion(IteratorRow): 2055 """*(AlignmentFile samfile, int tid, int beg, int stop, 2056 int multiple_iterators=False)* 2057 2058 iterate over mapped reads in a region. 2059 2060 .. note:: 2061 2062 It is usually not necessary to create an object of this class 2063 explicitly. It is returned as a result of call to a 2064 :meth:`AlignmentFile.fetch`. 2065 2066 """ 2067 2068 def __init__(self, AlignmentFile samfile, 2069 int tid, int beg, int stop, 2070 int multiple_iterators=False): 2071 2072 if not samfile.has_index(): 2073 raise ValueError("no index available for iteration") 2074 2075 IteratorRow.__init__(self, samfile, 2076 multiple_iterators=multiple_iterators) 2077 with nogil: 2078 self.iter = sam_itr_queryi( 2079 self.index, 2080 tid, 2081 beg, 2082 stop) 2083 2084 def __iter__(self): 2085 return self 2086 2087 cdef bam1_t * getCurrent(self): 2088 return self.b 2089 2090 cdef int cnext(self): 2091 '''cversion of iterator. 
Used by IteratorColumn''' 2092 with nogil: 2093 self.retval = hts_itr_next(hts_get_bgzfp(self.htsfile), 2094 self.iter, 2095 self.b, 2096 self.htsfile) 2097 2098 def __next__(self): 2099 self.cnext() 2100 if self.retval >= 0: 2101 return makeAlignedSegment(self.b, self.header) 2102 elif self.retval == -1: 2103 raise StopIteration 2104 elif self.retval == -2: 2105 # Note: it is currently not the case that hts_iter_next 2106 # returns -2 for a truncated file. 2107 # See https://github.com/pysam-developers/pysam/pull/50#issuecomment-64928625 2108 raise IOError('truncated file') 2109 else: 2110 raise IOError("error while reading file {}: {}".format(self.samfile.filename, self.retval)) 2111 2112 def __dealloc__(self): 2113 hts_itr_destroy(self.iter) 2114 2115 2116cdef class IteratorRowHead(IteratorRow): 2117 """*(AlignmentFile samfile, n, int multiple_iterators=False)* 2118 2119 iterate over first n reads in `samfile` 2120 2121 .. note:: 2122 It is usually not necessary to create an object of this class 2123 explicitly. It is returned as a result of call to a 2124 :meth:`AlignmentFile.head`. 2125 2126 """ 2127 2128 def __init__(self, 2129 AlignmentFile samfile, 2130 int n, 2131 int multiple_iterators=False): 2132 2133 IteratorRow.__init__(self, samfile, 2134 multiple_iterators=multiple_iterators) 2135 2136 self.max_rows = n 2137 self.current_row = 0 2138 2139 def __iter__(self): 2140 return self 2141 2142 cdef bam1_t * getCurrent(self): 2143 return self.b 2144 2145 cdef int cnext(self): 2146 '''cversion of iterator. 
Used by IteratorColumn'''
        cdef int ret
        cdef bam_hdr_t * hdr = self.header.ptr
        # plain sequential read; the head/tail bookkeeping lives in __next__
        with nogil:
            ret = sam_read1(self.htsfile,
                            hdr,
                            self.b)
        return ret

    def __next__(self):
        # stop once the requested number of reads has been returned
        if self.current_row >= self.max_rows:
            raise StopIteration

        cdef int ret = self.cnext()
        if ret >= 0:
            self.current_row += 1
            return makeAlignedSegment(self.b, self.header)
        elif ret == -1:
            # -1 signals normal end of file
            raise StopIteration
        else:
            raise IOError(read_failure_reason(ret))


cdef class IteratorRowAll(IteratorRow):
    """*(AlignmentFile samfile, int multiple_iterators=False)*

    iterate over all reads in `samfile`

    .. note::

        It is usually not necessary to create an object of this class
        explicitly. It is returned as a result of call to a
        :meth:`AlignmentFile.fetch`.

    """

    def __init__(self, AlignmentFile samfile,
                 int multiple_iterators=False):

        IteratorRow.__init__(self, samfile,
                             multiple_iterators=multiple_iterators)

    def __iter__(self):
        return self

    cdef bam1_t * getCurrent(self):
        # expose the current read buffer to C-level callers
        return self.b

    cdef int cnext(self):
        '''cversion of iterator. Used by IteratorColumn'''
        cdef int ret
        cdef bam_hdr_t * hdr = self.header.ptr
        with nogil:
            ret = sam_read1(self.htsfile,
                            hdr,
                            self.b)
        return ret

    def __next__(self):
        cdef int ret = self.cnext()
        if ret >= 0:
            return makeAlignedSegment(self.b, self.header)
        elif ret == -1:
            # -1 signals normal end of file
            raise StopIteration
        else:
            raise IOError(read_failure_reason(ret))


cdef class IteratorRowAllRefs(IteratorRow):
    """iterates over all mapped reads by chaining iterators over each
    reference

    .. note::
        It is usually not necessary to create an object of this class
        explicitly. It is returned as a result of call to a
        :meth:`AlignmentFile.fetch`.

    """

    def __init__(self, AlignmentFile samfile,
                 multiple_iterators=False):

        IteratorRow.__init__(self, samfile,
                             multiple_iterators=multiple_iterators)

        if not samfile.has_index():
            raise ValueError("no index available for fetch")

        # tid == -1 marks "no per-reference iterator created yet"
        self.tid = -1

    def nextiter(self):
        # get a new iterator for a chromosome. The file
        # will not be re-opened.
        self.rowiter = IteratorRowRegion(self.samfile,
                                         self.tid,
                                         0,
                                         MAX_POS)
        # set htsfile and header of the rowiter
        # to the values in this iterator to reflect multiple_iterators
        self.rowiter.htsfile = self.htsfile
        self.rowiter.header = self.header

        # make sure the iterator understand that IteratorRowAllRefs
        # has ownership
        self.rowiter.owns_samfile = False

    def __iter__(self):
        return self

    def __next__(self):
        # Create an initial iterator
        if self.tid == -1:
            if not self.samfile.nreferences:
                raise StopIteration
            self.tid = 0
            self.nextiter()

        while 1:
            self.rowiter.cnext()

            # If current iterator is not exhausted, return aligned read
            # NOTE(review): retval > 0 treats a zero return as exhaustion —
            # confirm against hts_itr_next return-value semantics
            if self.rowiter.retval > 0:
                return makeAlignedSegment(self.rowiter.b, self.header)

            self.tid += 1

            # Otherwise, proceed to next reference or stop
            if self.tid < self.samfile.nreferences:
                self.nextiter()
            else:
                raise StopIteration


cdef class IteratorRowSelection(IteratorRow):
    """*(AlignmentFile samfile)*

    iterate over reads in `samfile` at a given list of file positions.

    .. note::
        It is usually not necessary to create an object of this class
        explicitly. It is returned as a result of call to a :meth:`AlignmentFile.fetch`.
2287 """ 2288 2289 def __init__(self, AlignmentFile samfile, positions, int multiple_iterators=True): 2290 2291 IteratorRow.__init__(self, samfile, multiple_iterators=multiple_iterators) 2292 2293 self.positions = positions 2294 self.current_pos = 0 2295 2296 def __iter__(self): 2297 return self 2298 2299 cdef bam1_t * getCurrent(self): 2300 return self.b 2301 2302 cdef int cnext(self): 2303 '''cversion of iterator''' 2304 # end iteration if out of positions 2305 if self.current_pos >= len(self.positions): return -1 2306 2307 cdef uint64_t pos = self.positions[self.current_pos] 2308 with nogil: 2309 bgzf_seek(hts_get_bgzfp(self.htsfile), 2310 pos, 2311 0) 2312 self.current_pos += 1 2313 2314 cdef int ret 2315 cdef bam_hdr_t * hdr = self.header.ptr 2316 with nogil: 2317 ret = sam_read1(self.htsfile, 2318 hdr, 2319 self.b) 2320 return ret 2321 2322 def __next__(self): 2323 cdef int ret = self.cnext() 2324 if ret >= 0: 2325 return makeAlignedSegment(self.b, self.header) 2326 elif ret == -1: 2327 raise StopIteration 2328 else: 2329 raise IOError(read_failure_reason(ret)) 2330 2331 2332cdef int __advance_nofilter(void *data, bam1_t *b): 2333 '''advance without any read filtering. 2334 ''' 2335 cdef __iterdata * d = <__iterdata*>data 2336 cdef int ret 2337 with nogil: 2338 ret = sam_itr_next(d.htsfile, d.iter, b) 2339 return ret 2340 2341 2342cdef int __advance_raw_nofilter(void *data, bam1_t *b): 2343 '''advance (without iterator) without any read filtering. 
2344 ''' 2345 cdef __iterdata * d = <__iterdata*>data 2346 cdef int ret 2347 with nogil: 2348 ret = sam_read1(d.htsfile, d.header, b) 2349 return ret 2350 2351 2352cdef int __advance_all(void *data, bam1_t *b): 2353 '''only use reads for pileup passing basic filters such as 2354 2355 BAM_FUNMAP, BAM_FSECONDARY, BAM_FQCFAIL, BAM_FDUP 2356 ''' 2357 2358 cdef __iterdata * d = <__iterdata*>data 2359 cdef mask = BAM_FUNMAP | BAM_FSECONDARY | BAM_FQCFAIL | BAM_FDUP 2360 cdef int ret 2361 while 1: 2362 with nogil: 2363 ret = sam_itr_next(d.htsfile, d.iter, b) 2364 if ret < 0: 2365 break 2366 if b.core.flag & d.flag_filter: 2367 continue 2368 break 2369 return ret 2370 2371 2372cdef int __advance_raw_all(void *data, bam1_t *b): 2373 '''only use reads for pileup passing basic filters such as 2374 2375 BAM_FUNMAP, BAM_FSECONDARY, BAM_FQCFAIL, BAM_FDUP 2376 ''' 2377 2378 cdef __iterdata * d = <__iterdata*>data 2379 cdef int ret 2380 while 1: 2381 with nogil: 2382 ret = sam_read1(d.htsfile, d.header, b) 2383 if ret < 0: 2384 break 2385 if b.core.flag & d.flag_filter: 2386 continue 2387 break 2388 return ret 2389 2390 2391cdef int __advance_samtools(void * data, bam1_t * b): 2392 '''advance using same filter and read processing as in 2393 the samtools pileup. 
    '''
    cdef __iterdata * d = <__iterdata*>data
    cdef int ret
    cdef int q

    while 1:
        # read via the region iterator if one was set up, else sequentially
        with nogil:
            ret = sam_itr_next(d.htsfile, d.iter, b) if d.iter else sam_read1(d.htsfile, d.header, b)
        if ret < 0:
            break
        # drop reads matching the exclusion mask ...
        if b.core.flag & d.flag_filter:
            continue
        # ... and reads missing any required flag
        if d.flag_require and not (b.core.flag & d.flag_require):
            continue

        # reload sequence
        if d.fastafile != NULL and b.core.tid != d.tid:
            if d.seq != NULL:
                free(d.seq)
            d.tid = b.core.tid
            with nogil:
                d.seq = faidx_fetch_seq(
                    d.fastafile,
                    d.header.target_name[d.tid],
                    0, MAX_POS,
                    &d.seq_len)

            if d.seq == NULL:
                raise ValueError(
                    "reference sequence for '{}' (tid={}) not found".format(
                        d.header.target_name[d.tid], d.tid))

        # realign read - changes base qualities
        if d.seq != NULL and d.compute_baq:
            # 4th option to realign is flag:
            # apply_baq = flag&1, extend_baq = flag&2, redo_baq = flag&4
            if d.redo_baq:
                sam_prob_realn(b, d.seq, d.seq_len, 7)
            else:
                sam_prob_realn(b, d.seq, d.seq_len, 3)

        # optionally cap the mapping quality (samtools -C behaviour)
        if d.seq != NULL and d.adjust_capq_threshold > 10:
            q = sam_cap_mapq(b, d.seq, d.seq_len, d.adjust_capq_threshold)
            if q < 0:
                continue
            elif b.core.qual > q:
                b.core.qual = q

        if b.core.qual < d.min_mapping_quality:
            continue
        # skip paired reads that are not in a proper pair ("orphans")
        if d.ignore_orphans and b.core.flag & BAM_FPAIRED and not (b.core.flag & BAM_FPROPER_PAIR):
            continue

        break

    return ret


cdef class IteratorColumn:
    '''abstract base class for iterators over columns.

    IteratorColumn objects wrap the pileup functionality of samtools.

    For reasons of efficiency, the iterator points to the current
    pileup buffer. The pileup buffer is updated at every iteration.
    This might cause some unexpected behaviour. For example,
    consider the conversion to a list::

       f = AlignmentFile("file.bam", "rb")
       result = list(f.pileup())

    Here, ``result`` will contain ``n`` objects of type
    :class:`~pysam.PileupColumn` for ``n`` columns, but each object in
    ``result`` will contain the same information.

    The desired behaviour can be achieved by list comprehension::

       result = [x.pileups() for x in f.pileup()]

    ``result`` will be a list of ``n`` lists of objects of type
    :class:`~pysam.PileupRead`.

    If the iterator is associated with a :class:`~pysam.Fastafile`
    using the :meth:`add_reference` method, then the iterator will
    export the current sequence via the methods :meth:`get_sequence`
    and :meth:`seq_len`.

    See :class:`~AlignmentFile.pileup` for kwargs to the iterator.
    '''

    def __cinit__( self, AlignmentFile samfile, **kwargs):
        # copy pileup options into the C-level __iterdata struct that the
        # __advance_* callbacks read
        self.samfile = samfile
        self.fastafile = kwargs.get("fastafile", None)
        self.stepper = kwargs.get("stepper", "samtools")
        self.max_depth = kwargs.get("max_depth", 8000)
        self.ignore_overlaps = kwargs.get("ignore_overlaps", True)
        self.min_base_quality = kwargs.get("min_base_quality", 13)
        self.iterdata.seq = NULL
        self.iterdata.min_mapping_quality = kwargs.get("min_mapping_quality", 0)
        self.iterdata.flag_require = kwargs.get("flag_require", 0)
        self.iterdata.flag_filter = kwargs.get("flag_filter", BAM_FUNMAP | BAM_FSECONDARY | BAM_FQCFAIL | BAM_FDUP)
        self.iterdata.adjust_capq_threshold = kwargs.get("adjust_capq_threshold", 0)
        self.iterdata.compute_baq = kwargs.get("compute_baq", True)
        self.iterdata.redo_baq = kwargs.get("redo_baq", False)
        self.iterdata.ignore_orphans = kwargs.get("ignore_orphans", True)

        self.tid = 0
        self.pos = 0
        self.n_plp = 0
        self.plp = NULL
        self.pileup_iter = <bam_mplp_t>NULL

    def __iter__(self):
        return self

    cdef int
cnext(self):
        '''perform next iteration.
        '''
        # do not release gil here because of call-backs
        cdef int ret = bam_mplp_auto(self.pileup_iter,
                                     &self.tid,
                                     &self.pos,
                                     &self.n_plp,
                                     &self.plp)
        return ret

    cdef char * get_sequence(self):
        '''return current reference sequence underlying the iterator.
        '''
        return self.iterdata.seq

    property seq_len:
        '''current sequence length.'''
        def __get__(self):
            return self.iterdata.seq_len

    def add_reference(self, FastaFile fastafile):
        '''
        add reference sequences in `fastafile` to iterator.'''
        self.fastafile = fastafile
        # drop any cached sequence so it is re-fetched for the new reference
        if self.iterdata.seq != NULL:
            free(self.iterdata.seq)
        self.iterdata.tid = -1
        self.iterdata.fastafile = self.fastafile.fastafile

    def has_reference(self):
        '''
        return true if iterator is associated with a reference'''
        return self.fastafile

    cdef _setup_iterator(self,
                         int tid,
                         int start,
                         int stop,
                         int multiple_iterators=0):
        '''setup the iterator structure'''

        self.iter = IteratorRowRegion(self.samfile, tid, start, stop, multiple_iterators)
        self.iterdata.htsfile = self.samfile.htsfile
        self.iterdata.iter = self.iter.iter
        self.iterdata.seq = NULL
        self.iterdata.tid = -1
        self.iterdata.header = self.samfile.header.ptr

        if self.fastafile is not None:
            self.iterdata.fastafile = self.fastafile.fastafile
        else:
            self.iterdata.fastafile = NULL

        # Free any previously allocated memory before reassigning
        # pileup_iter
        self._free_pileup_iter()

        cdef void * data[1]
        data[0] = <void*>&self.iterdata

        # choose the read-advancing callback according to the stepper
        if self.stepper is None or self.stepper == "all":
            with nogil:
                self.pileup_iter = bam_mplp_init(1,
                                                 <bam_plp_auto_f>&__advance_all,
                                                 data)
        elif self.stepper == "nofilter":
            with nogil:
                self.pileup_iter = bam_mplp_init(1,
                                                 <bam_plp_auto_f>&__advance_nofilter,
                                                 data)
        elif self.stepper == "samtools":
            with nogil:
                self.pileup_iter = bam_mplp_init(1,
                                                 <bam_plp_auto_f>&__advance_samtools,
                                                 data)
        else:
            raise ValueError(
                "unknown stepper option `%s` in IteratorColumn" % self.stepper)

        if self.max_depth:
            with nogil:
                bam_mplp_set_maxcnt(self.pileup_iter, self.max_depth)

        if self.ignore_overlaps:
            with nogil:
                bam_mplp_init_overlaps(self.pileup_iter)

    cdef _setup_raw_rest_iterator(self):
        '''set up an "iterator" that just uses sam_read1(), similar to HTS_IDX_REST'''

        # no region iterator: the __advance_raw_* callbacks read sequentially
        self.iter = None
        self.iterdata.iter = NULL
        self.iterdata.htsfile = self.samfile.htsfile
        self.iterdata.seq = NULL
        self.iterdata.tid = -1
        self.iterdata.header = self.samfile.header.ptr

        if self.fastafile is not None:
            self.iterdata.fastafile = self.fastafile.fastafile
        else:
            self.iterdata.fastafile = NULL

        # Free any previously allocated memory before reassigning
        # pileup_iter
        self._free_pileup_iter()

        cdef void * data[1]
        data[0] = <void*>&self.iterdata

        if self.stepper is None or self.stepper == "all":
            with nogil:
                self.pileup_iter = bam_mplp_init(1,
                                                 <bam_plp_auto_f>&__advance_raw_all,
                                                 data)
        elif self.stepper == "nofilter":
            with nogil:
                self.pileup_iter = bam_mplp_init(1,
                                                 <bam_plp_auto_f>&__advance_raw_nofilter,
                                                 data)
        elif self.stepper == "samtools":
            with nogil:
                self.pileup_iter = bam_mplp_init(1,
                                                 <bam_plp_auto_f>&__advance_samtools,
                                                 data)
        else:
            raise ValueError(
                "unknown stepper option `%s` in IteratorColumn" % self.stepper)

        if self.max_depth:
            with nogil:
                bam_mplp_set_maxcnt(self.pileup_iter, self.max_depth)

        if self.ignore_overlaps:
            with nogil:
                bam_mplp_init_overlaps(self.pileup_iter)

    cdef reset(self, tid, start, stop):
        '''reset iterator position.

        This permits using the iterator multiple times without
        having to incur the full set-up costs.
        '''
        if self.iter is None:
            raise TypeError("Raw iterator set up without region cannot be reset")

        self.iter = IteratorRowRegion(self.samfile, tid, start, stop, multiple_iterators=0)
        self.iterdata.iter = self.iter.iter

        # invalidate sequence if different tid
        if self.tid != tid:
            if self.iterdata.seq != NULL:
                free(self.iterdata.seq)
            self.iterdata.seq = NULL
            self.iterdata.tid = -1

        with nogil:
            bam_mplp_reset(self.pileup_iter)

    cdef _free_pileup_iter(self):
        '''free the memory alloc'd by bam_plp_init.

        This is needed before setup_iterator allocates another
        pileup_iter, or else memory will be lost. '''
        if self.pileup_iter != <bam_mplp_t>NULL:
            with nogil:
                bam_mplp_reset(self.pileup_iter)
                bam_mplp_destroy(self.pileup_iter)
            self.pileup_iter = <bam_mplp_t>NULL

    def __dealloc__(self):
        # reset in order to avoid memory leak messages for iterators
        # that have not been fully consumed
        self._free_pileup_iter()
        self.plp = <const bam_pileup1_t*>NULL

        if self.iterdata.seq != NULL:
            free(self.iterdata.seq)
            self.iterdata.seq = NULL

    # backwards compatibility

    def hasReference(self):
        return self.has_reference()
    cdef char * getSequence(self):
        return self.get_sequence()
    def addReference(self, FastaFile fastafile):
        return self.add_reference(fastafile)


cdef class IteratorColumnRegion(IteratorColumn):
    '''iterates over a region only.
    '''
    def __cinit__(self,
                  AlignmentFile samfile,
                  int tid = 0,
                  int start = 0,
                  int stop = MAX_POS,
                  int truncate = False,
                  int multiple_iterators = True,
                  **kwargs ):

        # initialize iterator. Multiple iterators not available
        # for CRAM.
        if multiple_iterators and samfile.is_cram:
            warnings.warn("multiple_iterators not implemented for CRAM")
            multiple_iterators = False

        self._setup_iterator(tid, start, stop, multiple_iterators)
        self.start = start
        self.stop = stop
        self.truncate = truncate

    def __next__(self):

        cdef int n

        while 1:
            n = self.cnext()
            if n < 0:
                raise ValueError("error during iteration" )

            if n == 0:
                raise StopIteration

            # with truncate, only emit columns strictly inside [start, stop)
            if self.truncate:
                if self.start > self.pos:
                    continue
                if self.pos >= self.stop:
                    raise StopIteration

            return makePileupColumn(&self.plp,
                                    self.tid,
                                    self.pos,
                                    self.n_plp,
                                    self.min_base_quality,
                                    self.iterdata.seq,
                                    self.samfile.header)


cdef class IteratorColumnAllRefs(IteratorColumn):
    """iterates over all columns by chaining iterators over each reference
    """

    def __cinit__(self,
                  AlignmentFile samfile,
                  **kwargs):

        # no iteration over empty files
        # NOTE(review): raising StopIteration from a constructor is unusual —
        # callers see it as the constructor call failing, not as iteration end
        if not samfile.nreferences:
            raise StopIteration

        # initialize iterator
        self._setup_iterator(self.tid, 0, MAX_POS, 1)

    def __next__(self):

        cdef int n
        while 1:
            n = self.cnext()
            if n < 0:
                raise ValueError("error during iteration")

            # proceed to next reference or stop
            if n == 0:
                self.tid += 1
                if self.tid < self.samfile.nreferences:
                    self._setup_iterator(self.tid, 0, MAX_POS, 0)
                else:
                    raise StopIteration
                continue

            # return result, if within same reference
            return makePileupColumn(&self.plp,
                                    self.tid,
                                    self.pos,
                                    self.n_plp,
                                    self.min_base_quality,
                                    self.iterdata.seq,
                                    self.samfile.header)


cdef class IteratorColumnAll(IteratorColumn):
    """iterates over all columns, without using an index
    """

    def __cinit__(self,
                  AlignmentFile samfile,
                  **kwargs):

        self._setup_raw_rest_iterator()

    def __next__(self):

        cdef int n
        n = self.cnext()
        if n < 0:
            raise ValueError("error during iteration")

        if n == 0:
            raise StopIteration

        return makePileupColumn(&self.plp,
                                self.tid,
                                self.pos,
                                self.n_plp,
                                self.min_base_quality,
                                self.iterdata.seq,
                                self.samfile.header)


cdef class SNPCall:
    '''the results of a SNP call.'''
    # C-level storage backing the read-only properties below
    cdef int _tid
    cdef int _pos
    cdef char _reference_base
    cdef char _genotype
    cdef int _consensus_quality
    cdef int _snp_quality
    cdef int _rms_mapping_quality
    cdef int _coverage

    property tid:
        '''the chromosome ID as is defined in the header'''
        def __get__(self):
            return self._tid

    property pos:
        '''nucleotide position of SNP.'''
        def __get__(self): return self._pos

    property reference_base:
        '''reference base at pos.
``N`` if no reference sequence supplied.'''
        def __get__(self): return from_string_and_size( &self._reference_base, 1 )

    property genotype:
        '''the genotype called.'''
        def __get__(self): return from_string_and_size( &self._genotype, 1 )

    property consensus_quality:
        '''the genotype quality (Phred-scaled).'''
        def __get__(self): return self._consensus_quality

    property snp_quality:
        '''the snp quality (Phred scaled) - probability of consensus being
        identical to reference sequence.'''
        def __get__(self): return self._snp_quality

    property mapping_quality:
        '''the root mean square (rms) of the mapping quality of all reads
        involved in the call.'''
        def __get__(self): return self._rms_mapping_quality

    property coverage:
        '''coverage or read depth - the number of reads involved in the call.'''
        def __get__(self): return self._coverage

    def __str__(self):
        # tab-separated rendering of all call fields
        return "\t".join( map(str, (
            self.tid,
            self.pos,
            self.reference_base,
            self.genotype,
            self.consensus_quality,
            self.snp_quality,
            self.mapping_quality,
            self.coverage ) ) )


cdef class IndexedReads:
    """Index a Sam/BAM-file by query name while keeping the
    original sort order intact.

    The index is kept in memory and can be substantial.

    By default, the file is re-openend to avoid conflicts if multiple
    operators work on the same file. Set `multiple_iterators` = False
    to not re-open `samfile`.

    Parameters
    ----------

    samfile : AlignmentFile
        File to be indexed.

    multiple_iterators : bool
        Flag indicating whether the file should be reopened. Reopening prevents
        existing iterators being affected by the indexing.

    """

    def __init__(self, AlignmentFile samfile, int multiple_iterators=True):
        cdef char *cfilename

        # makes sure that samfile stays alive as long as this
        # object is alive.
        self.samfile = samfile
        cdef bam_hdr_t * hdr = NULL
        assert samfile.is_bam, "can only apply IndexReads on bam files"

        # multiple_iterators the file - note that this makes the iterator
        # slow and causes pileup to slow down significantly.
        if multiple_iterators:
            cfilename = samfile.filename
            with nogil:
                self.htsfile = hts_open(cfilename, 'r')
            if self.htsfile == NULL:
                raise OSError("unable to reopen htsfile")

            # need to advance in newly opened file to position after header
            # better: use seek/tell?
            with nogil:
                hdr = sam_hdr_read(self.htsfile)
            if hdr == NULL:
                raise OSError("unable to read header information")
            self.header = makeAlignmentHeader(hdr)
            self.owns_samfile = True
        else:
            # share the caller's handle; nothing to close in __dealloc__
            self.htsfile = self.samfile.htsfile
            self.header = samfile.header
            self.owns_samfile = False

    def build(self):
        '''build the index.

        Maps each query name to the list of BGZF virtual offsets
        (bgzf_tell) at which a read with that name starts.
        '''

        self.index = collections.defaultdict(list)

        # this method will start indexing from the current file position
        cdef int ret = 1
        cdef bam1_t * b = <bam1_t*>calloc(1, sizeof( bam1_t))
        if b == NULL:
            raise MemoryError("could not allocate {} bytes".format(sizeof(bam1_t)))

        cdef uint64_t pos
        cdef bam_hdr_t * hdr = self.header.ptr

        while ret > 0:
            # record the offset *before* reading, so it points at the read
            with nogil:
                pos = bgzf_tell(hts_get_bgzfp(self.htsfile))
                ret = sam_read1(self.htsfile,
                                hdr,
                                b)

            if ret > 0:
                qname = charptr_to_str(pysam_bam_get_qname(b))
                self.index[qname].append(pos)

        bam_destroy1(b)

    def find(self, query_name):
        '''find `query_name` in index.

        Returns
        -------

        IteratorRowSelection
            Returns an iterator over all reads with query_name.

        Raises
        ------

        KeyError
            if the `query_name` is not in the index.

        '''
        if query_name in self.index:
            return IteratorRowSelection(
                self.samfile,
                self.index[query_name],
                multiple_iterators = False)
        else:
            raise KeyError("read %s not found" % query_name)

    def __dealloc__(self):
        # only close the handle if __init__ re-opened it
        if self.owns_samfile:
            hts_close(self.htsfile)