# Bio/SeqIO/TwoBitIO.py

# Copyright 2020 by Michiel de Hoon
#
# This file is part of the Biopython distribution and governed by your
# choice of the "Biopython License Agreement" or the "BSD 3-Clause License".
# Please see the LICENSE file that should have been included as part of this
# package.
"""Bio.SeqIO support for UCSC's "twoBit" (.2bit) file format.

This parser reads the index stored in the twoBit file, as well as the masked
regions and the N's for each sequence. It also creates sequence data objects
(_TwoBitSequenceData objects), which support only two methods: __len__ and
__getitem__. The former will return the length of the sequence, while the
latter returns the sequence (as a bytes object) for the requested region.

Using the information in the index, the __getitem__ method calculates the file
position at which the requested region starts, and only reads the requested
sequence region. Note that the full sequence of a record is loaded only if
specifically requested, making the parser memory-efficient.

The TwoBitIterator object implements the __getitem__, keys, and __len__
methods that allow it to be used as a dictionary.
"""
# The .2bit file format is defined by UCSC as follows
# (see http://genome.ucsc.edu/FAQ/FAQformat.html#format7):
#
#
# A .2bit file stores multiple DNA sequences (up to 4 Gb total) in a compact
# randomly-accessible format. The file contains masking information as well
# as the DNA itself.
#
# The file begins with a 16-byte header containing the following fields:
#
# signature - the number 0x1A412743 in the architecture of the machine that
#             created the file
# version - zero for now. Readers should abort if they see a version number
#           higher than 0
# sequenceCount - the number of sequences in the file
# reserved - always zero for now
#
# All fields are 32 bits unless noted. If the signature value is not as
# given, the reader program should byte-swap the signature and check if the
# swapped version matches. If so, all multiple-byte entities in the file
# will have to be byte-swapped. This enables these binary files to be used
# unchanged on different architectures.
#
# The header is followed by a file index, which contains one entry for each
# sequence. Each index entry contains three fields:
#
# nameSize - a byte containing the length of the name field
# name - the sequence name itself (in ASCII-compatible byte string), of
#        variable length depending on nameSize
# offset - the 32-bit offset of the sequence data relative to the start of
#          the file, not aligned to any 4-byte padding boundary
#
# The index is followed by the sequence records, which contain nine fields:
#
# dnaSize - number of bases of DNA in the sequence
# nBlockCount - the number of blocks of Ns in the file (representing unknown
#               sequence)
# nBlockStarts - an array of length nBlockCount of 32 bit integers
#                indicating the (0-based) starting position of a block of Ns
# nBlockSizes - an array of length nBlockCount of 32 bit integers indicating
#               the length of a block of Ns
# maskBlockCount - the number of masked (lower-case) blocks
# maskBlockStarts - an array of length maskBlockCount of 32 bit integers
#                   indicating the (0-based) starting position of a masked block
# maskBlockSizes - an array of length maskBlockCount of 32 bit integers
#                  indicating the length of a masked block
# reserved - always zero for now
# packedDna - the DNA packed to two bits per base, represented as so:
#             T - 00, C - 01, A - 10, G - 11. The first base is in the most
#             significant 2-bit byte; the last base is in the least significant
#             2 bits. For example, the sequence TCAG is represented as 00011011.
import numpy

from Bio.Seq import Seq
from Bio.Seq import SequenceDataAbstractBaseClass
from Bio.SeqRecord import SeqRecord

from . import _twoBitIO
from .Interfaces import SequenceIterator


class _TwoBitSequenceData(SequenceDataAbstractBaseClass):
    """Stores information needed to retrieve sequence data from a .2bit file (PRIVATE).

    Objects of this class store the file position at which the sequence data
    start, the sequence length, and the start and end position of unknown (N)
    and masked (lowercase) letters in the sequence.

    The sequence itself is not kept in memory: __getitem__ reads only the
    bytes covering the requested region from the file each time it is called.
    __len__ returns the sequence length, and upper / lower return new objects
    for the same file region with the mask removed or extended to the full
    sequence.
    """

    # stream     - file object from which the packed DNA is read
    # offset     - file position used as the start of the packed DNA
    # length     - number of letters in the sequence
    # nBlocks    - two-column array; each row is the (start, end) of a block of Ns
    # maskBlocks - two-column array; each row is the (start, end) of a masked block
    __slots__ = ("stream", "offset", "length", "nBlocks", "maskBlocks")

    def __init__(self, stream, offset, length):
        """Initialize the file stream and file position of the sequence data.

        The nBlocks and maskBlocks attributes are not set here; they must be
        assigned before the object is indexed.
        """
        self.stream = stream
        self.offset = offset
        self.length = length
        super().__init__()

    def __getitem__(self, key):
        """Return the sequence data for an index or slice, reading it from the file.

        For a slice, the value returned by ``_twoBitIO.convert`` is passed on
        (``b""`` if the slice is empty). For an integer index, the single
        letter is returned as an integer code (via ``ord``).

        Raises IndexError if an integer index falls outside the sequence, and
        ValueError if the underlying file has been closed.
        """
        length = self.length
        if isinstance(key, slice):
            start, end, step = key.indices(length)
            if len(range(start, end, step)) == 0:
                return b""
        else:
            if key < 0:
                key += length
            # Reject indices on either side of the sequence; without the upper
            # bound check the code below would seek past the end of this
            # sequence's data and decode whatever bytes follow it in the file.
            if not 0 <= key < length:
                raise IndexError("index out of range")
            start = key
            end = key + 1
            step = 1
        # Four letters are packed into each byte, so convert the requested
        # letter positions into the range of bytes that covers them.
        # NOTE(review): for a negative step, start > end and byteSize is not
        # positive; confirm how numpy.fromfile and _twoBitIO.convert treat
        # that case.
        byteStart = start // 4
        byteEnd = (end + 3) // 4
        byteSize = byteEnd - byteStart
        stream = self.stream
        try:
            stream.seek(self.offset + byteStart)
        except ValueError as exception:
            # Replace the low-level message by one that explains the situation
            # to the user; any other ValueError is passed on unchanged.
            if str(exception) == "seek of closed file":
                raise ValueError("cannot retrieve sequence: file is closed") from None
            raise
        data = numpy.fromfile(stream, dtype="uint8", count=byteSize)
        sequence = _twoBitIO.convert(
            data, start, end, step, self.nBlocks, self.maskBlocks
        )
        if isinstance(key, slice):
            return sequence
        else:  # single nucleotide
            return ord(sequence)

    def __len__(self):
        """Return the number of letters in the sequence."""
        return self.length

    def upper(self):
        """Remove the sequence mask.

        Returns a new object for the same stream, offset, and length, with the
        same blocks of Ns but without any masked blocks.
        """
        data = _TwoBitSequenceData(self.stream, self.offset, self.length)
        data.nBlocks = self.nBlocks[:, :]
        data.maskBlocks = numpy.empty((0, 2), dtype="uint32")
        return data

    def lower(self):
        """Extend the sequence mask to the full sequence.

        Returns a new object for the same stream, offset, and length, with the
        same blocks of Ns and a single masked block running from 0 to the
        sequence length.
        """
        data = _TwoBitSequenceData(self.stream, self.offset, self.length)
        data.nBlocks = self.nBlocks[:, :]
        data.maskBlocks = numpy.array([[0, self.length]], dtype="uint32")
        return data


class TwoBitIterator(SequenceIterator):
    """Parser for UCSC twoBit (.2bit) files.

    The file header, the index, and the N and mask blocks of every sequence
    are read when the object is created. The sequence data themselves stay in
    the file and are wrapped in _TwoBitSequenceData objects. In addition to
    iteration, the __getitem__, keys, and __len__ methods give dictionary-like
    access to the records by sequence name.
    """

    def __init__(self, source):
        """Read the file index.

        Raises ValueError if the file is empty, if the signature is not
        recognized in either byte order, if the file version is not 0, or if
        a reserved field is non-zero.
        """
        super().__init__(source, mode="b", fmt="twoBit")
        # wait to close the file until the TwoBitIterator goes out of scope:
        self.should_close_stream = False
        stream = self.stream
        data = stream.read(4)
        if not data:
            raise ValueError("Empty file.")
        # Determine the byte order of the file by interpreting the signature
        # both ways; dtype is the matching numpy type for 32-bit fields and is
        # used below to read the block arrays.
        byteorders = ("little", "big")
        dtypes = ("<u4", ">u4")
        for byteorder, dtype in zip(byteorders, dtypes):
            signature = int.from_bytes(data, byteorder)
            if signature == 0x1A412743:
                break
        else:
            raise ValueError("Unknown signature")
        self.byteorder = byteorder
        data = stream.read(4)
        version = int.from_bytes(data, byteorder, signed=False)
        if version == 1:
            raise ValueError(
                "version-1 twoBit files with 64-bit offsets for index are currently not supported"
            )
        if version != 0:
            raise ValueError("Found unexpected file version %u; aborting" % version)
        data = stream.read(4)
        sequenceCount = int.from_bytes(data, byteorder, signed=False)
        data = stream.read(4)
        reserved = int.from_bytes(data, byteorder, signed=False)
        if reserved != 0:
            raise ValueError("Found non-zero reserved field; aborting")
        # First pass: read the index. Each entry consists of a one-byte name
        # length, the name itself, and the 32-bit offset of the sequence record.
        sequences = {}
        for _ in range(sequenceCount):
            data = stream.read(1)
            nameSize = int.from_bytes(data, byteorder, signed=False)
            data = stream.read(nameSize)
            name = data.decode("ASCII")
            data = stream.read(4)
            offset = int.from_bytes(data, byteorder, signed=False)
            sequences[name] = (stream, offset)
        self.sequences = sequences
        # Second pass: visit each sequence record and replace the
        # (stream, offset) placeholder by a _TwoBitSequenceData object. Only
        # values of existing keys are replaced, so the dictionary can safely
        # be modified while iterating over it.
        for name, (stream, offset) in sequences.items():
            stream.seek(offset)
            data = stream.read(4)
            dnaSize = int.from_bytes(data, byteorder, signed=False)
            sequence = _TwoBitSequenceData(stream, offset, dnaSize)
            data = stream.read(4)
            nBlockCount = int.from_bytes(data, byteorder, signed=False)
            nBlockStarts = numpy.fromfile(stream, dtype=dtype, count=nBlockCount)
            nBlockSizes = numpy.fromfile(stream, dtype=dtype, count=nBlockCount)
            # Store the blocks as (start, end) rows instead of (start, size).
            sequence.nBlocks = numpy.empty((nBlockCount, 2), dtype="uint32")
            sequence.nBlocks[:, 0] = nBlockStarts
            sequence.nBlocks[:, 1] = nBlockStarts + nBlockSizes
            data = stream.read(4)
            maskBlockCount = int.from_bytes(data, byteorder, signed=False)
            maskBlockStarts = numpy.fromfile(stream, dtype=dtype, count=maskBlockCount)
            maskBlockSizes = numpy.fromfile(stream, dtype=dtype, count=maskBlockCount)
            sequence.maskBlocks = numpy.empty((maskBlockCount, 2), dtype="uint32")
            sequence.maskBlocks[:, 0] = maskBlockStarts
            sequence.maskBlocks[:, 1] = maskBlockStarts + maskBlockSizes
            data = stream.read(4)
            reserved = int.from_bytes(data, byteorder, signed=False)
            if reserved != 0:
                raise ValueError("Found non-zero reserved field %u" % reserved)
            # The packed DNA starts right after the reserved field; from now
            # on the offset refers to that position rather than to the start
            # of the sequence record.
            sequence.offset = stream.tell()
            sequences[name] = sequence

    def parse(self, stream):
        """Iterate over the sequences in the file.

        The stream argument is not used here; the records are created from the
        index that was read when the object was initialized.
        """
        for name, sequence in self.sequences.items():
            sequence = Seq(sequence)
            record = SeqRecord(sequence, id=name)
            yield record

    def __getitem__(self, name):
        """Return the record for the sequence with the given name.

        Raises KeyError if the file does not contain a sequence by that name.
        """
        try:
            sequence = self.sequences[name]
        except KeyError:
            # A failed dictionary lookup raises KeyError (not ValueError);
            # re-raise it without the internal lookup in the traceback.
            raise KeyError(name) from None
        sequence = Seq(sequence)
        return SeqRecord(sequence, id=name)

    def keys(self):
        """Return a dictionary keys view of the names of the sequences in the file."""
        return self.sequences.keys()

    def __len__(self):
        """Return the number of sequences in the file."""
        return len(self.sequences)