1# Copyright 2020 by Michiel de Hoon
2#
3# This file is part of the Biopython distribution and governed by your
4# choice of the "Biopython License Agreement" or the "BSD 3-Clause License".
5# Please see the LICENSE file that should have been included as part of this
6# package.
7"""Bio.SeqIO support for UCSC's "twoBit" (.2bit) file format.
8
9This parser reads the index stored in the twoBit file, as well as the masked
10regions and the N's for each sequence. It also creates sequence data objects
11(_TwoBitSequenceData objects), which support only two methods: __len__ and
12__getitem__. The former will return the length of the sequence, while the
13latter returns the sequence (as a bytes object) for the requested region.
14
15Using the information in the index, the __getitem__ method calculates the file
16position at which the requested region starts, and only reads the requested
17sequence region. Note that the full sequence of a record is loaded only if
18specifically requested, making the parser memory-efficient.
19
20The TwoBitIterator object implements the __getitem__, keys, and __len__
21methods that allow it to be used as a dictionary.
22"""
23# The .2bit file format is defined by UCSC as follows
24# (see http://genome.ucsc.edu/FAQ/FAQformat.html#format7):
25#
26#
27# A .2bit file stores multiple DNA sequences (up to 4 Gb total) in a compact
28# randomly-accessible format. The file contains masking information as well
29# as the DNA itself.
30#
31# The file begins with a 16-byte header containing the following fields:
32#
33# signature - the number 0x1A412743 in the architecture of the machine that
34#             created the file
35# version - zero for now. Readers should abort if they see a version number
36#           higher than 0
37# sequenceCount - the number of sequences in the file
38# reserved - always zero for now
39#
40# All fields are 32 bits unless noted. If the signature value is not as
41# given, the reader program should byte-swap the signature and check if the
42# swapped version matches. If so, all multiple-byte entities in the file
43# will have to be byte-swapped. This enables these binary files to be used
44# unchanged on different architectures.
45#
46# The header is followed by a file index, which contains one entry for each
47# sequence. Each index entry contains three fields:
48#
49# nameSize - a byte containing the length of the name field
50# name - the sequence name itself (in ASCII-compatible byte string), of
51#        variable length depending on nameSize
52# offset - the 32-bit offset of the sequence data relative to the start of
53#          the file, not aligned to any 4-byte padding boundary
54#
55# The index is followed by the sequence records, which contain nine fields:
56#
57# dnaSize - number of bases of DNA in the sequence
58# nBlockCount - the number of blocks of Ns in the file (representing unknown
59#               sequence)
60# nBlockStarts - an array of length nBlockCount of 32 bit integers
61#                indicating the (0-based) starting position of a block of Ns
62# nBlockSizes - an array of length nBlockCount of 32 bit integers indicating
63#               the length of a block of Ns
64# maskBlockCount - the number of masked (lower-case) blocks
65# maskBlockStarts - an array of length maskBlockCount of 32 bit integers
66#                   indicating the (0-based) starting position of a masked block
67# maskBlockSizes - an array of length maskBlockCount of 32 bit integers
68#                  indicating the length of a masked block
69# reserved - always zero for now
70# packedDna - the DNA packed to two bits per base, represented as so:
71#             T - 00, C - 01, A - 10, G - 11. The first base is in the most
72#             significant 2-bit byte; the last base is in the least significan
73#             2 bits. For example, the sequence TCAG is represented as 00011011.
74import numpy
75
76from Bio.Seq import Seq
77from Bio.Seq import SequenceDataAbstractBaseClass
78from Bio.SeqRecord import SeqRecord
79
80from . import _twoBitIO
81from .Interfaces import SequenceIterator
82
83
84class _TwoBitSequenceData(SequenceDataAbstractBaseClass):
85    """Stores information needed to retrieve sequence data from a .2bit file (PRIVATE).
86
87    Objects of this class store the file position at which the sequence data
88    start, the sequence length, and the start and end position of unknown (N)
89    and masked (lowercase) letters in the sequence.
90
91    Only two methods are provided: __len__ and __getitem__. The former will
92    return the length of the sequence, while the latter returns the sequence
93    (as a bytes object) for the requested region. The full sequence of a record
94    is loaded only if explicitly requested.
95    """
96
97    __slots__ = ("stream", "offset", "length", "nBlocks", "maskBlocks")
98
99    def __init__(self, stream, offset, length):
100        """Initialize the file stream and file position of the sequence data."""
101        self.stream = stream
102        self.offset = offset
103        self.length = length
104        super().__init__()
105
106    def __getitem__(self, key):
107        length = self.length
108        if isinstance(key, slice):
109            start, end, step = key.indices(length)
110            size = len(range(start, end, step))
111            if size == 0:
112                return b""
113        else:
114            if key < 0:
115                key += length
116                if key < 0:
117                    raise IndexError("index out of range")
118            start = key
119            end = key + 1
120            step = 1
121            size = 1
122        byteStart = start // 4
123        byteEnd = (end + 3) // 4
124        byteSize = byteEnd - byteStart
125        stream = self.stream
126        try:
127            stream.seek(self.offset + byteStart)
128        except ValueError as exception:
129            if str(exception) == "seek of closed file":
130                raise ValueError("cannot retrieve sequence: file is closed") from None
131            raise
132        data = numpy.fromfile(stream, dtype="uint8", count=byteSize)
133        sequence = _twoBitIO.convert(
134            data, start, end, step, self.nBlocks, self.maskBlocks
135        )
136        if isinstance(key, slice):
137            return sequence
138        else:  # single nucleotide
139            return ord(sequence)
140
141    def __len__(self):
142        return self.length
143
144    def upper(self):
145        """Remove the sequence mask."""
146        data = _TwoBitSequenceData(self.stream, self.offset, self.length)
147        data.nBlocks = self.nBlocks[:, :]
148        data.maskBlocks = numpy.empty((0, 2), dtype="uint32")
149        return data
150
151    def lower(self):
152        """Extend the sequence mask to the full sequence."""
153        data = _TwoBitSequenceData(self.stream, self.offset, self.length)
154        data.nBlocks = self.nBlocks[:, :]
155        data.maskBlocks = numpy.array([[0, self.length]], dtype="uint32")
156        return data
157
158
159class TwoBitIterator(SequenceIterator):
160    """Parser for UCSC twoBit (.2bit) files."""
161
162    def __init__(self, source):
163        """Read the file index."""
164        super().__init__(source, mode="b", fmt="twoBit")
165        # wait to close the file until the TwoBitIterator goes out of scope:
166        self.should_close_stream = False
167        stream = self.stream
168        data = stream.read(4)
169        if not data:
170            raise ValueError("Empty file.")
171        byteorders = ("little", "big")
172        dtypes = ("<u4", ">u4")
173        for byteorder, dtype in zip(byteorders, dtypes):
174            signature = int.from_bytes(data, byteorder)
175            if signature == 0x1A412743:
176                break
177        else:
178            raise ValueError("Unknown signature")
179        self.byteorder = byteorder
180        data = stream.read(4)
181        version = int.from_bytes(data, byteorder, signed=False)
182        if version == 1:
183            raise ValueError(
184                "version-1 twoBit files with 64-bit offsets for index are currently not supported"
185            )
186        if version != 0:
187            raise ValueError("Found unexpected file version %u; aborting" % version)
188        data = stream.read(4)
189        sequenceCount = int.from_bytes(data, byteorder, signed=False)
190        data = stream.read(4)
191        reserved = int.from_bytes(data, byteorder, signed=False)
192        if reserved != 0:
193            raise ValueError("Found non-zero reserved field; aborting")
194        sequences = {}
195        for i in range(sequenceCount):
196            data = stream.read(1)
197            nameSize = int.from_bytes(data, byteorder, signed=False)
198            data = stream.read(nameSize)
199            name = data.decode("ASCII")
200            data = stream.read(4)
201            offset = int.from_bytes(data, byteorder, signed=False)
202            sequences[name] = (stream, offset)
203        self.sequences = sequences
204        for name, (stream, offset) in sequences.items():
205            stream.seek(offset)
206            data = stream.read(4)
207            dnaSize = int.from_bytes(data, byteorder, signed=False)
208            sequence = _TwoBitSequenceData(stream, offset, dnaSize)
209            data = stream.read(4)
210            nBlockCount = int.from_bytes(data, byteorder, signed=False)
211            nBlockStarts = numpy.fromfile(stream, dtype=dtype, count=nBlockCount)
212            nBlockSizes = numpy.fromfile(stream, dtype=dtype, count=nBlockCount)
213            sequence.nBlocks = numpy.empty((nBlockCount, 2), dtype="uint32")
214            sequence.nBlocks[:, 0] = nBlockStarts
215            sequence.nBlocks[:, 1] = nBlockStarts + nBlockSizes
216            data = stream.read(4)
217            maskBlockCount = int.from_bytes(data, byteorder, signed=False)
218            maskBlockStarts = numpy.fromfile(stream, dtype=dtype, count=maskBlockCount)
219            maskBlockSizes = numpy.fromfile(stream, dtype=dtype, count=maskBlockCount)
220            sequence.maskBlocks = numpy.empty((maskBlockCount, 2), dtype="uint32")
221            sequence.maskBlocks[:, 0] = maskBlockStarts
222            sequence.maskBlocks[:, 1] = maskBlockStarts + maskBlockSizes
223            data = stream.read(4)
224            reserved = int.from_bytes(data, byteorder, signed=False)
225            if reserved != 0:
226                raise ValueError("Found non-zero reserved field %u" % reserved)
227            sequence.offset = stream.tell()
228            sequences[name] = sequence
229
230    def parse(self, stream):
231        """Iterate over the sequences in the file."""
232        for name, sequence in self.sequences.items():
233            sequence = Seq(sequence)
234            record = SeqRecord(sequence, id=name)
235            yield record
236
237    def __getitem__(self, name):
238        try:
239            sequence = self.sequences[name]
240        except ValueError:
241            raise KeyError(name) from None
242        sequence = Seq(sequence)
243        return SeqRecord(sequence, id=name)
244
245    def keys(self):
246        """Return a list with the names of the sequences in the file."""
247        return self.sequences.keys()
248
249    def __len__(self):
250        return len(self.sequences)
251