1# Copyright 2020 by Michiel de Hoon 2# 3# This file is part of the Biopython distribution and governed by your 4# choice of the "Biopython License Agreement" or the "BSD 3-Clause License". 5# Please see the LICENSE file that should have been included as part of this 6# package. 7"""Bio.SeqIO support for UCSC's "twoBit" (.2bit) file format. 8 9This parser reads the index stored in the twoBit file, as well as the masked 10regions and the N's for each sequence. It also creates sequence data objects 11(_TwoBitSequenceData objects), which support only two methods: __len__ and 12__getitem__. The former will return the length of the sequence, while the 13latter returns the sequence (as a bytes object) for the requested region. 14 15Using the information in the index, the __getitem__ method calculates the file 16position at which the requested region starts, and only reads the requested 17sequence region. Note that the full sequence of a record is loaded only if 18specifically requested, making the parser memory-efficient. 19 20The TwoBitIterator object implements the __getitem__, keys, and __len__ 21methods that allow it to be used as a dictionary. 22""" 23# The .2bit file format is defined by UCSC as follows 24# (see http://genome.ucsc.edu/FAQ/FAQformat.html#format7): 25# 26# 27# A .2bit file stores multiple DNA sequences (up to 4 Gb total) in a compact 28# randomly-accessible format. The file contains masking information as well 29# as the DNA itself. 30# 31# The file begins with a 16-byte header containing the following fields: 32# 33# signature - the number 0x1A412743 in the architecture of the machine that 34# created the file 35# version - zero for now. Readers should abort if they see a version number 36# higher than 0 37# sequenceCount - the number of sequences in the file 38# reserved - always zero for now 39# 40# All fields are 32 bits unless noted. If the signature value is not as 41# given, the reader program should byte-swap the signature and check if the 42# swapped version matches. If so, all multiple-byte entities in the file 43# will have to be byte-swapped. This enables these binary files to be used 44# unchanged on different architectures. 45# 46# The header is followed by a file index, which contains one entry for each 47# sequence. Each index entry contains three fields: 48# 49# nameSize - a byte containing the length of the name field 50# name - the sequence name itself (in ASCII-compatible byte string), of 51# variable length depending on nameSize 52# offset - the 32-bit offset of the sequence data relative to the start of 53# the file, not aligned to any 4-byte padding boundary 54# 55# The index is followed by the sequence records, which contain nine fields: 56# 57# dnaSize - number of bases of DNA in the sequence 58# nBlockCount - the number of blocks of Ns in the file (representing unknown 59# sequence) 60# nBlockStarts - an array of length nBlockCount of 32 bit integers 61# indicating the (0-based) starting position of a block of Ns 62# nBlockSizes - an array of length nBlockCount of 32 bit integers indicating 63# the length of a block of Ns 64# maskBlockCount - the number of masked (lower-case) blocks 65# maskBlockStarts - an array of length maskBlockCount of 32 bit integers 66# indicating the (0-based) starting position of a masked block 67# maskBlockSizes - an array of length maskBlockCount of 32 bit integers 68# indicating the length of a masked block 69# reserved - always zero for now 70# packedDna - the DNA packed to two bits per base, represented as so: 71# T - 00, C - 01, A - 10, G - 11. The first base is in the most 72# significant 2-bit byte; the last base is in the least significan 73# 2 bits. For example, the sequence TCAG is represented as 00011011. 74import numpy 75 76from Bio.Seq import Seq 77from Bio.Seq import SequenceDataAbstractBaseClass 78from Bio.SeqRecord import SeqRecord 79 80from . import _twoBitIO 81from .Interfaces import SequenceIterator 82 83 84class _TwoBitSequenceData(SequenceDataAbstractBaseClass): 85 """Stores information needed to retrieve sequence data from a .2bit file (PRIVATE). 86 87 Objects of this class store the file position at which the sequence data 88 start, the sequence length, and the start and end position of unknown (N) 89 and masked (lowercase) letters in the sequence. 90 91 Only two methods are provided: __len__ and __getitem__. The former will 92 return the length of the sequence, while the latter returns the sequence 93 (as a bytes object) for the requested region. The full sequence of a record 94 is loaded only if explicitly requested. 95 """ 96 97 __slots__ = ("stream", "offset", "length", "nBlocks", "maskBlocks") 98 99 def __init__(self, stream, offset, length): 100 """Initialize the file stream and file position of the sequence data.""" 101 self.stream = stream 102 self.offset = offset 103 self.length = length 104 super().__init__() 105 106 def __getitem__(self, key): 107 length = self.length 108 if isinstance(key, slice): 109 start, end, step = key.indices(length) 110 size = len(range(start, end, step)) 111 if size == 0: 112 return b"" 113 else: 114 if key < 0: 115 key += length 116 if key < 0: 117 raise IndexError("index out of range") 118 start = key 119 end = key + 1 120 step = 1 121 size = 1 122 byteStart = start // 4 123 byteEnd = (end + 3) // 4 124 byteSize = byteEnd - byteStart 125 stream = self.stream 126 try: 127 stream.seek(self.offset + byteStart) 128 except ValueError as exception: 129 if str(exception) == "seek of closed file": 130 raise ValueError("cannot retrieve sequence: file is closed") from None 131 raise 132 data = numpy.fromfile(stream, dtype="uint8", count=byteSize) 133 sequence = _twoBitIO.convert( 134 data, start, end, step, self.nBlocks, self.maskBlocks 135 ) 136 if isinstance(key, slice): 137 return sequence 138 else: # single nucleotide 139 return ord(sequence) 140 141 def __len__(self): 142 return self.length 143 144 def upper(self): 145 """Remove the sequence mask.""" 146 data = _TwoBitSequenceData(self.stream, self.offset, self.length) 147 data.nBlocks = self.nBlocks[:, :] 148 data.maskBlocks = numpy.empty((0, 2), dtype="uint32") 149 return data 150 151 def lower(self): 152 """Extend the sequence mask to the full sequence.""" 153 data = _TwoBitSequenceData(self.stream, self.offset, self.length) 154 data.nBlocks = self.nBlocks[:, :] 155 data.maskBlocks = numpy.array([[0, self.length]], dtype="uint32") 156 return data 157 158 159class TwoBitIterator(SequenceIterator): 160 """Parser for UCSC twoBit (.2bit) files.""" 161 162 def __init__(self, source): 163 """Read the file index.""" 164 super().__init__(source, mode="b", fmt="twoBit") 165 # wait to close the file until the TwoBitIterator goes out of scope: 166 self.should_close_stream = False 167 stream = self.stream 168 data = stream.read(4) 169 if not data: 170 raise ValueError("Empty file.") 171 byteorders = ("little", "big") 172 dtypes = ("<u4", ">u4") 173 for byteorder, dtype in zip(byteorders, dtypes): 174 signature = int.from_bytes(data, byteorder) 175 if signature == 0x1A412743: 176 break 177 else: 178 raise ValueError("Unknown signature") 179 self.byteorder = byteorder 180 data = stream.read(4) 181 version = int.from_bytes(data, byteorder, signed=False) 182 if version == 1: 183 raise ValueError( 184 "version-1 twoBit files with 64-bit offsets for index are currently not supported" 185 ) 186 if version != 0: 187 raise ValueError("Found unexpected file version %u; aborting" % version) 188 data = stream.read(4) 189 sequenceCount = int.from_bytes(data, byteorder, signed=False) 190 data = stream.read(4) 191 reserved = int.from_bytes(data, byteorder, signed=False) 192 if reserved != 0: 193 raise ValueError("Found non-zero reserved field; aborting") 194 sequences = {} 195 for i in range(sequenceCount): 196 data = stream.read(1) 197 nameSize = int.from_bytes(data, byteorder, signed=False) 198 data = stream.read(nameSize) 199 name = data.decode("ASCII") 200 data = stream.read(4) 201 offset = int.from_bytes(data, byteorder, signed=False) 202 sequences[name] = (stream, offset) 203 self.sequences = sequences 204 for name, (stream, offset) in sequences.items(): 205 stream.seek(offset) 206 data = stream.read(4) 207 dnaSize = int.from_bytes(data, byteorder, signed=False) 208 sequence = _TwoBitSequenceData(stream, offset, dnaSize) 209 data = stream.read(4) 210 nBlockCount = int.from_bytes(data, byteorder, signed=False) 211 nBlockStarts = numpy.fromfile(stream, dtype=dtype, count=nBlockCount) 212 nBlockSizes = numpy.fromfile(stream, dtype=dtype, count=nBlockCount) 213 sequence.nBlocks = numpy.empty((nBlockCount, 2), dtype="uint32") 214 sequence.nBlocks[:, 0] = nBlockStarts 215 sequence.nBlocks[:, 1] = nBlockStarts + nBlockSizes 216 data = stream.read(4) 217 maskBlockCount = int.from_bytes(data, byteorder, signed=False) 218 maskBlockStarts = numpy.fromfile(stream, dtype=dtype, count=maskBlockCount) 219 maskBlockSizes = numpy.fromfile(stream, dtype=dtype, count=maskBlockCount) 220 sequence.maskBlocks = numpy.empty((maskBlockCount, 2), dtype="uint32") 221 sequence.maskBlocks[:, 0] = maskBlockStarts 222 sequence.maskBlocks[:, 1] = maskBlockStarts + maskBlockSizes 223 data = stream.read(4) 224 reserved = int.from_bytes(data, byteorder, signed=False) 225 if reserved != 0: 226 raise ValueError("Found non-zero reserved field %u" % reserved) 227 sequence.offset = stream.tell() 228 sequences[name] = sequence 229 230 def parse(self, stream): 231 """Iterate over the sequences in the file.""" 232 for name, sequence in self.sequences.items(): 233 sequence = Seq(sequence) 234 record = SeqRecord(sequence, id=name) 235 yield record 236 237 def __getitem__(self, name): 238 try: 239 sequence = self.sequences[name] 240 except ValueError: 241 raise KeyError(name) from None 242 sequence = Seq(sequence) 243 return SeqRecord(sequence, id=name) 244 245 def keys(self): 246 """Return a list with the names of the sequences in the file.""" 247 return self.sequences.keys() 248 249 def __len__(self): 250 return len(self.sequences) 251