# Copyright 2009 Matt Chaput. All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
#    1. Redistributions of source code must retain the above copyright notice,
#       this list of conditions and the following disclaimer.
#
#    2. Redistributions in binary form must reproduce the above copyright
#       notice, this list of conditions and the following disclaimer in the
#       documentation and/or other materials provided with the distribution.
#
# THIS SOFTWARE IS PROVIDED BY MATT CHAPUT ``AS IS'' AND ANY EXPRESS OR
# IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
# MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO
# EVENT SHALL MATT CHAPUT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
# INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA,
# OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
# LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
# NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
# EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#
# The views and conclusions contained in the software and documentation are
# those of the authors and should not be interpreted as representing official
# policies, either expressed or implied, of Matt Chaput.

from array import array
from copy import copy
from struct import calcsize

from whoosh.compat import BytesIO, bytes_type
from whoosh.compat import dump as dump_pickle
from whoosh.compat import load as load_pickle
from whoosh.compat import array_frombytes, array_tobytes
from whoosh.system import _INT_SIZE, _SHORT_SIZE, _FLOAT_SIZE, _LONG_SIZE
from whoosh.system import IS_LITTLE
from whoosh.system import pack_byte, unpack_byte, pack_sbyte, unpack_sbyte
from whoosh.system import pack_ushort, unpack_ushort
from whoosh.system import pack_ushort_le, unpack_ushort_le
from whoosh.system import pack_int, unpack_int, pack_uint, unpack_uint
from whoosh.system import pack_uint_le, unpack_uint_le
from whoosh.system import pack_long, unpack_long, pack_ulong, unpack_ulong
from whoosh.system import pack_float, unpack_float
from whoosh.util.varints import varint, read_varint
from whoosh.util.varints import signed_varint, decode_signed_varint


# Size in bytes (as reported by struct.calcsize) of each supported typecode.
_SIZEMAP = dict((typecode, calcsize(typecode)) for typecode in "bBiIhHqQf")
# struct byte-order prefixes keyed by byte-order name.
_ORDERMAP = {"little": "<", "big": ">"}

# (name, struct typecode) pairs.
_types = (("sbyte", "b"), ("ushort", "H"), ("int", "i"),
          ("long", "q"), ("float", "f"))


# Main class

class StructFile(object):
    """Returns a "structured file" object that wraps the given file object and
    provides numerous additional methods for writing structured data, such as
    "write_varint" and "write_long".

    The ``read_*``/``write_*`` methods operate at the current file position;
    the ``get_*`` methods take an explicit position.
    """

    def __init__(self, fileobj, name=None, onclose=None):
        """
        :param fileobj: the file-like object to wrap.
        :param name: an optional name, used by ``repr()`` and ``str()``.
        :param onclose: an optional callable; ``close()`` calls it with this
            object before closing the wrapped file.
        """

        self.file = fileobj
        self._name = name
        self.onclose = onclose
        self.is_closed = False

        # A file object exposing fileno() is treated as "real"; array I/O
        # then uses array.tofile()/array.fromfile() directly on it.
        self.is_real = hasattr(fileobj, "fileno")
        if self.is_real:
            self.fileno = fileobj.fileno

    def __repr__(self):
        return "%s(%r)" % (self.__class__.__name__, self._name)

    def __str__(self):
        # The name defaults to None, and __str__ must return a string.
        return str(self._name)

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        # close() raises on a second call, so only close here if the body of
        # the "with" block has not already done it.
        if not self.is_closed:
            self.close()

    def __iter__(self):
        return iter(self.file)

    def raw_file(self):
        """Returns the wrapped file object.
        """

        return self.file

    # Thin pass-throughs to the wrapped file object

    def read(self, *args, **kwargs):
        return self.file.read(*args, **kwargs)

    def readline(self, *args, **kwargs):
        return self.file.readline(*args, **kwargs)

    def write(self, *args, **kwargs):
        return self.file.write(*args, **kwargs)

    def tell(self, *args, **kwargs):
        return self.file.tell(*args, **kwargs)

    def seek(self, *args, **kwargs):
        return self.file.seek(*args, **kwargs)

    def truncate(self, *args, **kwargs):
        return self.file.truncate(*args, **kwargs)

    def flush(self):
        """Flushes the buffer of the wrapped file. This is a no-op if the
        wrapped file does not have a flush method.
        """

        if hasattr(self.file, "flush"):
            self.file.flush()

    def close(self):
        """Closes the wrapped file, first calling the ``onclose`` callback (if
        one was given) with this object.

        :raises Exception: if this object has already been closed.
        """

        if self.is_closed:
            raise Exception("This file is already closed")
        try:
            if self.onclose:
                self.onclose(self)
        finally:
            # Release the wrapped file even if the callback raised, so a
            # failing callback cannot leak the underlying file.
            if hasattr(self.file, "close"):
                self.file.close()
            self.is_closed = True

    def subset(self, offset, length, name=None):
        """Returns a new StructFile wrapping a ``SubFile`` built from the
        wrapped file, ``offset`` and ``length`` (presumably a view of that
        byte range -- see whoosh.filedb.compound.SubFile).

        :param name: name for the new object; defaults to this file's name.
        """

        # Imported here rather than at module level (presumably to avoid a
        # circular import -- TODO confirm).
        from whoosh.filedb.compound import SubFile

        name = name or self._name
        return StructFile(SubFile(self.file, offset, length), name=name)

    # Length-prefixed strings

    def write_string(self, s):
        """Writes a string to the wrapped file. This method writes the length
        of the string first, so you can read the string back without having to
        know how long it was.
        """

        self.write_varint(len(s))
        self.write(s)

    def write_string2(self, s):
        """Writes a string prefixed with its length as an unsigned short.
        """

        self.write(pack_ushort(len(s)) + s)

    def write_string4(self, s):
        """Writes a string prefixed with its length as a signed int.
        """

        self.write(pack_int(len(s)) + s)

    def read_string(self):
        """Reads a string from the wrapped file.
        """

        return self.read(self.read_varint())

    def read_string2(self):
        """Reads a string written by ``write_string2``.
        """

        length = self.read_ushort()
        return self.read(length)

    def read_string4(self):
        """Reads a string written by ``write_string4``.
        """

        length = self.read_int()
        return self.read(length)

    def get_string2(self, pos):
        """Reads an unsigned-short-prefixed string starting at ``pos``.

        :returns: a tuple of (string, position just past the string).
        """

        length = self.get_ushort(pos)
        base = pos + _SHORT_SIZE
        return self.get(base, length), base + length

    def get_string4(self, pos):
        """Reads an int-prefixed string starting at ``pos``.

        :returns: a tuple of (string, position just past the string).
        """

        length = self.get_int(pos)
        base = pos + _INT_SIZE
        return self.get(base, length), base + length

    def skip_string(self):
        """Skips over a varint-prefixed string at the current position.
        """

        length = self.read_varint()
        # whence=1: seek relative to the current position
        self.seek(length, 1)

    # Variable-length integers

    def write_varint(self, i):
        """Writes a variable-length unsigned integer to the wrapped file.
        """

        self.write(varint(i))

    def write_svarint(self, i):
        """Writes a variable-length signed integer to the wrapped file.
        """

        self.write(signed_varint(i))

    def read_varint(self):
        """Reads a variable-length encoded unsigned integer from the wrapped
        file.
        """

        return read_varint(self.read)

    def read_svarint(self):
        """Reads a variable-length encoded signed integer from the wrapped
        file.
        """

        return decode_signed_varint(read_varint(self.read))

    def write_tagint(self, i):
        """Writes a sometimes-compressed unsigned integer to the wrapped file.
        This is similar to the varint methods but uses a less compressed but
        faster format.
        """

        # Store numbers 0-253 in one byte. Byte 254 means "an unsigned 16-bit
        # int follows." Byte 255 means "An unsigned 32-bit int follows."
        #
        # The tag byte is built with pack_byte rather than chr() or a text
        # literal so it is always a byte string that can be concatenated with
        # the packed value and written to a binary file.
        if i <= 253:
            self.write(pack_byte(i))
        elif i <= 65535:
            self.write(pack_byte(254) + pack_ushort(i))
        else:
            self.write(pack_byte(255) + pack_uint(i))

    def read_tagint(self):
        """Reads a sometimes-compressed unsigned integer from the wrapped file.
        This is similar to the varint methods but uses a less compressed but
        faster format.
        """

        tb = ord(self.read(1))
        if tb == 254:
            return self.read_ushort()
        elif tb == 255:
            return self.read_uint()
        else:
            return tb

    # Fixed-size values at the current position

    def write_byte(self, n):
        """Writes a single byte with the value ``n`` to the wrapped file.
        """

        self.write(pack_byte(n))

    def read_byte(self):
        """Reads a single byte and returns its integer value.
        """

        return ord(self.read(1))

    def write_pickle(self, obj, protocol=-1):
        """Writes a pickled representation of obj to the wrapped file.
        """

        dump_pickle(obj, self.file, protocol)

    def read_pickle(self):
        """Reads a pickled object from the wrapped file.

        NOTE: unpickling can execute arbitrary code; only use this on files
        from a trusted source.
        """

        return load_pickle(self.file)

    def write_sbyte(self, n):
        self.write(pack_sbyte(n))

    def write_int(self, n):
        self.write(pack_int(n))

    def write_uint(self, n):
        self.write(pack_uint(n))

    def write_uint_le(self, n):
        self.write(pack_uint_le(n))

    def write_ushort(self, n):
        self.write(pack_ushort(n))

    def write_ushort_le(self, n):
        self.write(pack_ushort_le(n))

    def write_long(self, n):
        self.write(pack_long(n))

    def write_ulong(self, n):
        self.write(pack_ulong(n))

    def write_float(self, n):
        self.write(pack_float(n))

    def write_array(self, arry):
        """Writes the contents of an ``array.array`` to the wrapped file.

        When ``IS_LITTLE`` is set the values are byte-swapped first. The swap
        is done on a copy, so the caller's array is left untouched.
        """

        if IS_LITTLE:
            arry = copy(arry)
            arry.byteswap()
        if self.is_real:
            arry.tofile(self.file)
        else:
            self.write(array_tobytes(arry))

    def read_sbyte(self):
        return unpack_sbyte(self.read(1))[0]

    def read_int(self):
        return unpack_int(self.read(_INT_SIZE))[0]

    def read_uint(self):
        return unpack_uint(self.read(_INT_SIZE))[0]

    def read_uint_le(self):
        return unpack_uint_le(self.read(_INT_SIZE))[0]

    def read_ushort(self):
        return unpack_ushort(self.read(_SHORT_SIZE))[0]

    def read_ushort_le(self):
        return unpack_ushort_le(self.read(_SHORT_SIZE))[0]

    def read_long(self):
        return unpack_long(self.read(_LONG_SIZE))[0]

    def read_ulong(self):
        return unpack_ulong(self.read(_LONG_SIZE))[0]

    def read_float(self):
        return unpack_float(self.read(_FLOAT_SIZE))[0]

    def read_array(self, typecode, length):
        """Reads ``length`` items of the given typecode from the current
        position and returns them as an ``array.array``.

        When ``IS_LITTLE`` is set the values are byte-swapped after reading,
        mirroring ``write_array``.
        """

        a = array(typecode)
        if self.is_real:
            a.fromfile(self.file, length)
        else:
            array_frombytes(a, self.read(length * _SIZEMAP[typecode]))
        if IS_LITTLE:
            a.byteswap()
        return a

    # Fixed-size values at an explicit position

    def get(self, position, length):
        """Seeks to ``position`` and returns the next ``length`` bytes.
        """

        self.seek(position)
        return self.read(length)

    def get_byte(self, position):
        return unpack_byte(self.get(position, 1))[0]

    def get_sbyte(self, position):
        return unpack_sbyte(self.get(position, 1))[0]

    def get_int(self, position):
        return unpack_int(self.get(position, _INT_SIZE))[0]

    def get_uint(self, position):
        return unpack_uint(self.get(position, _INT_SIZE))[0]

    def get_ushort(self, position):
        return unpack_ushort(self.get(position, _SHORT_SIZE))[0]

    def get_long(self, position):
        return unpack_long(self.get(position, _LONG_SIZE))[0]

    def get_ulong(self, position):
        return unpack_ulong(self.get(position, _LONG_SIZE))[0]

    def get_float(self, position):
        return unpack_float(self.get(position, _FLOAT_SIZE))[0]

    def get_array(self, position, typecode, length):
        """Seeks to ``position`` and reads an array as ``read_array`` does.
        """

        self.seek(position)
        return self.read_array(typecode, length)


class BufferFile(StructFile):
    """A StructFile over an in-memory buffer.

    Sequential reads go through a ``BytesIO`` built from the buffer, while the
    positional ``get*`` methods slice the buffer directly and do not move the
    file position.
    """

    def __init__(self, buf, name=None, onclose=None):
        """
        :param buf: the buffer to read from.
        :param name: an optional name, used by ``repr()`` and ``str()``.
        :param onclose: an optional callable; ``close()`` calls it with this
            object before closing the wrapped ``BytesIO``.
        """

        self._buf = buf
        self._name = name
        self.file = BytesIO(buf)
        self.onclose = onclose

        self.is_real = False
        self.is_closed = False

    def subset(self, position, length, name=None):
        """Returns a new BufferFile over a copy of ``length`` bytes of the
        buffer starting at ``position``.
        """

        name = name or self._name
        return BufferFile(self.get(position, length), name=name)

    def get(self, position, length):
        """Returns ``length`` bytes of the buffer starting at ``position``.
        """

        return bytes_type(self._buf[position:position + length])

    def get_array(self, position, typecode, length):
        """Returns ``length`` items of the given typecode, decoded from the
        buffer starting at ``position``, as an ``array.array``.
        """

        a = array(typecode)
        array_frombytes(a, self.get(position, length * _SIZEMAP[typecode]))
        if IS_LITTLE:
            a.byteswap()
        return a


class ChecksumFile(StructFile):
    """A StructFile that keeps a running CRC32 of every byte string passing
    through ``read()``, ``write()`` and line iteration. Seeking is not
    allowed.

    NOTE(review): ``write_array``/``read_array`` on a real file, and the
    pickle methods, hand the wrapped file object directly to other code and
    so bypass ``read()``/``write()``; data moved that way is not included in
    the checksum.
    """

    def __init__(self, *args, **kwargs):
        StructFile.__init__(self, *args, **kwargs)
        self._check = 0
        # Imported lazily so zlib is only needed when a ChecksumFile is made.
        from zlib import crc32
        self._crc32 = crc32

    def __iter__(self):
        for line in self.file:
            self._check = self._crc32(line, self._check)
            yield line

    def seek(self, *args):
        raise Exception("Cannot seek on a ChecksumFile")

    def read(self, *args, **kwargs):
        b = self.file.read(*args, **kwargs)
        self._check = self._crc32(b, self._check)
        return b

    def write(self, b):
        self._check = self._crc32(b, self._check)
        # Return the wrapped file's result, as StructFile.write does.
        return self.file.write(b)

    def checksum(self):
        """Returns the running checksum as an unsigned 32-bit integer.
        """

        # Mask so the result is non-negative where crc32 returns a signed int.
        return self._check & 0xffffffff