'''
Support for reading LIT files.

The module parses the LIT container (headers, directory, manifest, DRM
information), undoes the DES/LZX section transforms and converts the binary
markup stored inside the container back into textual OPF/HTML.
'''

__license__   = 'GPL v3'
__copyright__ = '2008, Kovid Goyal <kovid at kovidgoyal.net> ' \
    'and Marshall T. Vandegrift <llasram@gmail.com>'

import io, struct, os, functools, re

from lxml import etree

from calibre.ebooks.lit import LitError
from calibre.ebooks.lit.maps import OPF_MAP, HTML_MAP
import calibre.ebooks.lit.mssha1 as mssha1
from calibre.ebooks.oeb.base import urlnormalize, xpath
from calibre.ebooks.oeb.reader import OEBReader
from calibre.ebooks import DRMError
from polyglot.builtins import codepoint_to_chr, string_or_bytes, itervalues
from polyglot.urllib import unquote as urlunquote, urldefrag
from calibre_extensions import lzx, msdes

__all__ = ["LitReader"]

XML_DECL = """<?xml version="1.0" encoding="UTF-8" ?>
"""
OPF_DECL = """<?xml version="1.0" encoding="UTF-8" ?>
<!DOCTYPE package
 PUBLIC "+//ISBN 0-9673008-1-9//DTD OEB 1.0.1 Package//EN"
 "http://openebook.org/dtds/oeb-1.0.1/oebpkg101.dtd">
"""
HTML_DECL = """<?xml version="1.0" encoding="UTF-8" ?>
<!DOCTYPE html PUBLIC
 "+//ISBN 0-9673008-1-9//DTD OEB 1.0.1 Document//EN"
 "http://openebook.org/dtds/oeb-1.0.1/oebdoc101.dtd">
"""

# GUIDs naming the section transforms handled by get_section_uncached()
DESENCRYPT_GUID = "{67F6E4A2-60BF-11D3-8540-00C04F58C3CF}"
LZXCOMPRESS_GUID = "{0A9007C6-4076-11D3-8789-0000F8105754}"

# Byte offsets into the ControlData / ResetTable records used by decompress()
CONTROL_TAG = 4
CONTROL_WINDOW_SIZE = 12
RESET_NENTRIES = 4
RESET_HDRLEN = 12
RESET_UCLENGTH = 16
RESET_INTERVAL = 32

# Bits of the flags character that precedes every tag in the binary markup
FLAG_OPENING = (1 << 0)
FLAG_CLOSING = (1 << 1)
FLAG_BLOCK   = (1 << 2)
FLAG_HEAD    = (1 << 3)
FLAG_ATOM    = (1 << 4)


def u32(bytes):
    '''Return the first four bytes of `bytes` as a little-endian unsigned int.'''
    return struct.unpack('<L', bytes[:4])[0]


def u16(bytes):
    '''Return the first two bytes of `bytes` as a little-endian unsigned int.'''
    return struct.unpack('<H', bytes[:2])[0]


def int32(bytes):
    '''Return the first four bytes of `bytes` as a little-endian signed int.'''
    return struct.unpack('<l', bytes[:4])[0]


def encint(byts, remaining):
    '''
    Decode a variable length integer stored seven bits per byte, most
    significant group first; a clear high bit marks the last byte.

    At most `remaining` bytes are consumed.

    :return: ``(value, unconsumed bytes, updated remaining count)``
    '''
    pos, val = 0, 0
    ba = bytearray(byts)
    while remaining > 0:
        b = ba[pos]
        pos += 1
        remaining -= 1
        val <<= 7
        val |= (b & 0x7f)
        if b & 0x80 == 0:
            break
    return val, byts[pos:], remaining


def msguid(bytes):
    '''Format the first 16 bytes of `bytes` as a braced, upper-case GUID.'''
    values = struct.unpack("<LHHBBBBBBBB", bytes[:16])
    return "{%08lX-%04X-%04X-%02X%02X-%02X%02X%02X%02X%02X%02X}" % values


def read_utf8_char(bytes, pos):
    '''
    Decode the UTF-8 sequence starting at `pos` in `bytes`.

    :return: ``(character, position just past the sequence)``
    :raises LitError: if the lead byte, the sequence length or a continuation
        byte is invalid
    '''
    c = ord(bytes[pos:pos+1])
    mask = 0x80
    if (c & mask):
        # Count the leading one bits of the lead byte: that is the length of
        # the sequence, and mask ends up on the first zero bit.
        elsize = 0
        while c & mask:
            mask >>= 1
            elsize += 1
        if (mask <= 1) or (mask == 0x40):
            raise LitError('Invalid UTF8 character: %s' % repr(bytes[pos]))
    else:
        elsize = 1
    if elsize > 1:
        if elsize + pos > len(bytes):
            raise LitError('Invalid UTF8 character: %s' % repr(bytes[pos]))
        c &= (mask - 1)
        for i in range(1, elsize):
            b = ord(bytes[pos+i:pos+i+1])
            if (b & 0xC0) != 0x80:
                raise LitError(
                    'Invalid UTF8 character: %s' % repr(bytes[pos:pos+i]))
            c = (c << 6) | (b & 0x3F)
    return codepoint_to_chr(c), pos+elsize


def consume_sized_utf8_string(bytes, zpad=False):
    '''
    Read a string whose length, in characters, is stored as a leading UTF-8
    encoded character.

    :param zpad: if True, also swallow one NUL byte following the string
    :return: ``(decoded string, unconsumed bytes)``
    '''
    result = []
    slen, pos = read_utf8_char(bytes, 0)
    for i in range(ord(slen)):
        char, pos = read_utf8_char(bytes, pos)
        result.append(char)
    if zpad and bytes[pos:pos+1] == b'\0':
        pos += 1
    return ''.join(result), bytes[pos:]


def encode(string):
    '''Return `string` as ASCII bytes, using numeric character references for
    anything outside ASCII.'''
    return str(string).encode('ascii', 'xmlcharrefreplace')


class UnBinary:
    '''
    Convert the binary markup of a LIT content or meta stream into XML text.

    The conversion runs in the constructor; the result is available through
    :attr:`binary_representation` and :attr:`unicode_representation`.
    '''

    # An ampersand that does not start a character or entity reference.
    # The hyphen is placed last in the class so it is a literal hyphen rather
    # than forming a range.
    AMPERSAND_RE = re.compile(
        br'&(?!(?:#[0-9]+|#x[0-9a-fA-F]+|[a-zA-Z_:][a-zA-Z0-9._:-]+);)')
    # Literal angle brackets are doubled by binary_to_text_inner(); these
    # patterns find the doubled forms that are not part of comment delimiters.
    OPEN_ANGLE_RE = re.compile(br'<<(?![!]--)')
    CLOSE_ANGLE_RE = re.compile(br'(?<!--)>>(?=>>|[^>])')
    DOUBLE_ANGLE_RE = re.compile(br'([<>])\1')
    EMPTY_ATOMS = ({},{})

    def __init__(self, bin, path, manifest=None, map=HTML_MAP, atoms=EMPTY_ATOMS):
        '''
        :param bin: the binary markup to convert
        :param path: path of the document being converted; its directory is
            used to make href/src values relative
        :param manifest: mapping of internal ids to objects with a ``path``
            attribute (defaults to an empty mapping)
        :param map: ``(tag_map, attr_map, tag_to_attr_map)`` lookup tables
        :param atoms: ``(tag_atoms, attr_atoms)`` per-document name tables
        '''
        # A fresh dict per instance; a literal {} default would be shared.
        self.manifest = {} if manifest is None else manifest
        self.tag_map, self.attr_map, self.tag_to_attr_map = map
        self.is_html = map is HTML_MAP
        tag_atoms, attr_atoms = atoms
        self.tag_atoms = self._text_atoms(tag_atoms)
        self.attr_atoms = self._text_atoms(attr_atoms)
        self.dir = os.path.dirname(path)
        buf = io.BytesIO()
        self.binary_to_text(bin, buf)
        self.raw = buf.getvalue().lstrip()
        self.escape_reserved()
        self._tree = None

    @staticmethod
    def _text_atoms(atoms):
        '''Return `atoms` with any bytes names decoded to str.'''
        # Atom names are joined with str and tested with str methods during
        # conversion, so bytes names cannot be used as they are.
        # NOTE(review): the names are assumed to be UTF-8 -- confirm.
        if not atoms:
            return atoms
        return {
            key: (name.decode('utf-8', 'replace')
                  if isinstance(name, bytes) else name)
            for key, name in atoms.items()}

    def escape_reserved(self):
        '''Escape stray ampersands and the doubled angle brackets that stand
        for literal ``<`` / ``>``, then un-double whatever doubled brackets
        are left.'''
        raw = self.raw
        raw = self.AMPERSAND_RE.sub(br'&amp;', raw)
        raw = self.OPEN_ANGLE_RE.sub(br'&lt;', raw)
        raw = self.CLOSE_ANGLE_RE.sub(br'&gt;', raw)
        raw = self.DOUBLE_ANGLE_RE.sub(br'\1', raw)
        self.raw = raw

    def item_path(self, internal_id):
        '''
        Return the path of the manifest item `internal_id` relative to the
        directory of the current document, or `internal_id` itself if it is
        not in the manifest.
        '''
        try:
            target = self.manifest[internal_id].path
        except KeyError:
            return internal_id
        if not self.dir:
            return target
        target = target.split('/')
        base = self.dir.split('/')
        # Find the first component at which the two paths diverge
        for index in range(min(len(base), len(target))):
            if base[index] != target[index]:
                break
        else:
            index += 1
        relpath = (['..'] * (len(base) - index)) + target[index:]
        return '/'.join(relpath)

    @property
    def binary_representation(self):
        '''The converted markup as bytes.'''
        return self.raw

    @property
    def unicode_representation(self):
        '''The converted markup decoded as UTF-8.'''
        return self.raw.decode('utf-8')

    def __unicode__(self):
        return self.unicode_representation

    def __str__(self):
        return self.unicode_representation

    def binary_to_text(self, bin, buf):
        '''Convert `bin` and write the resulting markup to `buf`.'''
        # Frame layout: (depth, tag_name, current_map, dynamic_tag, errors,
        #                in_censorship, is_goingdown, state, flags)
        stack = [(0, None, None, 0, 0, False, False, 'text', 0)]
        self.cpos = 0
        while stack:
            self.binary_to_text_inner(bin, buf, stack)
        del self.cpos

    def binary_to_text_inner(self, bin, buf, stack):
        '''
        Resume the frame on top of `stack` and run the conversion state
        machine from ``self.cpos``.

        The method returns when the input is exhausted, when a closing tag
        ends the current element, or after pushing frames for a newly opened
        element (a 'close tag' frame for it and a 'text' frame for its
        content).

        :raises LitError: on malformed input
        '''
        (depth, tag_name, current_map, dynamic_tag, errors,
                in_censorship, is_goingdown, state, flags) = stack.pop()

        if state == 'close tag':
            if not tag_name:
                raise LitError('Tag ends before it begins.')
            buf.write(encode(''.join(('</', tag_name, '>'))))
            dynamic_tag = 0
            tag_name = None
            state = 'text'

        while self.cpos < len(bin):
            c, self.cpos = read_utf8_char(bin, self.cpos)
            oc = ord(c)

            if state == 'text':
                if oc == 0:
                    state = 'get flags'
                    continue
                elif c == '\v':
                    c = '\n'
                elif c == '>':
                    # Doubled so that escape_reserved() can tell literal
                    # brackets from markup
                    c = '>>'
                elif c == '<':
                    c = '<<'
                buf.write(encode(c))

            elif state == 'get flags':
                if oc == 0:
                    state = 'text'
                    continue
                flags = oc
                state = 'get tag'

            elif state == 'get tag':
                state = 'text' if oc == 0 else 'get attr'
                if flags & FLAG_OPENING:
                    tag = oc
                    buf.write(b'<')
                    if not (flags & FLAG_CLOSING):
                        is_goingdown = True
                    if tag == 0x8000:
                        state = 'get custom length'
                        continue
                    if flags & FLAG_ATOM:
                        if not self.tag_atoms or tag not in self.tag_atoms:
                            raise LitError(
                                "atom tag %d not in atom tag list" % tag)
                        tag_name = self.tag_atoms[tag]
                        current_map = self.attr_atoms
                    elif tag < len(self.tag_map):
                        tag_name = self.tag_map[tag]
                        current_map = self.tag_to_attr_map[tag]
                    else:
                        dynamic_tag += 1
                        errors += 1
                        tag_name = '?'+codepoint_to_chr(tag)+'?'
                        # NOTE(review): tag is beyond tag_map here; verify
                        # that tag_to_attr_map can be indexed with it.
                        current_map = self.tag_to_attr_map[tag]
                        print('WARNING: tag %s unknown' % codepoint_to_chr(tag))
                    buf.write(encode(tag_name))
                elif flags & FLAG_CLOSING:
                    if depth == 0:
                        raise LitError('Extra closing tag %s at %d'%(tag_name,
                            self.cpos))
                    break

            elif state == 'get attr':
                in_censorship = False
                if oc == 0:
                    # End of the attribute list
                    state = 'text'
                    if not is_goingdown:
                        tag_name = None
                        dynamic_tag = 0
                        buf.write(b' />')
                    else:
                        buf.write(b'>')
                        frame = (depth, tag_name, current_map,
                            dynamic_tag, errors, in_censorship, False,
                            'close tag', flags)
                        stack.append(frame)
                        frame = (depth+1, None, None, 0, 0,
                                False, False, 'text', 0)
                        stack.append(frame)
                        break
                else:
                    if oc == 0x8000:
                        state = 'get attr length'
                        continue
                    attr = None
                    if current_map and oc in current_map and current_map[oc]:
                        attr = current_map[oc]
                    elif oc in self.attr_map:
                        attr = self.attr_map[oc]
                    if not attr or not isinstance(attr, string_or_bytes):
                        raise LitError(
                            'Unknown attribute %d in tag %s' % (oc, tag_name))
                    if attr.startswith('%'):
                        # The value is still consumed, but nothing is written
                        in_censorship = True
                        state = 'get value length'
                        continue
                    buf.write(b' ' + encode(attr) + b'=')
                    if attr in ['href', 'src']:
                        state = 'get href length'
                    else:
                        state = 'get value length'

            elif state == 'get value length':
                if not in_censorship:
                    buf.write(b'"')
                count = oc - 1
                if count == 0:
                    if not in_censorship:
                        buf.write(b'"')
                    in_censorship = False
                    state = 'get attr'
                    continue
                state = 'get value'
                if oc == 0xffff:
                    # The value is the number held by the next character
                    continue
                if count < 0 or count > (len(bin) - self.cpos):
                    raise LitError('Invalid character count %d' % count)

            elif state == 'get value':
                if count == 0xfffe:
                    if not in_censorship:
                        buf.write(encode('%s"' % (oc - 1)))
                    in_censorship = False
                    state = 'get attr'
                elif count > 0:
                    if not in_censorship:
                        # The value is written inside double quotes, so quotes
                        # and opening brackets must be escaped
                        if c == '"':
                            c = '&quot;'
                        elif c == '<':
                            c = '&lt;'
                        if isinstance(c, str):
                            c = c.encode('ascii', 'xmlcharrefreplace')
                        buf.write(c)
                    count -= 1
                if count == 0:
                    if not in_censorship:
                        buf.write(b'"')
                    in_censorship = False
                    state = 'get attr'

            elif state == 'get custom length':
                count = oc - 1
                if count <= 0 or count > len(bin)-self.cpos:
                    raise LitError('Invalid character count %d' % count)
                dynamic_tag += 1
                state = 'get custom'
                tag_name = ''

            elif state == 'get custom':
                tag_name += c
                count -= 1
                if count == 0:
                    buf.write(encode(tag_name))
                    state = 'get attr'

            elif state == 'get attr length':
                count = oc - 1
                if count <= 0 or count > (len(bin) - self.cpos):
                    raise LitError('Invalid character count %d' % count)
                buf.write(b' ')
                state = 'get custom attr'

            elif state == 'get custom attr':
                buf.write(encode(c))
                count -= 1
                if count == 0:
                    buf.write(b'=')
                    state = 'get value length'

            elif state == 'get href length':
                count = oc - 1
                if count <= 0 or count > (len(bin) - self.cpos):
                    raise LitError('Invalid character count %d' % count)
                href = ''
                state = 'get href'

            elif state == 'get href':
                href += c
                count -= 1
                if count == 0:
                    # The first character of the stored value is dropped; the
                    # rest is an internal id plus an optional fragment
                    doc, frag = urldefrag(href[1:])
                    path = self.item_path(doc)
                    if frag:
                        path = '#'.join((path, frag))
                    path = urlnormalize(path)
                    buf.write(encode('"%s"' % path))
                    state = 'get attr'


class DirectoryEntry:
    '''A named entry of the LIT directory: the section holding the data and
    the offset and size of the data within that section.'''

    def __init__(self, name, section, offset, size):
        self.name = name
        self.section = section
        self.offset = offset
        self.size = size

    def __repr__(self):
        return "DirectoryEntry(name=%s, section=%d, offset=%d, size=%d)" \
            % (repr(self.name), self.section, self.offset, self.size)

    def __str__(self):
        return repr(self)


class ManifestItem:
    '''
    A file listed in the LIT manifest.

    ``path`` is derived from ``original``: backslashes become slashes, a
    leading drive letter is dropped, the path is normalized and leading
    ``../`` components are removed.
    '''

    def __init__(self, original, internal, mime_type, offset, root, state):
        self.original = original
        self.internal = internal
        self.mime_type = mime_type.lower() if hasattr(mime_type, 'lower') else mime_type
        self.offset = offset
        self.root = root
        self.state = state
        # Some LIT files have Windows-style paths
        path = original.replace('\\', '/')
        if path[1:3] == ':/':
            path = path[2:]
        # Some paths in Fictionwise "multiformat" LIT files contain '..' (!?)
        path = os.path.normpath(path).replace('\\', '/')
        while path.startswith('../'):
            path = path[3:]
        self.path = path

    def __eq__(self, other):
        # Items compare equal to other items and to bare internal ids
        if hasattr(other, 'internal'):
            return self.internal == other.internal
        return self.internal == other

    def __hash__(self):
        # Defining __eq__ alone would make instances unhashable; hash on the
        # same key that equality uses.
        return hash(self.internal)

    def __repr__(self):
        return "ManifestItem(internal=%r, path=%r, mime_type=%r, " \
            "offset=%d, root=%r, state=%r)" \
            % (self.internal, self.path, self.mime_type, self.offset,
               self.root, self.state)


def preserve(function):
    '''Decorator for methods of objects with a ``stream`` attribute: restore
    the stream position after the call, even if the call raises.'''
    def wrapper(self, *args, **kwargs):
        opos = self.stream.tell()
        try:
            return function(self, *args, **kwargs)
        finally:
            self.stream.seek(opos)
    functools.update_wrapper(wrapper, function)
    return wrapper


class LitFile:
    '''
    Parser for the LIT container format.

    Construction validates the file and reads the headers, the directory,
    the section names, the manifest and the DRM information.
    '''

    PIECE_SIZE = 16

    def __init__(self, filename_or_stream, log):
        '''
        :param filename_or_stream: a path, or an object with a ``read`` method
        :param log: object whose ``warn`` method receives warning messages
        :raises LitError: if the file is not a LIT file that can be parsed
        :raises DRMError: if the book has DRM that cannot be removed
        '''
        self._warn = log.warn
        opened_here = not hasattr(filename_or_stream, 'read')
        if opened_here:
            self.stream = open(filename_or_stream, 'rb')
        else:
            self.stream = filename_or_stream
        try:
            try:
                self.opf_path = os.path.splitext(
                    os.path.basename(self.stream.name))[0] + '.opf'
            except (AttributeError, TypeError):
                # The stream has no name, or a name that is not a path
                self.opf_path = 'content.opf'
            if self.magic != b'ITOLITLS':
                raise LitError('Not a valid LIT file')
            if self.version != 1:
                raise LitError('Unknown LIT version %d' % (self.version,))
            self.read_secondary_header()
            self.read_header_pieces()
            self.read_section_names()
            self.read_manifest()
            self.read_drm()
        except BaseException:
            # Do not leak a file we opened ourselves; a stream supplied by
            # the caller remains the caller's responsibility.
            if opened_here:
                self.stream.close()
            raise

    def warn(self, msg):
        '''Send `msg` to the log supplied at construction.'''
        self._warn(msg)

    # The properties below read fixed fields of the primary header. Each one
    # leaves the stream position as it found it.

    @property
    @preserve
    def magic(self):
        '''The 8 byte signature at the start of the file.'''
        self.stream.seek(0)
        return self.stream.read(8)

    @property
    @preserve
    def version(self):
        '''The unsigned 32 bit value at offset 8.'''
        self.stream.seek(8)
        return u32(self.stream.read(4))

    @property
    @preserve
    def hdr_len(self):
        '''The signed 32 bit value at offset 12.'''
        self.stream.seek(12)
        return int32(self.stream.read(4))

    @property
    @preserve
    def num_pieces(self):
        '''The signed 32 bit value at offset 16.'''
        self.stream.seek(16)
        return int32(self.stream.read(4))

    @property
    @preserve
    def sec_hdr_len(self):
        '''The signed 32 bit value at offset 20.'''
        self.stream.seek(20)
        return int32(self.stream.read(4))

    @property
    @preserve
    def guid(self):
        '''The 16 bytes at offset 24.'''
        self.stream.seek(24)
        return self.stream.read(16)

    @property
    @preserve
    def header(self):
        '''The primary header, the piece table and the secondary header as
        one byte string.'''
        size = self.hdr_len \
            + (self.num_pieces * self.PIECE_SIZE) \
            + self.sec_hdr_len
        self.stream.seek(0)
        return self.stream.read(size)

    @preserve
    def __len__(self):
        self.stream.seek(0, 2)
        return self.stream.tell()

    @preserve
    def read_raw(self, offset, size):
        '''Return `size` bytes starting at absolute file offset `offset`.'''
        self.stream.seek(offset)
        return self.stream.read(size)

    def read_content(self, offset, size):
        '''Return `size` bytes starting `offset` bytes into the content area.'''
        return self.read_raw(self.content_offset + offset, size)

    def read_secondary_header(self):
        '''
        Parse the CAOL and ITSF blocks of the secondary header.

        :raises LitError: on an unsupported block version, a 64 bit content
            offset, or if no content offset is found
        '''
        offset = self.hdr_len + (self.num_pieces * self.PIECE_SIZE)
        byts = self.read_raw(offset, self.sec_hdr_len)
        offset = int32(byts[4:])
        while offset < len(byts):
            blocktype = byts[offset:offset+4]
            blockver  = u32(byts[offset+4:])
            if blocktype == b'CAOL':
                if blockver != 2:
                    raise LitError(
                        'Unknown CAOL block format %d' % blockver)
                self.creator_id     = u32(byts[offset+12:])
                self.entry_chunklen = u32(byts[offset+20:])
                self.count_chunklen = u32(byts[offset+24:])
                self.entry_unknown  = u32(byts[offset+28:])
                self.count_unknown  = u32(byts[offset+32:])
                offset += 48
            elif blocktype == b'ITSF':
                if blockver != 4:
                    raise LitError(
                        'Unknown ITSF block format %d' % blockver)
                if u32(byts[offset+4+16:]):
                    raise LitError('This file has a 64bit content offset')
                self.content_offset = u32(byts[offset+16:])
                self.timestamp      = u32(byts[offset+24:])
                self.language_id    = u32(byts[offset+28:])
                offset += 48
            else:
                # The size of an unknown block is not known, so it cannot be
                # skipped; stop scanning instead of looping forever on it.
                self.warn(
                    'Unknown secondary header block %r' % (blocktype,))
                break
        if not hasattr(self, 'content_offset'):
            raise LitError('Could not figure out the content offset')

    def read_header_pieces(self):
        '''
        Walk the piece table: check pieces 1 and 2 against the secondary
        header, read the directory from piece 1 and keep pieces 3 and 4.

        :raises LitError: if a piece has 64 bit values or does not match the
            secondary header
        '''
        src = self.header[self.hdr_len:]
        for i in range(self.num_pieces):
            piece = src[i * self.PIECE_SIZE:(i + 1) * self.PIECE_SIZE]
            if u32(piece[4:]) != 0 or u32(piece[12:]) != 0:
                raise LitError('Piece %s has 64bit value' % repr(piece))
            offset, size = u32(piece), int32(piece[8:])
            piece = self.read_raw(offset, size)
            if i == 0:
                continue  # Dont need this piece
            elif i == 1:
                if u32(piece[8:])  != self.entry_chunklen or \
                   u32(piece[12:]) != self.entry_unknown:
                    raise LitError('Secondary header does not match piece')
                self.read_directory(piece)
            elif i == 2:
                if u32(piece[8:])  != self.count_chunklen or \
                   u32(piece[12:]) != self.count_unknown:
                    raise LitError('Secondary header does not match piece')
                continue  # No data needed from this piece
            elif i == 3:
                self.piece3_guid = piece
            elif i == 4:
                self.piece4_guid = piece

    def read_directory(self, piece):
        '''
        Parse the IFCM directory in `piece` into ``self.entries``, a dict
        mapping entry names to :class:`DirectoryEntry` objects.

        :raises LitError: if the directory is malformed
        '''
        if not piece.startswith(b'IFCM'):
            raise LitError('Header piece #1 is not main directory.')
        chunk_size, num_chunks = int32(piece[8:12]), int32(piece[24:28])
        if (32 + (num_chunks * chunk_size)) != len(piece):
            raise LitError('IFCM header has incorrect length')
        self.entries = {}
        for i in range(num_chunks):
            offset = 32 + (i * chunk_size)
            chunk = piece[offset:offset + chunk_size]
            tag, chunk = chunk[:4], chunk[4:]
            if tag != b'AOLL':
                continue
            remaining, chunk = int32(chunk[:4]), chunk[4:]
            if remaining >= chunk_size:
                raise LitError('AOLL remaining count is negative')
            remaining = chunk_size - (remaining + 48)
            entries = u16(chunk[-2:])
            if entries == 0:
                # Hopefully will work even without a correct entries count
                entries = (2 ** 16) - 1
            chunk = chunk[40:]
            for j in range(entries):
                if remaining <= 0:
                    break
                namelen, chunk, remaining = encint(chunk, remaining)
                if namelen != (namelen & 0x7fffffff):
                    raise LitError('Directory entry had 64bit name length.')
                if namelen > remaining - 3:
                    raise LitError('Read past end of directory chunk')
                try:
                    name = chunk[:namelen].decode('utf-8')
                    chunk = chunk[namelen:]
                    remaining -= namelen
                except UnicodeDecodeError:
                    break
                section, chunk, remaining = encint(chunk, remaining)
                offset, chunk, remaining = encint(chunk, remaining)
                size, chunk, remaining = encint(chunk, remaining)
                entry = DirectoryEntry(name, section, offset, size)
                self.entries[name] = entry

    def read_section_names(self):
        '''
        Read the section name list into ``self.section_names`` and create the
        matching, initially empty, ``self.section_data`` cache.

        :raises LitError: if the name list is missing or malformed
        '''
        if '::DataSpace/NameList' not in self.entries:
            raise LitError('Lit file does not have a valid NameList')
        raw = self.get_file('::DataSpace/NameList')
        if len(raw) < 4:
            raise LitError('Invalid Namelist section')
        pos = 4
        num_sections = u16(raw[2:pos])
        self.section_names = [""] * num_sections
        self.section_data = [None] * num_sections
        for section in range(num_sections):
            size = u16(raw[pos:pos+2])
            pos += 2
            # size counts UTF-16 code units; a two byte terminator follows
            size = size*2 + 2
            if pos + size > len(raw):
                raise LitError('Invalid Namelist section')
            self.section_names[section] = \
                raw[pos:pos+size].decode('utf-16-le').rstrip('\0')
            pos += size

    def read_manifest(self):
        '''
        Read the manifest into ``self.manifest`` (internal id to
        :class:`ManifestItem`) and ``self.paths`` (path to item; the OPF path
        maps to None).

        :raises LitError: if the manifest is missing or truncated
        '''
        if '/manifest' not in self.entries:
            raise LitError('Lit file does not have a valid manifest')
        raw = self.get_file('/manifest')
        self.manifest = {}
        self.paths = {self.opf_path: None}
        while raw:
            slen, raw = ord(raw[0:1]), raw[1:]
            if slen == 0:
                break
            root, raw = raw[:slen].decode('utf8'), raw[slen:]
            if not raw:
                raise LitError('Truncated manifest')
            for state in ['spine', 'not spine', 'css', 'images']:
                num_files, raw = int32(raw), raw[4:]
                if num_files == 0:
                    continue
                for i in range(num_files):
                    if len(raw) < 5:
                        raise LitError('Truncated manifest')
                    offset, raw = u32(raw), raw[4:]
                    internal, raw = consume_sized_utf8_string(raw)
                    original, raw = consume_sized_utf8_string(raw)
                    # The path should be stored unquoted, but not always
                    original = urlunquote(original)
                    # Is this last one UTF-8 or ASCIIZ?
                    mime_type, raw = consume_sized_utf8_string(raw, zpad=True)
                    self.manifest[internal] = ManifestItem(
                        original, internal, mime_type, offset, root, state)
        mlist = list(itervalues(self.manifest))
        # Remove any common path elements
        if len(mlist) > 1:
            shared = mlist[0].path
            for item in mlist[1:]:
                path = item.path
                while shared and not path.startswith(shared):
                    try:
                        shared = shared[:shared.rindex("/", 0, -2) + 1]
                    except ValueError:
                        shared = None
                if not shared:
                    break
            if shared:
                slen = len(shared)
                for item in mlist:
                    item.path = item.path[slen:]
        # Fix any straggling absolute paths
        for item in mlist:
            # startswith() instead of [0]: a path may be empty at this point
            if item.path.startswith('/'):
                item.path = os.path.basename(item.path)
            self.paths[item.path] = item

    def read_drm(self):
        '''
        Set ``self.drmlevel`` from the DRM entries present and, for levels
        below 5, decrypt the book key into ``self.bookkey``.

        :raises LitError: if the book key cannot be decrypted
        :raises DRMError: for DRM level 5
        '''
        self.drmlevel = 0
        if '/DRMStorage/Licenses/EUL' in self.entries:
            self.drmlevel = 5
        elif '/DRMStorage/DRMBookplate' in self.entries:
            self.drmlevel = 3
        elif '/DRMStorage/DRMSealed' in self.entries:
            self.drmlevel = 1
        else:
            return
        if self.drmlevel < 5:
            msdes.deskey(self.calculate_deskey(), msdes.DE1)
            bookkey = msdes.des(self.get_file('/DRMStorage/DRMSealed'))
            if bookkey[0:1] != b'\0':
                raise LitError('Unable to decrypt title key!')
            self.bookkey = bookkey[1:9]
        else:
            raise DRMError("Cannot access DRM-protected book")

    def calculate_deskey(self):
        '''
        Derive the 8 byte DES key that unseals the book key: hash the
        metadata and DRM source (plus the bookplate at DRM level 3), each
        zero padded to a multiple of 64 bytes, then fold the digest into 8
        bytes with XOR.
        '''
        hashfiles = ['/meta', '/DRMStorage/DRMSource']
        if self.drmlevel == 3:
            hashfiles.append('/DRMStorage/DRMBookplate')
        prepad = 2
        hasher = mssha1.new()
        for name in hashfiles:
            data = self.get_file(name)
            if prepad > 0:
                # Only the first file gets the leading padding
                data = (b"\000" * prepad) + data
                prepad = 0
            postpad = 64 - (len(data) % 64)
            if postpad < 64:
                data = data + (b"\000" * postpad)
            hasher.update(data)
        digest = hasher.digest()
        if not isinstance(digest, bytes):
            digest = digest.encode('ascii')
        digest = bytearray(digest)
        key = bytearray(8)
        for i, d in enumerate(digest):
            key[i % 8] ^= d
        return bytes(key)

    def get_file(self, name):
        '''
        Return the data of the directory entry `name`.

        :raises KeyError: if there is no such entry
        '''
        entry = self.entries[name]
        if entry.section == 0:
            # Section 0 is read straight from the content area
            return self.read_content(entry.offset, entry.size)
        section = self.get_section(entry.section)
        return section[entry.offset:entry.offset+entry.size]

    def get_section(self, section):
        '''Return the decoded data of `section`, decoding it on first use.'''
        data = self.section_data[section]
        # "is None" so that a section that decodes to nothing is cached too
        if data is None:
            data = self.get_section_uncached(section)
            self.section_data[section] = data
        return data

    def get_section_uncached(self, section):
        '''
        Read `section` and apply the transforms in its transform list, in
        order.

        :raises LitError: on short control data or an unknown transform
        '''
        name = self.section_names[section]
        path = '::DataSpace/Storage/' + name
        transform = self.get_file(path + '/Transform/List')
        content = self.get_file(path + '/Content')
        control = self.get_file(path + '/ControlData')
        while len(transform) >= 16:
            # Each transform consumes one record of the control data
            csize = (int32(control) + 1) * 4
            if csize > len(control) or csize <= 0:
                raise LitError("ControlData is too short")
            guid = msguid(transform)
            if guid == DESENCRYPT_GUID:
                content = self.decrypt(content)
                control = control[csize:]
            elif guid == LZXCOMPRESS_GUID:
                reset_table = self.get_file(
                    '/'.join(('::DataSpace/Storage', name, 'Transform',
                              LZXCOMPRESS_GUID, 'InstanceData/ResetTable')))
                content = self.decompress(content, control, reset_table)
                control = control[csize:]
            else:
                raise LitError("Unrecognized transform: %s." % repr(guid))
            transform = transform[16:]
        return content

    def decrypt(self, content):
        '''Decrypt `content` with the book key, zero padding it to a multiple
        of 8 bytes first if needed.'''
        length = len(content)
        extra = length & 0x7
        if extra > 0:
            self.warn("content length not a multiple of block size")
            content += b"\0" * (8 - extra)
        msdes.deskey(self.bookkey, msdes.DE1)
        return msdes.des(content)

    def decompress(self, content, control, reset_table):
        '''
        LZX decompress `content`, one window at a time, using the chunk
        boundaries recorded in `reset_table`.

        Chunks that fail to decompress are skipped with a warning.

        :raises LitError: if the control data or the reset table is invalid,
            or if the section cannot be decompressed completely
        '''
        if len(control) < 32 or control[CONTROL_TAG:CONTROL_TAG+4] != b"LZXC":
            raise LitError("Invalid ControlData tag value")
        if len(reset_table) < (RESET_INTERVAL + 8):
            raise LitError("Reset table is too short")
        if u32(reset_table[RESET_UCLENGTH + 4:]) != 0:
            raise LitError("Reset table has 64bit value for UCLENGTH")

        result = []

        # The window size is 14 plus the bit length of the control value
        window_size = 14
        u = u32(control[CONTROL_WINDOW_SIZE:])
        while u > 0:
            u >>= 1
            window_size += 1
        if window_size < 15 or window_size > 21:
            raise LitError("Invalid window in ControlData")
        lzx.init(window_size)

        ofs_entry = int32(reset_table[RESET_HDRLEN:]) + 8
        uclength = int32(reset_table[RESET_UCLENGTH:])
        accum = int32(reset_table[RESET_INTERVAL:])
        bytes_remaining = uclength
        window_bytes = (1 << window_size)
        base = 0

        while ofs_entry < len(reset_table):
            if accum >= window_bytes:
                accum = 0
                size = int32(reset_table[ofs_entry:])
                u = int32(reset_table[ofs_entry + 4:])
                if u != 0:
                    raise LitError("Reset table entry greater than 32 bits")
                if size >= len(content):
                    self.warn("LZX reset table entry out of bounds")
                if bytes_remaining >= window_bytes:
                    lzx.reset()
                    try:
                        result.append(
                            lzx.decompress(content[base:size], window_bytes))
                    except lzx.LZXError:
                        self.warn("LZX decompression error; skipping chunk")
                    bytes_remaining -= window_bytes
                    base = size
            accum += int32(reset_table[RESET_INTERVAL:])
            ofs_entry += 8
        if bytes_remaining < window_bytes and bytes_remaining > 0:
            # The final, partial window
            lzx.reset()
            try:
                result.append(lzx.decompress(content[base:], bytes_remaining))
            except lzx.LZXError:
                self.warn("LZX decompression error; skipping chunk")
            bytes_remaining = 0
        if bytes_remaining > 0:
            raise LitError("Failed to completely decompress section")
        return b''.join(result)

    def get_atoms(self, entry):
        '''
        Read the atom tables of the manifest item `entry`.

        :return: ``(tags, attrs)``, two dicts mapping 1-based indices to
            names as bytes; both are empty if the item has no atom data
        '''
        name = '/'.join(('/data', entry.internal, 'atom'))
        if name not in self.entries:
            return ({}, {})
        data = self.get_file(name)
        nentries, data = u32(data), data[4:]
        tags = {}
        for i in range(1, nentries + 1):
            if len(data) <= 1:
                break
            # Tag names carry a one byte length prefix
            size, data = ord(data[0:1]), data[1:]
            if size == 0 or len(data) < size:
                break
            tags[i], data = data[:size], data[size:]
        if len(tags) != nentries:
            self.warn("damaged or invalid atoms tag table")
        if len(data) < 4:
            return (tags, {})
        attrs = {}
        nentries, data = u32(data), data[4:]
        for i in range(1, nentries + 1):
            if len(data) <= 4:
                break
            # Attribute names carry a four byte length prefix
            size, data = u32(data), data[4:]
            if size == 0 or len(data) < size:
                break
            attrs[i], data = data[:size], data[size:]
        if len(attrs) != nentries:
            self.warn("damaged or invalid atoms attributes table")
        return (tags, attrs)


class LitContainer:
    """Simple Container-interface, read-only accessor for LIT files."""

    def __init__(self, filename_or_stream, log):
        self._litfile = LitFile(filename_or_stream, log)
        self.log = log

    def namelist(self):
        '''Return the paths of all files in the container.'''
        return self._litfile.paths.keys()

    def exists(self, name):
        '''Return True if the (URL quoted) path `name` is in the container.'''
        return urlunquote(name) in self._litfile.paths

    def read(self, name):
        '''
        Return the content of `name`.

        The OPF (also returned for an empty `name`) and spine items are
        converted from binary markup and returned as text; anything else is
        returned as stored.
        '''
        entry = self._litfile.paths[urlunquote(name)] if name else None
        if entry is None:
            content = OPF_DECL + self._read_meta()
        elif 'spine' in entry.state:
            internal = '/'.join(('/data', entry.internal, 'content'))
            raw = self._litfile.get_file(internal)
            manifest = self._litfile.manifest
            atoms = self._litfile.get_atoms(entry)
            unbin = UnBinary(raw, name, manifest, HTML_MAP, atoms)
            content = HTML_DECL + unbin.unicode_representation
            # Drop the st1: tags, keeping their content
            tags = ('personname', 'place', 'city', 'country-region')
            pat = r'(?i)</{0,1}st1:(%s)>'%('|'.join(tags))
            content = re.sub(pat, '', content)
            content = re.sub(r'<(/{0,1})form>', r'<\1div>', content)
        else:
            internal = '/'.join(('/data', entry.internal))
            content = self._litfile.get_file(internal)
        return content

    def _read_meta(self):
        '''Return the OPF text converted from the ``/meta`` stream.'''
        path = 'content.opf'
        raw = self._litfile.get_file('/meta')
        try:
            unbin = UnBinary(raw, path, self._litfile.manifest, OPF_MAP)
        except LitError:
            if b'PENGUIN group' not in raw:
                raise
            # Retry with a tag header inserted before the offending text
            print("WARNING: attempting PENGUIN malformed OPF fix")
            raw = raw.replace(
                b'PENGUIN group', b'\x00\x01\x18\x00PENGUIN group', 1)
            unbin = UnBinary(raw, path, self._litfile.manifest, OPF_MAP)
        return unbin.unicode_representation

    def get_metadata(self):
        '''Return the OPF text of the book.'''
        return self._read_meta()


class LitReader(OEBReader):
    '''OEB reader for LIT files, backed by :class:`LitContainer`.'''

    Container = LitContainer
    DEFAULT_PROFILE = 'MSReader'

    def _spine_from_opf(self, opf):
        '''Re-parse spine items that are typed as generic XML but have an
        ``html`` root as XHTML, then build the spine as usual.'''
        manifest = self.oeb.manifest
        for elem in xpath(opf, '/o2:package/o2:spine/o2:itemref'):
            idref = elem.get('idref')
            if idref not in manifest.ids:
                continue
            item = manifest.ids[idref]
            if (item.media_type.lower() == 'application/xml' and
                hasattr(item.data, 'xpath') and item.data.xpath('/html')):
                item.media_type = 'application/xhtml+xml'
                item.data = item._parse_xhtml(etree.tostring(item.data))
        super()._spine_from_opf(opf)