1'''
2Support for reading LIT files.
3'''
4
5__license__   = 'GPL v3'
6__copyright__ = '2008, Kovid Goyal <kovid at kovidgoyal.net> ' \
7    'and Marshall T. Vandegrift <llasram@gmail.com>'
8
9import io, struct, os, functools, re
10
11from lxml import etree
12
13from calibre.ebooks.lit import LitError
14from calibre.ebooks.lit.maps import OPF_MAP, HTML_MAP
15import calibre.ebooks.lit.mssha1 as mssha1
16from calibre.ebooks.oeb.base import urlnormalize, xpath
17from calibre.ebooks.oeb.reader import OEBReader
18from calibre.ebooks import DRMError
19from polyglot.builtins import codepoint_to_chr, string_or_bytes, itervalues
20from polyglot.urllib import unquote as urlunquote, urldefrag
21from calibre_extensions import lzx, msdes
22
23__all__ = ["LitReader"]
24
# Document prologues: a bare XML declaration, and XML declarations followed
# by the OEB 1.0.1 package / document DOCTYPEs.
XML_DECL = """<?xml version="1.0" encoding="UTF-8" ?>
"""
OPF_DECL = """<?xml version="1.0" encoding="UTF-8" ?>
<!DOCTYPE package
  PUBLIC "+//ISBN 0-9673008-1-9//DTD OEB 1.0.1 Package//EN"
  "http://openebook.org/dtds/oeb-1.0.1/oebpkg101.dtd">
"""
HTML_DECL = """<?xml version="1.0" encoding="UTF-8" ?>
<!DOCTYPE html PUBLIC
 "+//ISBN 0-9673008-1-9//DTD OEB 1.0.1 Document//EN"
 "http://openebook.org/dtds/oeb-1.0.1/oebdoc101.dtd">
"""

# GUIDs (as formatted by msguid) identifying the section transforms that
# LitFile.get_section_uncached knows how to undo.
DESENCRYPT_GUID = "{67F6E4A2-60BF-11D3-8540-00C04F58C3CF}"
LZXCOMPRESS_GUID = "{0A9007C6-4076-11D3-8789-0000F8105754}"

# Byte offsets into a section's ControlData blob (see LitFile.decompress):
# where the b"LZXC" tag is expected and where the window-size word is read.
CONTROL_TAG = 4
CONTROL_WINDOW_SIZE = 12
# Byte offsets into the LZX reset table (see LitFile.decompress).
# RESET_NENTRIES is not referenced by the code visible in this part of the
# file.
RESET_NENTRIES = 4
RESET_HDRLEN = 12
RESET_UCLENGTH = 16
RESET_INTERVAL = 32

# Bit flags carried by the flags character that precedes a tag in the binary
# markup decoded by UnBinary. Only OPENING, CLOSING and ATOM are tested by
# the code visible in this part of the file.
FLAG_OPENING = (1 << 0)
FLAG_CLOSING = (1 << 1)
FLAG_BLOCK   = (1 << 2)
FLAG_HEAD    = (1 << 3)
FLAG_ATOM    = (1 << 4)
53
54
def u32(bytes):
    """Decode the first four bytes of *bytes* as an unsigned little-endian
    32-bit integer. Any trailing bytes are ignored."""
    (value,) = struct.unpack('<L', bytes[:4])
    return value
57
58
def u16(bytes):
    """Decode the first two bytes of *bytes* as an unsigned little-endian
    16-bit integer. Any trailing bytes are ignored."""
    (value,) = struct.unpack('<H', bytes[:2])
    return value
61
62
def int32(bytes):
    """Decode the first four bytes of *bytes* as a signed little-endian
    32-bit integer. Any trailing bytes are ignored."""
    (value,) = struct.unpack('<l', bytes[:4])
    return value
65
66
def encint(byts, remaining):
    """Decode one variable-length integer from the front of *byts*.

    Each byte contributes its low seven bits, most significant group first;
    a byte with the high bit clear ends the number. At most *remaining*
    bytes are consumed.

    Returns a ``(value, rest, remaining)`` tuple: the decoded value, *byts*
    with the consumed bytes removed, and *remaining* reduced by the number
    of bytes consumed.

    Raises IndexError if *byts* runs out while *remaining* is still positive
    and no terminating byte has been seen.
    """
    # Index the buffer directly instead of copying all of it into a
    # bytearray: callers invoke this repeatedly on a shrinking chunk, and
    # only the first few bytes are ever looked at.
    pos, val = 0, 0
    while remaining > 0:
        b = byts[pos]
        pos += 1
        remaining -= 1
        val = (val << 7) | (b & 0x7f)
        if b & 0x80 == 0:
            break
    return val, byts[pos:], remaining
79
80
def msguid(bytes):
    """Format the first 16 bytes of *bytes* as a brace-wrapped, upper-case
    GUID string.

    The first three fields are read little-endian (32, 16 and 16 bits); the
    remaining eight bytes are taken in order.
    """
    fields = struct.unpack("<LHHBBBBBBBB", bytes[:16])
    template = "{%08lX-%04X-%04X-%02X%02X-%02X%02X%02X%02X%02X%02X}"
    return template % fields
84
85
def read_utf8_char(bytes, pos):
    """Decode one UTF-8 encoded character from *bytes* starting at *pos*.

    Returns ``(character, next_pos)``. Lead bytes announcing sequences of up
    to six bytes are accepted. Raises LitError for a stray continuation
    byte, an over-long lead byte, a truncated sequence or a malformed
    continuation byte.
    """
    lead = ord(bytes[pos:pos+1])
    if not (lead & 0x80):
        # Plain single-byte character.
        return codepoint_to_chr(lead), pos + 1

    # Count the leading one bits of the lead byte: that is the sequence
    # length, and the bits below `mask` are the payload.
    mask, elsize = 0x80, 0
    while lead & mask:
        mask >>= 1
        elsize += 1
    # mask == 0x40 means a lone continuation byte; mask <= 1 means seven or
    # eight leading one bits.
    if (mask <= 1) or (mask == 0x40):
        raise LitError('Invalid UTF8 character: %s' % repr(bytes[pos]))
    if pos + elsize > len(bytes):
        raise LitError('Invalid UTF8 character: %s' % repr(bytes[pos]))

    code = lead & (mask - 1)
    for offset in range(1, elsize):
        cont = ord(bytes[pos+offset:pos+offset+1])
        if (cont & 0xC0) != 0x80:
            raise LitError(
                'Invalid UTF8 character: %s' % repr(bytes[pos:pos+offset]))
        code = (code << 6) | (cont & 0x3F)
    return codepoint_to_chr(code), pos + elsize
109
110
def consume_sized_utf8_string(bytes, zpad=False):
    """Read a length-prefixed UTF-8 string from the front of *bytes*.

    The first character's code point is the number of characters that
    follow. When *zpad* is true, a single NUL byte directly after the string
    is also consumed. Returns ``(string, remaining_bytes)``.
    """
    length_char, pos = read_utf8_char(bytes, 0)
    chars = []
    remaining = ord(length_char)
    while remaining > 0:
        char, pos = read_utf8_char(bytes, pos)
        chars.append(char)
        remaining -= 1
    if zpad and bytes[pos:pos+1] == b'\0':
        pos += 1
    return ''.join(chars), bytes[pos:]
120
121
def encode(string):
    """Return ``str(string)`` as ASCII bytes, with every non-ASCII character
    replaced by an XML numeric character reference."""
    text = str(string)
    return text.encode('ascii', errors='xmlcharrefreplace')
124
125
class UnBinary:
    '''
    Convert binary (tokenised) markup into textual markup.

    The conversion runs in the constructor. The result is available as bytes
    through `binary_representation` and as text through
    `unicode_representation` / `str()`.
    '''
    # Matches an '&' that does not start a numeric or named entity reference.
    # The '-' sits at the end of the character class so it is a literal; when
    # written as '.-_' it forms a range that also admits '<', '>', ';', '/'
    # and others, which let malformed references escape the '&amp;' rewrite.
    AMPERSAND_RE = re.compile(
        br'&(?!(?:#[0-9]+|#x[0-9a-fA-F]+|[a-zA-Z_:][a-zA-Z0-9._:-]+);)')
    # binary_to_text_inner doubles every literal '<' / '>' found in text so
    # they can be told apart from markup; these patterns find the doubled
    # forms again (leaving comment delimiters alone).
    OPEN_ANGLE_RE = re.compile(br'<<(?![!]--)')
    CLOSE_ANGLE_RE = re.compile(br'(?<!--)>>(?=>>|[^>])')
    DOUBLE_ANGLE_RE = re.compile(br'([<>])\1')
    EMPTY_ATOMS = ({},{})

    def __init__(self, bin, path, manifest={}, map=HTML_MAP, atoms=EMPTY_ATOMS):
        '''
        :param bin: the binary markup to convert (bytes).
        :param path: path of the document; only its directory is kept, and
            it is used to make `href`/`src` targets relative.
        :param manifest: mapping from internal id to an object with a `.path`
            attribute. It is only read, never modified, by this class.
        :param map: ``(tag_map, attr_map, tag_to_attr_map)`` triple.
        :param atoms: ``(tag_atoms, attr_atoms)`` pair.
        '''
        self.manifest = manifest
        self.tag_map, self.attr_map, self.tag_to_attr_map = map
        self.is_html = map is HTML_MAP
        self.tag_atoms, self.attr_atoms = atoms
        self.dir = os.path.dirname(path)
        buf = io.BytesIO()
        self.binary_to_text(bin, buf)
        self.raw = buf.getvalue().lstrip()
        self.escape_reserved()
        self._tree = None

    def escape_reserved(self):
        '''Escape stray '&' and the doubled angle brackets in `self.raw`.'''
        raw = self.raw
        raw = self.AMPERSAND_RE.sub(br'&amp;', raw)
        raw = self.OPEN_ANGLE_RE.sub(br'&lt;', raw)
        raw = self.CLOSE_ANGLE_RE.sub(br'&gt;', raw)
        # Anything still doubled is collapsed back to a single bracket.
        raw = self.DOUBLE_ANGLE_RE.sub(br'\1', raw)
        self.raw = raw

    def item_path(self, internal_id):
        '''
        Return the manifest path for `internal_id`, relative to the directory
        of the document being converted. Ids missing from the manifest are
        returned unchanged.
        '''
        try:
            target = self.manifest[internal_id].path
        except KeyError:
            return internal_id
        if not self.dir:
            return target
        target = target.split('/')
        base = self.dir.split('/')
        # Find the first path component at which the two paths differ.
        for index in range(min(len(base), len(target))):
            if base[index] != target[index]:
                break
        else:
            # No difference found: every compared component is shared.
            index += 1
        relpath = (['..'] * (len(base) - index)) + target[index:]
        return '/'.join(relpath)

    @property
    def binary_representation(self):
        '''The converted markup as bytes.'''
        return self.raw

    @property
    def unicode_representation(self):
        '''The converted markup decoded as UTF-8.'''
        return self.raw.decode('utf-8')

    def __unicode__(self):
        return self.unicode_representation

    def __str__(self):
        return self.unicode_representation

    def binary_to_text(self, bin, buf):
        '''
        Convert `bin` and write the textual markup to `buf`.

        Element nesting is tracked with an explicit stack of frames rather
        than recursion; `self.cpos` is the read position in `bin` shared by
        all frames and only exists for the duration of the conversion.
        '''
        stack = [(0, None, None, 0, 0, False, False, 'text', 0)]
        self.cpos = 0
        while stack:
            self.binary_to_text_inner(bin, buf, stack)
        del self.cpos

    def binary_to_text_inner(self, bin, buf, stack):
        '''
        Pop one frame from `stack` and run the decoding state machine from
        `self.cpos` until the input ends, a closing tag is read, or a child
        element is opened (in which case two new frames are pushed).

        A frame is the tuple ``(depth, tag_name, current_map, dynamic_tag,
        errors, in_censorship, is_goingdown, state, flags)``.

        Raises LitError on malformed input.
        '''
        (depth, tag_name, current_map, dynamic_tag, errors,
                in_censorship, is_goingdown, state, flags) = stack.pop()

        # Resuming a frame whose children have been fully processed: emit the
        # end tag, then carry on reading text at the same depth.
        if state == 'close tag':
            if not tag_name:
                raise LitError('Tag ends before it begins.')
            buf.write(encode(''.join(('</', tag_name, '>'))))
            dynamic_tag = 0
            tag_name = None
            state = 'text'

        while self.cpos < len(bin):
            c, self.cpos = read_utf8_char(bin, self.cpos)
            oc = ord(c)

            if state == 'text':
                if oc == 0:
                    # NUL introduces a tag.
                    state = 'get flags'
                    continue
                elif c == '\v':
                    c = '\n'
                elif c == '>':
                    # Doubled so escape_reserved can recognise literal
                    # brackets later.
                    c = '>>'
                elif c == '<':
                    c = '<<'
                buf.write(encode(c))

            elif state == 'get flags':
                if oc == 0:
                    state = 'text'
                    continue
                flags = oc
                state = 'get tag'

            elif state == 'get tag':
                state = 'text' if oc == 0 else 'get attr'
                if flags & FLAG_OPENING:
                    tag = oc
                    buf.write(b'<')
                    if not (flags & FLAG_CLOSING):
                        # Opening without closing: the element has content.
                        is_goingdown = True
                    if tag == 0x8000:
                        # The tag name is spelled out inline.
                        state = 'get custom length'
                        continue
                    if flags & FLAG_ATOM:
                        if not self.tag_atoms or tag not in self.tag_atoms:
                            raise LitError(
                                "atom tag %d not in atom tag list" % tag)
                        tag_name = self.tag_atoms[tag]
                        current_map = self.attr_atoms
                    elif tag < len(self.tag_map):
                        tag_name = self.tag_map[tag]
                        current_map = self.tag_to_attr_map[tag]
                    else:
                        dynamic_tag += 1
                        errors += 1
                        tag_name = '?'+codepoint_to_chr(tag)+'?'
                        current_map = self.tag_to_attr_map[tag]
                        print('WARNING: tag %s unknown' % codepoint_to_chr(tag))
                    buf.write(encode(tag_name))
                elif flags & FLAG_CLOSING:
                    if depth == 0:
                        raise LitError('Extra closing tag %s at %d'%(tag_name,
                            self.cpos))
                    # End of this element's content: return to the parent
                    # frame, which is waiting in the 'close tag' state.
                    break

            elif state == 'get attr':
                in_censorship = False
                if oc == 0:
                    # End of the attribute list.
                    state = 'text'
                    if not is_goingdown:
                        tag_name = None
                        dynamic_tag = 0
                        buf.write(b' />')
                    else:
                        buf.write(b'>')
                        # Push this element (to be closed later), then a
                        # fresh frame for its children, and yield to the
                        # driver loop.
                        frame = (depth, tag_name, current_map,
                            dynamic_tag, errors, in_censorship, False,
                            'close tag', flags)
                        stack.append(frame)
                        frame = (depth+1, None, None, 0, 0,
                                False, False, 'text', 0)
                        stack.append(frame)
                        break
                else:
                    if oc == 0x8000:
                        # The attribute name is spelled out inline.
                        state = 'get attr length'
                        continue
                    attr = None
                    # Tag-specific attributes take precedence over the
                    # global attribute map.
                    if current_map and oc in current_map and current_map[oc]:
                        attr = current_map[oc]
                    elif oc in self.attr_map:
                        attr = self.attr_map[oc]
                    if not attr or not isinstance(attr, string_or_bytes):
                        raise LitError(
                            'Unknown attribute %d in tag %s' % (oc, tag_name))
                    if attr.startswith('%'):
                        # Attributes whose mapped name starts with '%' are
                        # parsed but produce no output.
                        in_censorship = True
                        state = 'get value length'
                        continue
                    buf.write(b' ' + encode(attr) + b'=')
                    if attr in ['href', 'src']:
                        state = 'get href length'
                    else:
                        state = 'get value length'

            elif state == 'get value length':
                if not in_censorship:
                    buf.write(b'"')
                # The stored length is one more than the character count.
                count = oc - 1
                if count == 0:
                    if not in_censorship:
                        buf.write(b'"')
                    in_censorship = False
                    state = 'get attr'
                    continue
                state = 'get value'
                if oc == 0xffff:
                    # Leaves count == 0xfffe, which 'get value' treats as
                    # "the next character is a number".
                    continue
                if count < 0 or count > (len(bin) - self.cpos):
                    raise LitError('Invalid character count %d' % count)

            elif state == 'get value':
                if count == 0xfffe:
                    if not in_censorship:
                        buf.write(encode('%s"' % (oc - 1)))
                    in_censorship = False
                    state = 'get attr'
                elif count > 0:
                    if not in_censorship:
                        if c == '"':
                            c = '&quot;'
                        elif c == '<':
                            c = '&lt;'
                        if isinstance(c, str):
                            c = c.encode('ascii', 'xmlcharrefreplace')
                        buf.write(c)
                    count -= 1
                if count == 0:
                    if not in_censorship:
                        buf.write(b'"')
                    in_censorship = False
                    state = 'get attr'

            elif state == 'get custom length':
                count = oc - 1
                if count <= 0 or count > len(bin)-self.cpos:
                    raise LitError('Invalid character count %d' % count)
                dynamic_tag += 1
                state = 'get custom'
                tag_name = ''

            elif state == 'get custom':
                tag_name += c
                count -= 1
                if count == 0:
                    buf.write(encode(tag_name))
                    state = 'get attr'

            elif state == 'get attr length':
                count = oc - 1
                if count <= 0 or count > (len(bin) - self.cpos):
                    raise LitError('Invalid character count %d' % count)
                buf.write(b' ')
                state = 'get custom attr'

            elif state == 'get custom attr':
                buf.write(encode(c))
                count -= 1
                if count == 0:
                    buf.write(b'=')
                    state = 'get value length'

            elif state == 'get href length':
                count = oc - 1
                if count <= 0 or count > (len(bin) - self.cpos):
                    raise LitError('Invalid character count %d' % count)
                href = ''
                state = 'get href'

            elif state == 'get href':
                href += c
                count -= 1
                if count == 0:
                    # The first character of the stored href is dropped; the
                    # rest is an internal id plus optional fragment, which
                    # is rewritten to a path relative to this document.
                    doc, frag = urldefrag(href[1:])
                    path = self.item_path(doc)
                    if frag:
                        path = '#'.join((path, frag))
                    path = urlnormalize(path)
                    buf.write(encode('"%s"' % path))
                    state = 'get attr'
384
385
class DirectoryEntry:
    """One named entry of the LIT directory: which section holds it, and the
    offset and size of its data within that section."""

    def __init__(self, name, section, offset, size):
        self.name, self.section = name, section
        self.offset, self.size = offset, size

    def __repr__(self):
        fields = (repr(self.name), self.section, self.offset, self.size)
        return "DirectoryEntry(name=%s, section=%d, offset=%d, size=%d)" \
            % fields

    def __str__(self):
        return repr(self)
400
401
class ManifestItem:
    """A manifest record. Alongside the raw fields it carries `path`, a
    cleaned-up, forward-slash version of the original file name."""

    def __init__(self, original, internal, mime_type, offset, root, state):
        self.original = original
        self.internal = internal
        if hasattr(mime_type, 'lower'):
            mime_type = mime_type.lower()
        self.mime_type = mime_type
        self.offset = offset
        self.root = root
        self.state = state
        self.path = self._clean_path(original)

    @staticmethod
    def _clean_path(original):
        """Normalise *original* into a forward-slash path."""
        # Some LIT files have Windows-style paths
        cleaned = original.replace('\\', '/')
        if cleaned[1:3] == ':/':
            # Drop the drive letter and colon.
            cleaned = cleaned[2:]
        # Some paths in Fictionwise "multiformat" LIT files contain '..' (!?)
        cleaned = os.path.normpath(cleaned).replace('\\', '/')
        while cleaned.startswith('../'):
            cleaned = cleaned[3:]
        return cleaned

    def __eq__(self, other):
        # Compare by internal id; anything without an `internal` attribute
        # is compared against the id directly.
        return self.internal == getattr(other, 'internal', other)

    def __repr__(self):
        template = ("ManifestItem(internal=%r, path=%r, mime_type=%r, "
                    "offset=%d, root=%r, state=%r)")
        return template % (self.internal, self.path, self.mime_type,
                           self.offset, self.root, self.state)
431
432
def preserve(function):
    """Decorator for methods of objects that have a `stream` attribute: the
    stream position found on entry is restored when the method returns or
    raises."""
    @functools.wraps(function)
    def wrapper(self, *args, **kwargs):
        saved = self.stream.tell()
        try:
            result = function(self, *args, **kwargs)
        finally:
            self.stream.seek(saved)
        return result
    return wrapper
442
443
444class LitFile:
445    PIECE_SIZE = 16
446
447    def __init__(self, filename_or_stream, log):
448        self._warn = log.warn
449        if hasattr(filename_or_stream, 'read'):
450            self.stream = filename_or_stream
451        else:
452            self.stream = open(filename_or_stream, 'rb')
453        try:
454            self.opf_path = os.path.splitext(
455                os.path.basename(self.stream.name))[0] + '.opf'
456        except AttributeError:
457            self.opf_path = 'content.opf'
458        if self.magic != b'ITOLITLS':
459            raise LitError('Not a valid LIT file')
460        if self.version != 1:
461            raise LitError('Unknown LIT version %d' % (self.version,))
462        self.read_secondary_header()
463        self.read_header_pieces()
464        self.read_section_names()
465        self.read_manifest()
466        self.read_drm()
467
468    def warn(self, msg):
469        self._warn(msg)
470
471    def magic():
472        @preserve
473        def fget(self):
474            self.stream.seek(0)
475            return self.stream.read(8)
476        return property(fget=fget)
477    magic = magic()
478
479    def version():
480        def fget(self):
481            self.stream.seek(8)
482            return u32(self.stream.read(4))
483        return property(fget=fget)
484    version = version()
485
486    def hdr_len():
487        @preserve
488        def fget(self):
489            self.stream.seek(12)
490            return int32(self.stream.read(4))
491        return property(fget=fget)
492    hdr_len = hdr_len()
493
494    def num_pieces():
495        @preserve
496        def fget(self):
497            self.stream.seek(16)
498            return int32(self.stream.read(4))
499        return property(fget=fget)
500    num_pieces = num_pieces()
501
502    def sec_hdr_len():
503        @preserve
504        def fget(self):
505            self.stream.seek(20)
506            return int32(self.stream.read(4))
507        return property(fget=fget)
508    sec_hdr_len = sec_hdr_len()
509
510    def guid():
511        @preserve
512        def fget(self):
513            self.stream.seek(24)
514            return self.stream.read(16)
515        return property(fget=fget)
516    guid = guid()
517
518    def header():
519        @preserve
520        def fget(self):
521            size = self.hdr_len \
522                + (self.num_pieces * self.PIECE_SIZE) \
523                + self.sec_hdr_len
524            self.stream.seek(0)
525            return self.stream.read(size)
526        return property(fget=fget)
527    header = header()
528
529    @preserve
530    def __len__(self):
531        self.stream.seek(0, 2)
532        return self.stream.tell()
533
534    @preserve
535    def read_raw(self, offset, size):
536        self.stream.seek(offset)
537        return self.stream.read(size)
538
539    def read_content(self, offset, size):
540        return self.read_raw(self.content_offset + offset, size)
541
542    def read_secondary_header(self):
543        offset = self.hdr_len + (self.num_pieces * self.PIECE_SIZE)
544        byts = self.read_raw(offset, self.sec_hdr_len)
545        offset = int32(byts[4:])
546        while offset < len(byts):
547            blocktype = byts[offset:offset+4]
548            blockver  = u32(byts[offset+4:])
549            if blocktype == b'CAOL':
550                if blockver != 2:
551                    raise LitError(
552                        'Unknown CAOL block format %d' % blockver)
553                self.creator_id     = u32(byts[offset+12:])
554                self.entry_chunklen = u32(byts[offset+20:])
555                self.count_chunklen = u32(byts[offset+24:])
556                self.entry_unknown  = u32(byts[offset+28:])
557                self.count_unknown  = u32(byts[offset+32:])
558                offset += 48
559            elif blocktype == b'ITSF':
560                if blockver != 4:
561                    raise LitError(
562                        'Unknown ITSF block format %d' % blockver)
563                if u32(byts[offset+4+16:]):
564                    raise LitError('This file has a 64bit content offset')
565                self.content_offset = u32(byts[offset+16:])
566                self.timestamp      = u32(byts[offset+24:])
567                self.language_id    = u32(byts[offset+28:])
568                offset += 48
569        if not hasattr(self, 'content_offset'):
570            raise LitError('Could not figure out the content offset')
571
572    def read_header_pieces(self):
573        src = self.header[self.hdr_len:]
574        for i in range(self.num_pieces):
575            piece = src[i * self.PIECE_SIZE:(i + 1) * self.PIECE_SIZE]
576            if u32(piece[4:]) != 0 or u32(piece[12:]) != 0:
577                raise LitError('Piece %s has 64bit value' % repr(piece))
578            offset, size = u32(piece), int32(piece[8:])
579            piece = self.read_raw(offset, size)
580            if i == 0:
581                continue  # Dont need this piece
582            elif i == 1:
583                if u32(piece[8:])  != self.entry_chunklen or \
584                   u32(piece[12:]) != self.entry_unknown:
585                    raise LitError('Secondary header does not match piece')
586                self.read_directory(piece)
587            elif i == 2:
588                if u32(piece[8:])  != self.count_chunklen or \
589                   u32(piece[12:]) != self.count_unknown:
590                    raise LitError('Secondary header does not match piece')
591                continue  # No data needed from this piece
592            elif i == 3:
593                self.piece3_guid = piece
594            elif i == 4:
595                self.piece4_guid = piece
596
    def read_directory(self, piece):
        """Parse the main directory piece into `self.entries`, a dict mapping
        entry name to DirectoryEntry.

        *piece* must start with b'IFCM' and consist of a 32-byte header
        followed by `num_chunks` chunks of `chunk_size` bytes each. Only
        chunks tagged b'AOLL' are read.

        Raises LitError when the piece is not an IFCM block, its length does
        not match its header, or a chunk is internally inconsistent.
        """
        if not piece.startswith(b'IFCM'):
            raise LitError('Header piece #1 is not main directory.')
        chunk_size, num_chunks = int32(piece[8:12]), int32(piece[24:28])
        if (32 + (num_chunks * chunk_size)) != len(piece):
            raise LitError('IFCM header has incorrect length')
        self.entries = {}
        for i in range(num_chunks):
            offset = 32 + (i * chunk_size)
            chunk = piece[offset:offset + chunk_size]
            tag, chunk = chunk[:4], chunk[4:]
            if tag != b'AOLL':
                continue
            # The stored word is subtracted (together with the 48-byte chunk
            # header) from the chunk size to get the bytes of entry data.
            remaining, chunk = int32(chunk[:4]), chunk[4:]
            if remaining >= chunk_size:
                raise LitError('AOLL remaining count is negative')
            remaining = chunk_size - (remaining + 48)
            # The entry count is taken from the last two bytes of the chunk.
            entries = u16(chunk[-2:])
            if entries == 0:
                # Hopefully will work even without a correct entries count
                entries = (2 ** 16) - 1
            # Skip the rest of the chunk header.
            chunk = chunk[40:]
            for j in range(entries):
                if remaining <= 0:
                    break
                namelen, chunk, remaining = encint(chunk, remaining)
                if namelen != (namelen & 0x7fffffff):
                    raise LitError('Directory entry had 64bit name length.')
                # At least three more bytes (section, offset, size) must
                # follow the name.
                if namelen > remaining - 3:
                    raise LitError('Read past end of directory chunk')
                try:
                    name = chunk[:namelen].decode('utf-8')
                    chunk = chunk[namelen:]
                    remaining -= namelen
                except UnicodeDecodeError:
                    # An undecodable name abandons the rest of this chunk.
                    break
                section, chunk, remaining = encint(chunk, remaining)
                offset, chunk, remaining = encint(chunk, remaining)
                size, chunk, remaining = encint(chunk, remaining)
                entry = DirectoryEntry(name, section, offset, size)
                self.entries[name] = entry
638
639    def read_section_names(self):
640        if '::DataSpace/NameList' not in self.entries:
641            raise LitError('Lit file does not have a valid NameList')
642        raw = self.get_file('::DataSpace/NameList')
643        if len(raw) < 4:
644            raise LitError('Invalid Namelist section')
645        pos = 4
646        num_sections = u16(raw[2:pos])
647        self.section_names = [""] * num_sections
648        self.section_data = [None] * num_sections
649        for section in range(num_sections):
650            size = u16(raw[pos:pos+2])
651            pos += 2
652            size = size*2 + 2
653            if pos + size > len(raw):
654                raise LitError('Invalid Namelist section')
655            self.section_names[section] = \
656                raw[pos:pos+size].decode('utf-16-le').rstrip('\0')
657            pos += size
658
    def read_manifest(self):
        """Parse the '/manifest' entry.

        Populates `self.manifest` (internal id -> ManifestItem) and
        `self.paths` (path -> ManifestItem, plus the OPF path mapped to
        None). Item paths have any shared leading prefix removed.

        Raises LitError when the entry is missing or truncated.
        """
        if '/manifest' not in self.entries:
            raise LitError('Lit file does not have a valid manifest')
        raw = self.get_file('/manifest')
        self.manifest = {}
        self.paths = {self.opf_path: None}
        while raw:
            # Each group starts with a one-byte length and a root name of
            # that many bytes; a zero length ends the manifest.
            slen, raw = ord(raw[0:1]), raw[1:]
            if slen == 0:
                break
            root, raw = raw[:slen].decode('utf8'), raw[slen:]
            if not raw:
                raise LitError('Truncated manifest')
            # Four file lists follow, always in this order.
            for state in ['spine', 'not spine', 'css', 'images']:
                num_files, raw = int32(raw), raw[4:]
                if num_files == 0:
                    continue
                for i in range(num_files):
                    if len(raw) < 5:
                        raise LitError('Truncated manifest')
                    offset, raw = u32(raw), raw[4:]
                    internal, raw = consume_sized_utf8_string(raw)
                    original, raw = consume_sized_utf8_string(raw)
                    # The path should be stored unquoted, but not always
                    original = urlunquote(original)
                    # Is this last one UTF-8 or ASCIIZ?
                    mime_type, raw = consume_sized_utf8_string(raw, zpad=True)
                    self.manifest[internal] = ManifestItem(
                        original, internal, mime_type, offset, root, state)
        mlist = list(itervalues(self.manifest))
        # Remove any common path elements
        if len(mlist) > 1:
            # NOTE(review): the candidate prefix starts out as the first
            # item's complete path, not its directory, so if every other path
            # begins with that string the whole of it is stripped - confirm
            # this is intended.
            shared = mlist[0].path
            for item in mlist[1:]:
                path = item.path
                while shared and not path.startswith(shared):
                    try:
                        # Shorten the candidate to end at an earlier '/'.
                        shared = shared[:shared.rindex("/", 0, -2) + 1]
                    except ValueError:
                        shared = None
                if not shared:
                    break
            if shared:
                slen = len(shared)
                for item in mlist:
                    item.path = item.path[slen:]
        # Fix any straggling absolute paths
        for item in mlist:
            # NOTE(review): indexing [0] raises IndexError if a path ended up
            # empty after the prefix removal above - verify whether that can
            # happen with real files.
            if item.path[0] == '/':
                item.path = os.path.basename(item.path)
            self.paths[item.path] = item
710
711    def read_drm(self):
712        self.drmlevel = 0
713        if '/DRMStorage/Licenses/EUL' in self.entries:
714            self.drmlevel = 5
715        elif '/DRMStorage/DRMBookplate' in self.entries:
716            self.drmlevel = 3
717        elif '/DRMStorage/DRMSealed' in self.entries:
718            self.drmlevel = 1
719        else:
720            return
721        if self.drmlevel < 5:
722            msdes.deskey(self.calculate_deskey(), msdes.DE1)
723            bookkey = msdes.des(self.get_file('/DRMStorage/DRMSealed'))
724            if bookkey[0:1] != b'\0':
725                raise LitError('Unable to decrypt title key!')
726            self.bookkey = bookkey[1:9]
727        else:
728            raise DRMError("Cannot access DRM-protected book")
729
730    def calculate_deskey(self):
731        hashfiles = ['/meta', '/DRMStorage/DRMSource']
732        if self.drmlevel == 3:
733            hashfiles.append('/DRMStorage/DRMBookplate')
734        prepad = 2
735        hash = mssha1.new()
736        for name in hashfiles:
737            data = self.get_file(name)
738            if prepad > 0:
739                data = (b"\000" * prepad) + data
740                prepad = 0
741            postpad = 64 - (len(data) % 64)
742            if postpad < 64:
743                data = data + (b"\000" * postpad)
744            hash.update(data)
745        digest = hash.digest()
746        if not isinstance(digest, bytes):
747            digest = digest.encode('ascii')
748        digest = bytearray(digest)
749        key = bytearray(8)
750        for i, d in enumerate(digest):
751            key[i % 8] ^= d
752        return bytes(key)
753
754    def get_file(self, name):
755        entry = self.entries[name]
756        if entry.section == 0:
757            return self.read_content(entry.offset, entry.size)
758        section = self.get_section(entry.section)
759        return section[entry.offset:entry.offset+entry.size]
760
761    def get_section(self, section):
762        data = self.section_data[section]
763        if not data:
764            data = self.get_section_uncached(section)
765            self.section_data[section] = data
766        return data
767
768    def get_section_uncached(self, section):
769        name = self.section_names[section]
770        path = '::DataSpace/Storage/' + name
771        transform = self.get_file(path + '/Transform/List')
772        content = self.get_file(path + '/Content')
773        control = self.get_file(path + '/ControlData')
774        while len(transform) >= 16:
775            csize = (int32(control) + 1) * 4
776            if csize > len(control) or csize <= 0:
777                raise LitError("ControlData is too short")
778            guid = msguid(transform)
779            if guid == DESENCRYPT_GUID:
780                content = self.decrypt(content)
781                control = control[csize:]
782            elif guid == LZXCOMPRESS_GUID:
783                reset_table = self.get_file(
784                    '/'.join(('::DataSpace/Storage', name, 'Transform',
785                              LZXCOMPRESS_GUID, 'InstanceData/ResetTable')))
786                content = self.decompress(content, control, reset_table)
787                control = control[csize:]
788            else:
789                raise LitError("Unrecognized transform: %s." % repr(guid))
790            transform = transform[16:]
791        return content
792
793    def decrypt(self, content):
794        length = len(content)
795        extra = length & 0x7
796        if extra > 0:
797            self.warn("content length not a multiple of block size")
798            content += b"\0" * (8 - extra)
799        msdes.deskey(self.bookkey, msdes.DE1)
800        return msdes.des(content)
801
    def decompress(self, content, control, reset_table):
        """Decompress an LZX-compressed section chunk by chunk.

        *content* is the compressed data, *control* the section's
        ControlData and *reset_table* the matching ResetTable.  The
        reset table supplies the end offsets (into *content*) of the
        individually decodable chunks and the total uncompressed length.

        Returns the concatenation of every chunk that decoded
        successfully.  A chunk that raises ``lzx.LZXError`` is skipped with
        a warning, so the result may be shorter than the declared length.

        Raises LitError when the ControlData is shorter than 32 bytes or
        lacks the ``LZXC`` tag, when the reset table is too short or holds
        values that do not fit in 32 bits, when the window size is outside
        15..21, or when the reset table runs out while a full window or
        more of output is still expected.
        """
        if len(control) < 32 or control[CONTROL_TAG:CONTROL_TAG+4] != b"LZXC":
            raise LitError("Invalid ControlData tag value")
        if len(reset_table) < (RESET_INTERVAL + 8):
            raise LitError("Reset table is too short")
        # The uncompressed length is stored as 64 bits; only values whose
        # upper 32 bits are zero are supported.
        if u32(reset_table[RESET_UCLENGTH + 4:]) != 0:
            raise LitError("Reset table has 64bit value for UCLENGTH")

        result = []

        # window_size ends up as 14 plus the bit length of the ControlData
        # window field.
        window_size = 14
        u = u32(control[CONTROL_WINDOW_SIZE:])
        while u > 0:
            u >>= 1
            window_size += 1
        if window_size < 15 or window_size > 21:
            raise LitError("Invalid window in ControlData")
        lzx.init(window_size)

        # NOTE(review): the +8 presumably skips the first 8-byte entry
        # following the header -- confirm against the format description.
        ofs_entry = int32(reset_table[RESET_HDRLEN:]) + 8
        uclength = int32(reset_table[RESET_UCLENGTH:])
        accum = int32(reset_table[RESET_INTERVAL:])
        bytes_remaining = uclength
        window_bytes = (1 << window_size)
        base = 0  # start offset in content of the next chunk to decode

        # Walk the 8-byte reset entries.  accum grows by the table's
        # interval value per entry; an entry is used as a chunk boundary
        # only once accum has reached a full window.
        while ofs_entry < len(reset_table):
            if accum >= window_bytes:
                accum = 0
                # Each entry is a 64-bit offset read as two 32-bit halves;
                # the upper half must be zero.
                size = int32(reset_table[ofs_entry:])
                u = int32(reset_table[ofs_entry + 4:])
                if u != 0:
                    raise LitError("Reset table entry greater than 32 bits")
                # Only a warning: the slice below is still attempted.
                if size >= len(content):
                    self._warn("LZX reset table entry out of bounds")
                if bytes_remaining >= window_bytes:
                    lzx.reset()
                    try:
                        result.append(
                            lzx.decompress(content[base:size], window_bytes))
                    except lzx.LZXError:
                        self.warn("LZX decompression error; skipping chunk")
                    # The window is accounted for even when decoding failed.
                    bytes_remaining -= window_bytes
                    base = size
            accum += int32(reset_table[RESET_INTERVAL:])
            ofs_entry += 8
        # Final partial window: everything after the last boundary.
        if bytes_remaining < window_bytes and bytes_remaining > 0:
            lzx.reset()
            try:
                result.append(lzx.decompress(content[base:], bytes_remaining))
            except lzx.LZXError:
                self.warn("LZX decompression error; skipping chunk")
            bytes_remaining = 0
        # Still positive here means at least one full window was never
        # reached because the reset table had no more entries.
        if bytes_remaining > 0:
            raise LitError("Failed to completely decompress section")
        return b''.join(result)
858
859    def get_atoms(self, entry):
860        name = '/'.join(('/data', entry.internal, 'atom'))
861        if name not in self.entries:
862            return ({}, {})
863        data = self.get_file(name)
864        nentries, data = u32(data), data[4:]
865        tags = {}
866        for i in range(1, nentries + 1):
867            if len(data) <= 1:
868                break
869            size, data = ord(data[0:1]), data[1:]
870            if size == 0 or len(data) < size:
871                break
872            tags[i], data = data[:size], data[size:]
873        if len(tags) != nentries:
874            self._warn("damaged or invalid atoms tag table")
875        if len(data) < 4:
876            return (tags, {})
877        attrs = {}
878        nentries, data = u32(data), data[4:]
879        for i in range(1, nentries + 1):
880            if len(data) <= 4:
881                break
882            size, data = u32(data), data[4:]
883            if size == 0 or len(data) < size:
884                break
885            attrs[i], data = data[:size], data[size:]
886        if len(attrs) != nentries:
887            self._warn("damaged or invalid atoms attributes table")
888        return (tags, attrs)
889
890
class LitContainer:
    """Simple Container-interface, read-only accessor for LIT files.

    Wraps a LitFile and exposes its entries by (URL-quoted) name.  The
    *log* object passed to the constructor is kept as ``self.log`` and is
    used for diagnostics emitted by this class.
    """

    # Opening/closing st1:* elements that are stripped from spine HTML
    # (presumably Microsoft Office smart-tag markup).  Compiled once
    # instead of being rebuilt on every read() call.
    _SMART_TAG_RE = re.compile(
        r'(?i)</{0,1}st1:(%s)>' % '|'.join(
            ('personname', 'place', 'city', 'country-region')))
    # <form> and </form>, which are rewritten to <div> and </div>.
    _FORM_TAG_RE = re.compile(r'<(/{0,1})form>')

    def __init__(self, filename_or_stream, log):
        """Open the LIT file at *filename_or_stream*, reporting via *log*."""
        self._litfile = LitFile(filename_or_stream, log)
        self.log = log

    def namelist(self):
        """Return the names of all entries known to the LIT file."""
        return self._litfile.paths.keys()

    def exists(self, name):
        """Return True if the URL-unquoted *name* is a known entry."""
        return urlunquote(name) in self._litfile.paths

    def read(self, name):
        """Return the content stored under *name*.

        A falsy *name* yields the OPF package document (declaration plus
        decoded metadata).  Entries whose state contains ``'spine'`` are
        decoded from their binary form to HTML text, prefixed with the
        HTML declaration and cleaned of st1:* and form tags.  Any other
        entry is returned exactly as ``get_file`` delivers it.

        Lookup errors for an unknown *name* propagate from the underlying
        paths mapping.
        """
        entry = self._litfile.paths[urlunquote(name)] if name else None
        if entry is None:
            content = OPF_DECL + self._read_meta()
        elif 'spine' in entry.state:
            internal = '/'.join(('/data', entry.internal, 'content'))
            raw = self._litfile.get_file(internal)
            manifest = self._litfile.manifest
            atoms = self._litfile.get_atoms(entry)
            unbin = UnBinary(raw, name, manifest, HTML_MAP, atoms)
            content = HTML_DECL + unbin.unicode_representation
            content = self._SMART_TAG_RE.sub('', content)
            content = self._FORM_TAG_RE.sub(r'<\1div>', content)
        else:
            internal = '/'.join(('/data', entry.internal))
            content = self._litfile.get_file(internal)
        return content

    def _read_meta(self):
        """Decode the ``/meta`` entry into OPF text (without declaration).

        If decoding fails with LitError and the raw data contains
        ``PENGUIN group``, one retry is made with a four-byte prefix
        inserted before the first occurrence of that marker; a warning is
        logged when this workaround is attempted.  Any other LitError is
        re-raised.
        """
        path = 'content.opf'
        raw = self._litfile.get_file('/meta')
        try:
            unbin = UnBinary(raw, path, self._litfile.manifest, OPF_MAP)
        except LitError:
            if b'PENGUIN group' not in raw:
                raise
            # Report through the logger supplied by the caller rather than
            # writing straight to stdout.
            self.log.warn("attempting PENGUIN malformed OPF fix")
            raw = raw.replace(
                b'PENGUIN group', b'\x00\x01\x18\x00PENGUIN group', 1)
            unbin = UnBinary(raw, path, self._litfile.manifest, OPF_MAP)
        return unbin.unicode_representation

    def get_metadata(self):
        """Return the decoded OPF metadata text (same as ``_read_meta``)."""
        return self._read_meta()
940
941
class LitReader(OEBReader):
    """OEBReader variant that reads its input through LitContainer."""

    Container = LitContainer
    DEFAULT_PROFILE = 'MSReader'

    def _spine_from_opf(self, opf):
        """Fix up mis-typed spine items, then build the spine as usual.

        Every spine itemref whose manifest item is declared as
        ``application/xml`` but whose parsed data has an ``/html`` root is
        re-typed to ``application/xhtml+xml`` and re-parsed as XHTML.
        Itemrefs pointing at unknown manifest ids are ignored here.  The
        base class implementation then runs on the same *opf*.
        """
        book_manifest = self.oeb.manifest
        for itemref in xpath(opf, '/o2:package/o2:spine/o2:itemref'):
            ref_id = itemref.get('idref')
            if ref_id in book_manifest.ids:
                entry = book_manifest.ids[ref_id]
                # Checks stay in this order so entry.data is only touched
                # for items declared as generic XML.
                if entry.media_type.lower() != 'application/xml':
                    continue
                if not hasattr(entry.data, 'xpath'):
                    continue
                if not entry.data.xpath('/html'):
                    continue
                entry.media_type = 'application/xhtml+xml'
                entry.data = entry._parse_xhtml(etree.tostring(entry.data))
        super()._spine_from_opf(opf)
958