1import logging
2import re
3import struct
4from hashlib import sha256, md5
5
6from cryptography.hazmat.backends import default_backend
7from cryptography.hazmat.primitives.ciphers import Cipher, algorithms, modes
8
9from . import settings
10from .arcfour import Arcfour
11from .pdfparser import PDFSyntaxError, PDFStreamParser
12from .pdftypes import PDFException, uint_value, PDFTypeError, PDFStream, \
13    PDFObjectNotFound, decipher_all, int_value, str_value, list_value, \
14    dict_value, stream_value
15from .psparser import PSEOF, literal_name, LIT, KWD
16from .utils import choplist, nunpack, decode_text
17
# Module-level logger, named after this module via ``__name__``.
log = logging.getLogger(__name__)
19
20
class PDFNoValidXRef(PDFSyntaxError):
    """Raised when cross-reference data cannot be located or parsed."""
23
24
class PDFNoValidXRefWarning(SyntaxWarning):
    """Warning category derived from SyntaxWarning.

    Nothing in this module emits it; presumably it is issued by callers
    that tolerate missing xref data -- confirm against its users.
    """
27
28
class PDFNoOutlines(PDFException):
    """Raised when the document catalog has no 'Outlines' entry."""
31
32
class PDFDestinationNotFound(PDFException):
    """Raised when a named destination cannot be resolved."""
35
36
class PDFEncryptionError(PDFException):
    """Raised for unsupported or malformed encryption parameters."""
39
40
class PDFPasswordIncorrect(PDFEncryptionError):
    """Raised when no encryption key can be derived from the password."""
43
44
class PDFTextExtractionNotAllowedWarning(UserWarning):
    """Warning category derived from UserWarning.

    Not emitted by this module; presumably used by callers when a document
    forbids text extraction -- confirm against its users.
    """
47
48
class PDFTextExtractionNotAllowed(PDFEncryptionError):
    """Encryption error type for documents that disallow text extraction.

    Not raised by this module itself; presumably raised by callers that
    check ``PDFDocument.is_extractable`` -- confirm against its users.
    """
51
52
class PDFTextExtractionNotAllowedError(PDFTextExtractionNotAllowed):
    """Deprecated subclass of PDFTextExtractionNotAllowed.

    Creating an instance emits a DeprecationWarning and then delegates to
    the parent constructor with the same positional arguments.
    """

    def __init__(self, *args):
        import warnings
        message = ('PDFTextExtractionNotAllowedError will be removed in the future. '
                   'Use PDFTextExtractionNotAllowed instead.')
        warnings.warn(message, DeprecationWarning)
        super().__init__(*args)
59
60
# Predefined name literals for the /Type values checked in this module:
# object streams ('ObjStm'), cross-reference streams ('XRef') and the
# document catalog ('Catalog'). They are compared by identity (``is``)
# further down.
LITERAL_OBJSTM = LIT('ObjStm')
LITERAL_XREF = LIT('XRef')
LITERAL_CATALOG = LIT('Catalog')
65
66
class PDFBaseXRef:
    """Minimal interface implemented by every cross-reference reader."""

    def get_trailer(self):
        """Return the trailer dictionary; concrete readers must override."""
        raise NotImplementedError

    def get_objids(self):
        """Return the known object ids; the base class knows none."""
        return list()

    def get_pos(self, objid):
        """Locate *objid*; the base class never finds anything.

        Implementations must return
            (strmid, index, genno)
         or (None, pos, genno)
        and raise KeyError when the object is unknown.
        """
        raise KeyError(objid)
80
81
class PDFXRef(PDFBaseXRef):
    """Reader for a classic, line-oriented cross-reference table.

    ``offsets`` maps an object id to ``(None, pos, genno)`` for every
    in-use ('n') entry; ``trailer`` collects the trailer dictionary that
    follows the table.
    """

    def __init__(self):
        self.offsets = {}
        self.trailer = {}

    def __repr__(self):
        return '<PDFXRef: offsets=%r>' % (self.offsets.keys())

    def load(self, parser):
        """Read xref subsections from *parser* until the trailer is reached.

        Lines are consumed from the parser's current position. Each
        subsection starts with a "<start> <count>" header followed by
        <count> entries of the form "<pos> <genno> <n|f>". Only in-use
        ('n') entries are recorded. Once a line starting with ``trailer``
        is found the parser is rewound to it and the trailer is loaded.

        :raises PDFNoValidXRef: on EOF or on any malformed header/entry.
        """
        while True:
            try:
                (pos, line) = parser.nextline()
            except PSEOF:
                raise PDFNoValidXRef('Unexpected EOF - file corrupted?')
            stripped = line.strip()
            if not stripped:
                # blank line between subsections
                continue
            # Test the stripped line so that leading whitespace in front
            # of the keyword does not hide the trailer.
            if stripped.startswith(b'trailer'):
                parser.seek(pos)
                break
            f = stripped.split(b' ')
            if len(f) != 2:
                error_msg = 'Trailer not found: {!r}: line={!r}'\
                    .format(parser, line)
                raise PDFNoValidXRef(error_msg)
            try:
                (start, nobjs) = map(int, f)
            except ValueError:
                error_msg = 'Invalid line: {!r}: line={!r}'\
                    .format(parser, line)
                raise PDFNoValidXRef(error_msg)
            for objid in range(start, start+nobjs):
                try:
                    (_, line) = parser.nextline()
                except PSEOF:
                    raise PDFNoValidXRef('Unexpected EOF - file corrupted?')
                f = line.strip().split(b' ')
                if len(f) != 3:
                    error_msg = 'Invalid XRef format: {!r}, line={!r}'\
                        .format(parser, line)
                    raise PDFNoValidXRef(error_msg)
                (entry_pos, entry_genno, use) = f
                if use != b'n':
                    # free entry: nothing to record
                    continue
                try:
                    entry = (None, int(entry_pos), int(entry_genno))
                except ValueError:
                    # Non-numeric offset/generation: report it as a broken
                    # xref so callers catching PDFNoValidXRef can recover.
                    error_msg = 'Invalid XRef format: {!r}, line={!r}'\
                        .format(parser, line)
                    raise PDFNoValidXRef(error_msg)
                self.offsets[objid] = entry
        log.info('xref objects: %r', self.offsets)
        self.load_trailer(parser)

    def load_trailer(self, parser):
        """Parse the ``trailer`` keyword and merge its dictionary.

        If the parser hits EOF while reading the dictionary, whatever
        object is left on the parser stack is used instead.

        :raises PDFNoValidXRef: if EOF is reached and nothing was parsed.
        """
        try:
            (_, kwd) = parser.nexttoken()
            assert kwd is KWD(b'trailer'), str(kwd)
            (_, dic) = parser.nextobject()
        except PSEOF:
            x = parser.pop(1)
            if not x:
                raise PDFNoValidXRef('Unexpected EOF - file corrupted')
            (_, dic) = x[0]
        self.trailer.update(dict_value(dic))
        log.debug('trailer=%r', self.trailer)

    def get_trailer(self):
        """Return the trailer dictionary collected so far."""
        return self.trailer

    def get_objids(self):
        """Return a view of the ids of all in-use objects."""
        return self.offsets.keys()

    def get_pos(self, objid):
        """Return ``(None, pos, genno)`` for *objid*.

        :raises KeyError: if the object is not listed in this table.
        """
        return self.offsets[objid]
159
160
class PDFXRefFallback(PDFXRef):
    """Xref rebuilt by scanning the file for "<objid> <genno> obj" lines."""

    def __repr__(self):
        return '<PDFXRefFallback: offsets=%r>' % (self.offsets.keys())

    PDFOBJ_CUE = re.compile(r'^(\d+)\s+(\d+)\s+obj\b')

    def load(self, parser):
        """Scan *parser* from offset 0 and record every object header found.

        Scanning stops at EOF or at the first line starting with
        ``trailer`` (which is then loaded). Objects that turn out to be
        object streams are expanded so that their members are registered
        as ``(stream objid, index, 0)``.
        """
        parser.seek(0)
        while True:
            try:
                (pos, raw_line) = parser.nextline()
            except PSEOF:
                break
            if raw_line.startswith(b'trailer'):
                parser.seek(pos)
                self.load_trailer(parser)
                log.info('trailer: %r', self.trailer)
                break
            # latin-1 is the default pdf encoding
            cue = self.PDFOBJ_CUE.match(raw_line.decode('latin-1'))
            if cue is None:
                continue
            objid = int(cue.group(1))
            genno = int(cue.group(2))
            self.offsets[objid] = (None, pos, genno)
            # Re-read the object itself to find out whether it is an ObjStm.
            parser.seek(pos)
            (_, candidate) = parser.nextobject()
            if not isinstance(candidate, PDFStream):
                continue
            if candidate.get('Type') is not LITERAL_OBJSTM:
                continue
            stream = stream_value(candidate)
            try:
                n = stream['N']
            except KeyError:
                if settings.STRICT:
                    raise PDFSyntaxError('N is not defined: %r' % stream)
                n = 0
            inner_parser = PDFStreamParser(stream.get_data())
            members = []
            try:
                while True:
                    (_, member) = inner_parser.nextobject()
                    members.append(member)
            except PSEOF:
                pass
            # Only the even-indexed parsed items are read as member ids.
            n = min(n, len(members)//2)
            for index in range(n):
                self.offsets[members[index*2]] = (objid, index, 0)
        return
213
214
class PDFXRefStream(PDFBaseXRef):
    """Reader for a cross-reference stream (an object of /Type /XRef).

    ``ranges`` holds the ``(start, count)`` pairs taken from /Index,
    ``fl1``/``fl2``/``fl3`` the three field widths taken from /W,
    ``entlen`` their sum and ``data`` the decoded stream bytes. Entries of
    all ranges are stored back to back in ``data``.
    """

    def __init__(self):
        self.data = None
        self.entlen = None
        self.fl1 = self.fl2 = self.fl3 = None
        self.ranges = []
        # Defined up front so get_trailer() works before load().
        self.trailer = {}

    def __repr__(self):
        return '<PDFXRefStream: ranges=%r>' % (self.ranges)

    def load(self, parser):
        """Read the xref stream object at the parser's current position.

        :raises PDFNoValidXRef: if the object is not a stream whose /Type
            is /XRef (including a stream without any /Type).
        :raises PDFSyntaxError: if /Index has an odd number of items.
        """
        (_, objid) = parser.nexttoken()  # ignored
        (_, genno) = parser.nexttoken()  # ignored
        (_, kwd) = parser.nexttoken()
        (_, stream) = parser.nextobject()
        # .get() so that a missing /Type is reported as an invalid xref
        # rather than leaking a KeyError.
        if not isinstance(stream, PDFStream) \
                or stream.get('Type') is not LITERAL_XREF:
            raise PDFNoValidXRef('Invalid PDF stream spec.')
        size = stream['Size']
        index_array = stream.get('Index', (0, size))
        if len(index_array) % 2 != 0:
            raise PDFSyntaxError('Invalid index number')
        self.ranges.extend(choplist(2, index_array))
        (self.fl1, self.fl2, self.fl3) = stream['W']
        self.data = stream.get_data()
        self.entlen = self.fl1+self.fl2+self.fl3
        self.trailer = stream.attrs
        log.info('xref stream: objid=%s, fields=%d,%d,%d',
                 ', '.join(map(repr, self.ranges)),
                 self.fl1, self.fl2, self.fl3)

    def get_trailer(self):
        """Return the stream's attribute dictionary, used as the trailer."""
        return self.trailer

    def get_objids(self):
        """Yield the ids of all entries whose type field is 1 or 2.

        The entry position is counted across all ranges, exactly as in
        get_pos(), because the entries of consecutive ranges follow each
        other in ``data``.
        """
        index = 0
        for (start, nobjs) in self.ranges:
            for i in range(nobjs):
                offset = self.entlen * (index + i)
                ent = self.data[offset:offset+self.entlen]
                f1 = nunpack(ent[:self.fl1], 1)
                if f1 == 1 or f1 == 2:
                    yield start+i
            index += nobjs

    def get_pos(self, objid):
        """Return the location tuple for *objid*.

        Type 1 entries give ``(None, f2, f3)`` and type 2 entries give
        ``(f2, f3, 0)``, where f2 and f3 are the second and third fields.

        :raises KeyError: if *objid* is outside every range or its entry
            has any other type (a free object).
        """
        index = 0
        for (start, nobjs) in self.ranges:
            if start <= objid and objid < start+nobjs:
                index += objid - start
                break
            else:
                index += nobjs
        else:
            raise KeyError(objid)
        offset = self.entlen * index
        ent = self.data[offset:offset+self.entlen]
        f1 = nunpack(ent[:self.fl1], 1)
        f2 = nunpack(ent[self.fl1:self.fl1+self.fl2])
        f3 = nunpack(ent[self.fl1+self.fl2:])
        if f1 == 1:
            return (None, f2, f3)
        elif f1 == 2:
            return (f2, f3, 0)
        else:
            # this is a free object
            raise KeyError(objid)
284
285
class PDFStandardSecurityHandler:
    """Password handling and RC4 decryption for revisions 2 and 3.

    The constructor reads the encryption dictionary (``param``) and
    derives ``self.key`` from the supplied password.
    """

    PASSWORD_PADDING = (b'(\xbfN^Nu\x8aAd\x00NV\xff\xfa\x01\x08'
                        b'..\x00\xb6\xd0h>\x80/\x0c\xa9\xfedSiz')
    supported_revisions = (2, 3)

    def __init__(self, docid, param, password=''):
        self.docid = docid
        self.param = param
        self.password = password
        self.init()

    def init(self):
        """Read the parameters, check the revision and derive the key."""
        self.init_params()
        if self.r in self.supported_revisions:
            self.init_key()
            return
        raise PDFEncryptionError('Unsupported revision: param=%r' % self.param)

    def init_params(self):
        """Copy V, R, P, O, U and Length out of the encryption dictionary."""
        param = self.param
        self.v = int_value(param.get('V', 0))
        self.r = int_value(param['R'])
        self.p = uint_value(param['P'], 32)
        self.o = str_value(param['O'])
        self.u = str_value(param['U'])
        self.length = int_value(param.get('Length', 40))

    def init_key(self):
        """Store the authenticated key or raise PDFPasswordIncorrect."""
        key = self.authenticate(self.password)
        self.key = key
        if key is None:
            raise PDFPasswordIncorrect

    def is_printable(self):
        """Return True when bit value 4 of the permission flags is set."""
        return (self.p & 4) != 0

    def is_modifiable(self):
        """Return True when bit value 8 of the permission flags is set."""
        return (self.p & 8) != 0

    def is_extractable(self):
        """Return True when bit value 16 of the permission flags is set."""
        return (self.p & 16) != 0

    def compute_u(self, key):
        """Compute the U value that *key* would produce."""
        if self.r == 2:
            # Algorithm 3.4
            return Arcfour(key).encrypt(self.PASSWORD_PADDING)  # 2
        # Algorithm 3.5
        hasher = md5(self.PASSWORD_PADDING)  # 2
        hasher.update(self.docid[0])  # 3
        result = Arcfour(key).encrypt(hasher.digest())  # 4
        for i in range(1, 20):  # 5
            round_key = bytes(c ^ i for c in key)
            result = Arcfour(round_key).encrypt(result)
        return result + result  # 6

    def compute_encryption_key(self, password):
        """Derive the encryption key from *password* (bytes)."""
        # Algorithm 3.2
        padded = (password + self.PASSWORD_PADDING)[:32]  # 1
        hasher = md5(padded)  # 2
        hasher.update(self.o)  # 3
        # See https://github.com/pdfminer/pdfminer.six/issues/186
        hasher.update(struct.pack('<L', self.p))  # 4
        hasher.update(self.docid[0])  # 5
        if self.r >= 4 and not self.encrypt_metadata:
            hasher.update(b'\xff\xff\xff\xff')
        result = hasher.digest()
        if self.r < 3:
            return result[:5]
        n = self.length // 8
        for _ in range(50):
            result = md5(result[:n]).digest()
        return result[:n]

    def authenticate(self, password):
        """Try *password* (str) as user password, then as owner password.

        Returns the encryption key, or None if neither attempt succeeds.
        """
        encoded = password.encode("latin1")
        key = self.authenticate_user_password(encoded)
        if key is not None:
            return key
        return self.authenticate_owner_password(encoded)

    def authenticate_user_password(self, password):
        """Return the key derived from *password* if it verifies, else None."""
        key = self.compute_encryption_key(password)
        return key if self.verify_encryption_key(key) else None

    def verify_encryption_key(self, key):
        """Check *key* against the stored U value."""
        # Algorithm 3.6
        computed = self.compute_u(key)
        if self.r != 2:
            return computed[:16] == self.u[:16]
        return computed == self.u

    def authenticate_owner_password(self, password):
        """Recover the user password from O and authenticate with it."""
        # Algorithm 3.7
        digest = md5((password + self.PASSWORD_PADDING)[:32]).digest()
        if self.r >= 3:
            for _ in range(50):
                digest = md5(digest).digest()
            n = self.length // 8
        else:
            n = 5
        key = digest[:n]
        if self.r == 2:
            user_password = Arcfour(key).decrypt(self.o)
        else:
            user_password = self.o
            for i in range(19, -1, -1):
                round_key = bytes(c ^ i for c in key)
                user_password = Arcfour(round_key).decrypt(user_password)
        return self.authenticate_user_password(user_password)

    def decrypt(self, objid, genno, data, attrs=None):
        """Decrypt *data* of object (objid, genno); *attrs* is unused here."""
        return self.decrypt_rc4(objid, genno, data)

    def decrypt_rc4(self, objid, genno, data):
        """RC4-decrypt *data* with the per-object key."""
        salted = b''.join((self.key,
                           struct.pack('<L', objid)[:3],
                           struct.pack('<L', genno)[:2]))
        object_key = md5(salted).digest()[:min(len(salted), 16)]
        return Arcfour(object_key).decrypt(data)
415
416
class PDFStandardSecurityHandlerV4(PDFStandardSecurityHandler):
    """Security handler for revision 4: named crypt filters (RC4/AES-128).

    ``cfm`` maps each crypt filter name found in /CF (plus 'Identity') to
    the bound method that decrypts data for that filter.
    """

    supported_revisions = (4,)

    def init_params(self):
        """Read the crypt filter configuration on top of the base params.

        :raises PDFEncryptionError: if /StmF and /StrF differ, a filter
            uses an unknown method, or /StrF names an undefined filter.
        """
        super().init_params()
        self.length = 128
        self.cf = dict_value(self.param.get('CF'))
        self.stmf = literal_name(self.param['StmF'])
        self.strf = literal_name(self.param['StrF'])
        self.encrypt_metadata = bool(self.param.get('EncryptMetadata', True))
        if self.stmf != self.strf:
            error_msg = 'Unsupported crypt filter: param=%r' % self.param
            raise PDFEncryptionError(error_msg)
        self.cfm = {}
        for k, v in self.cf.items():
            f = self.get_cfm(literal_name(v['CFM']))
            if f is None:
                error_msg = 'Unknown crypt filter method: param=%r' \
                            % self.param
                raise PDFEncryptionError(error_msg)
            self.cfm[k] = f
        self.cfm['Identity'] = self.decrypt_identity
        if self.strf not in self.cfm:
            error_msg = 'Undefined crypt filter: param=%r' % self.param
            raise PDFEncryptionError(error_msg)

    def get_cfm(self, name):
        """Return the decrypt method for crypt filter method *name*.

        Returns None for anything other than 'V2' and 'AESV2'.
        """
        if name == 'V2':
            return self.decrypt_rc4
        elif name == 'AESV2':
            return self.decrypt_aes128
        else:
            return None

    def decrypt(self, objid, genno, data, attrs=None, name=None):
        """Decrypt *data* with crypt filter *name* (default: /StrF).

        When metadata is not encrypted, data whose *attrs* carry
        /Type /Metadata is returned untouched.
        """
        if not self.encrypt_metadata and attrs is not None:
            t = attrs.get('Type')
            if t is not None and literal_name(t) == 'Metadata':
                return data
        if name is None:
            name = self.strf
        return self.cfm[name](objid, genno, data)

    def decrypt_identity(self, objid, genno, data):
        """Return *data* unchanged (the 'Identity' crypt filter)."""
        return data

    @staticmethod
    def _unpad_aes(padded):
        """Strip the trailing block padding left after AES-CBC decryption.

        The pad consists of k bytes of value k with 1 <= k <= 16. Data
        that does not end in such a run is returned unchanged so that
        malformed input is passed through rather than truncated.
        """
        if not padded:
            return padded
        pad = padded[-1]
        if pad < 1 or pad > 16 or pad > len(padded):
            return padded
        if padded[-pad:] != bytes((pad,)) * pad:
            return padded
        return padded[:-pad]

    def decrypt_aes128(self, objid, genno, data):
        """AES-128-CBC decrypt *data* with the per-object key.

        The first 16 bytes of *data* are the initialization vector. The
        block padding is removed from the result.
        """
        key = self.key + struct.pack('<L', objid)[:3] \
            + struct.pack('<L', genno)[:2] + b'sAlT'
        hash = md5(key)
        key = hash.digest()[:min(len(key), 16)]
        initialization_vector = data[:16]
        ciphertext = data[16:]
        cipher = Cipher(algorithms.AES(key),
                        modes.CBC(initialization_vector),
                        backend=default_backend())
        # finalize() is deliberately not called: it would raise on input
        # that is not a whole number of blocks.
        return self._unpad_aes(cipher.decryptor().update(ciphertext))
476
477
class PDFStandardSecurityHandlerV5(PDFStandardSecurityHandlerV4):
    """Security handler for revision 5: SHA-256 passwords and AES-256."""

    supported_revisions = (5,)

    def init_params(self):
        """Read /OE and /UE and split /O and /U into hash and salts.

        Bytes 0-31 of each value are the hash, bytes 32-39 the validation
        salt and the remainder the key salt.
        """
        super().init_params()
        self.length = 256
        self.oe = str_value(self.param['OE'])
        self.ue = str_value(self.param['UE'])
        self.o_hash = self.o[:32]
        self.o_validation_salt = self.o[32:40]
        self.o_key_salt = self.o[40:]
        self.u_hash = self.u[:32]
        self.u_validation_salt = self.u[32:40]
        self.u_key_salt = self.u[40:]

    def get_cfm(self, name):
        """Return the decrypt method for 'AESV3'; None for anything else."""
        if name == 'AESV3':
            return self.decrypt_aes256
        else:
            return None

    def authenticate(self, password):
        """Return the file encryption key for *password*, or None.

        The password is checked as owner password first (against /O),
        then as user password (against /U). On a match the key is
        obtained by AES-CBC decrypting /OE or /UE with a zero IV.
        """
        password = password.encode('utf-8')[:127]
        hash = sha256(password)
        hash.update(self.o_validation_salt)
        hash.update(self.u)
        if hash.digest() == self.o_hash:
            hash = sha256(password)
            hash.update(self.o_key_salt)
            hash.update(self.u)
            cipher = Cipher(algorithms.AES(hash.digest()),
                            modes.CBC(b'\0' * 16),
                            backend=default_backend())
            return cipher.decryptor().update(self.oe)
        hash = sha256(password)
        hash.update(self.u_validation_salt)
        if hash.digest() == self.u_hash:
            hash = sha256(password)
            hash.update(self.u_key_salt)
            cipher = Cipher(algorithms.AES(hash.digest()),
                            modes.CBC(b'\0' * 16),
                            backend=default_backend())
            return cipher.decryptor().update(self.ue)
        return None

    @staticmethod
    def _unpad_aes(padded):
        """Strip the trailing block padding left after AES-CBC decryption.

        The pad consists of k bytes of value k with 1 <= k <= 16. Data
        that does not end in such a run is returned unchanged so that
        malformed input is passed through rather than truncated.
        """
        if not padded:
            return padded
        pad = padded[-1]
        if pad < 1 or pad > 16 or pad > len(padded):
            return padded
        if padded[-pad:] != bytes((pad,)) * pad:
            return padded
        return padded[:-pad]

    def decrypt_aes256(self, objid, genno, data):
        """AES-256-CBC decrypt *data* with the file encryption key.

        The first 16 bytes of *data* are the initialization vector;
        *objid* and *genno* are not used. The block padding is removed
        from the result.
        """
        initialization_vector = data[:16]
        ciphertext = data[16:]
        cipher = Cipher(algorithms.AES(self.key),
                        modes.CBC(initialization_vector),
                        backend=default_backend())
        # finalize() is deliberately not called: it would raise on input
        # that is not a whole number of blocks.
        return self._unpad_aes(cipher.decryptor().update(ciphertext))
532
533
class PDFDocument:
    """PDFDocument object represents a PDF document.

    Since a PDF file can be very big, normally it is not loaded at
    once. So PDF document has to cooperate with a PDF parser in order to
    dynamically import the data as processing goes.

    Typical usage:
      doc = PDFDocument(parser, password)
      obj = doc.getobj(objid)

    """

    # Maps the /V entry of the encryption dictionary to its handler class.
    security_handler_registry = {
        1: PDFStandardSecurityHandler,
        2: PDFStandardSecurityHandler,
        4: PDFStandardSecurityHandlerV4,
        5: PDFStandardSecurityHandlerV5,
    }

    def __init__(self, parser, password='', caching=True, fallback=True):
        """Set the document to use a given PDFParser object.

        :param parser: parser to read from; it is told about this document
            through ``set_document``.
        :param password: password used if the trailer has an /Encrypt entry.
        :param caching: keep resolved objects and parsed object streams.
        :param fallback: when no valid xref can be read, rebuild one by
            scanning the whole file.
        :raises PDFSyntaxError: if no trailer provides a /Root entry, or
            (in strict mode) the root is not a /Catalog.
        """
        self.caching = caching
        self.xrefs = []
        self.info = []
        self.catalog = None
        self.encryption = None
        self.decipher = None
        self._cached_objs = {}
        self._parsed_objs = {}
        self._parser = parser
        self._parser.set_document(self)
        self.is_printable = self.is_modifiable = self.is_extractable = True
        # Retrieve the information of each header that was appended
        # (maybe multiple times) at the end of the document.
        try:
            pos = self.find_xref(parser)
            self.read_xref_from(parser, pos, self.xrefs)
        except PDFNoValidXRef:
            # Only scan the whole file when the regular xref data could
            # not be read; a valid xref makes the scan unnecessary.
            if fallback:
                parser.fallback = True
                xref = PDFXRefFallback()
                xref.load(parser)
                self.xrefs.append(xref)
        for xref in self.xrefs:
            trailer = xref.get_trailer()
            if not trailer:
                continue
            # If there's an encryption info, remember it.
            if 'Encrypt' in trailer:
                if 'ID' in trailer:
                    docid = list_value(trailer['ID'])
                else:
                    # No /ID in the trailer: use two empty byte strings
                    # so the handlers can still read docid[0].
                    docid = [b'', b'']
                self.encryption = (docid, dict_value(trailer['Encrypt']))
                self._initialize_password(password)
            if 'Info' in trailer:
                self.info.append(dict_value(trailer['Info']))
            if 'Root' in trailer:
                # Every PDF file must have exactly one /Root dictionary.
                self.catalog = dict_value(trailer['Root'])
                break
        else:
            raise PDFSyntaxError('No /Root object! - Is this really a PDF?')
        if self.catalog.get('Type') is not LITERAL_CATALOG:
            if settings.STRICT:
                raise PDFSyntaxError('Catalog not found!')

    KEYWORD_OBJ = KWD(b'obj')

    def _initialize_password(self, password=''):
        """Perform the initialization with a given password.

        Builds the security handler selected by the /V entry of the
        encryption dictionary, installs its ``decrypt`` as ``decipher``
        and copies its permission flags.

        :raises PDFEncryptionError: if /Filter is not 'Standard' or /V has
            no registered handler (handlers may raise it as well).
        """
        (docid, param) = self.encryption
        if literal_name(param.get('Filter')) != 'Standard':
            raise PDFEncryptionError('Unknown filter: param=%r' % param)
        v = int_value(param.get('V', 0))
        factory = self.security_handler_registry.get(v)
        if factory is None:
            raise PDFEncryptionError('Unknown algorithm: param=%r' % param)
        handler = factory(docid, param, password)
        self.decipher = handler.decrypt
        self.is_printable = handler.is_printable()
        self.is_modifiable = handler.is_modifiable()
        self.is_extractable = handler.is_extractable()
        self._parser.fallback = False  # need to read streams with exact length

    def _getobj_objstm(self, stream, index, objid):
        """Return member number *index* of object stream *stream*.

        The parsed content of a stream is cached per stream objid when
        caching is enabled. *objid* itself is not used for the lookup.

        :raises PDFSyntaxError: if the stream holds too few objects.
        """
        if stream.objid in self._parsed_objs:
            (objs, n) = self._parsed_objs[stream.objid]
        else:
            (objs, n) = self._get_objects(stream)
            if self.caching:
                self._parsed_objs[stream.objid] = (objs, n)
        # The first n*2 parsed items are skipped; the members follow them.
        i = n*2+index
        try:
            obj = objs[i]
        except IndexError:
            raise PDFSyntaxError('index too big: %r' % index)
        return obj

    def _get_objects(self, stream):
        """Parse every object in *stream* and return ``(objs, n)``.

        ``n`` is the stream's /N entry (0 when missing in non-strict
        mode). Parsing stops when the stream data is exhausted.
        """
        if stream.get('Type') is not LITERAL_OBJSTM:
            if settings.STRICT:
                raise PDFSyntaxError('Not a stream object: %r' % stream)
        try:
            n = stream['N']
        except KeyError:
            if settings.STRICT:
                raise PDFSyntaxError('N is not defined: %r' % stream)
            n = 0
        parser = PDFStreamParser(stream.get_data())
        parser.set_document(self)
        objs = []
        try:
            while 1:
                (_, obj) = parser.nextobject()
                objs.append(obj)
        except PSEOF:
            pass
        return (objs, n)

    def _getobj_parse(self, pos, objid):
        """Parse the object that starts at file offset *pos*.

        :raises PDFSyntaxError: if the header there does not belong to
            *objid* or is not followed by the ``obj`` keyword.
        """
        self._parser.seek(pos)
        (_, objid1) = self._parser.nexttoken()  # objid
        (_, genno) = self._parser.nexttoken()  # genno
        (_, kwd) = self._parser.nexttoken()
        # hack around malformed pdf files
        # copied from https://github.com/jaepil/pdfminer3k/blob/master/
        # pdfminer/pdfparser.py#L399
        # to solve https://github.com/pdfminer/pdfminer.six/issues/56
        # assert objid1 == objid, str((objid1, objid))
        if objid1 != objid:
            # Skip ahead to the next 'obj' keyword and take the token two
            # places before it as the object id.
            x = []
            while kwd is not self.KEYWORD_OBJ:
                (_, kwd) = self._parser.nexttoken()
                x.append(kwd)
            if len(x) >= 2:
                objid1 = x[-2]
        # #### end hack around malformed pdf files
        if objid1 != objid:
            raise PDFSyntaxError('objid mismatch: {!r}={!r}'
                                 .format(objid1, objid))

        if kwd != KWD(b'obj'):
            raise PDFSyntaxError('Invalid object spec: offset=%r' % pos)
        (_, obj) = self._parser.nextobject()
        return obj

    # can raise PDFObjectNotFound
    def getobj(self, objid):
        """Get object from PDF

        The xrefs are tried in order. An xref that does not know the
        object, or whose location fails to parse, is skipped. Objects
        parsed directly from the file are deciphered when a decipher is
        installed; members of object streams are returned as parsed.

        :raises PDFException if PDFDocument is not initialized
        :raises PDFObjectNotFound if objid does not exist in PDF
        """
        if not self.xrefs:
            raise PDFException('PDFDocument is not initialized')
        log.debug('getobj: objid=%r', objid)
        if objid in self._cached_objs:
            (obj, genno) = self._cached_objs[objid]
        else:
            for xref in self.xrefs:
                try:
                    (strmid, index, genno) = xref.get_pos(objid)
                except KeyError:
                    continue
                try:
                    if strmid is not None:
                        stream = stream_value(self.getobj(strmid))
                        obj = self._getobj_objstm(stream, index, objid)
                    else:
                        obj = self._getobj_parse(index, objid)
                        if self.decipher:
                            obj = decipher_all(self.decipher, objid, genno,
                                               obj)

                    if isinstance(obj, PDFStream):
                        obj.set_objid(objid, genno)
                    break
                except (PSEOF, PDFSyntaxError):
                    continue
            else:
                raise PDFObjectNotFound(objid)
            log.debug('register: objid=%r: %r', objid, obj)
            if self.caching:
                self._cached_objs[objid] = (obj, genno)
        return obj

    def get_outlines(self):
        """Return a generator over the document outline entries.

        Each item is ``(level, title, dest, action, se)``. Only entries
        that have a /Title and an /A or /Dest are yielded.

        :raises PDFNoOutlines: if the catalog has no 'Outlines' entry.
        """
        if 'Outlines' not in self.catalog:
            raise PDFNoOutlines

        def search(entry, level):
            entry = dict_value(entry)
            if 'Title' in entry:
                if 'A' in entry or 'Dest' in entry:
                    title = decode_text(str_value(entry['Title']))
                    dest = entry.get('Dest')
                    action = entry.get('A')
                    se = entry.get('SE')
                    yield (level, title, dest, action, se)
            # children first, one level deeper, then the next sibling
            if 'First' in entry and 'Last' in entry:
                yield from search(entry['First'], level+1)
            if 'Next' in entry:
                yield from search(entry['Next'], level)
            return
        return search(self.catalog['Outlines'], 0)

    def lookup_name(self, cat, key):
        """Look up *key* in the name tree *cat* of the catalog's /Names.

        :raises KeyError: if /Names or the category is missing, or the
            key is not found.
        """
        try:
            names = dict_value(self.catalog['Names'])
        except (PDFTypeError, KeyError):
            raise KeyError((cat, key))
        # may raise KeyError
        d0 = dict_value(names[cat])

        def lookup(d):
            # A node whose /Limits exclude the key is skipped (None).
            if 'Limits' in d:
                (k1, k2) = list_value(d['Limits'])
                if key < k1 or k2 < key:
                    return None
            if 'Names' in d:
                objs = list_value(d['Names'])
                names = dict(choplist(2, objs))
                return names[key]
            if 'Kids' in d:
                for c in list_value(d['Kids']):
                    v = lookup(dict_value(c))
                    if v:
                        return v
            raise KeyError((cat, key))
        return lookup(d0)

    def get_dest(self, name):
        """Resolve the named destination *name*.

        The 'Dests' name tree is tried first, then the catalog's /Dests
        dictionary.

        :raises PDFDestinationNotFound: if neither knows *name*.
        """
        try:
            # PDF-1.2 or later
            obj = self.lookup_name('Dests', name)
        except KeyError:
            # PDF-1.1 or prior
            if 'Dests' not in self.catalog:
                raise PDFDestinationNotFound(name)
            d0 = dict_value(self.catalog['Dests'])
            if name not in d0:
                raise PDFDestinationNotFound(name)
            obj = d0[name]
        return obj

    # find_xref
    def find_xref(self, parser):
        """Internal function used to locate the first XRef.

        Returns the integer found on the last non-empty line that follows
        the final ``startxref`` line of the file.

        :raises PDFNoValidXRef: if there is no ``startxref`` line or the
            line after it is missing or not a number.
        """
        # search the last xref table by scanning the file backwards.
        prev = None
        for line in parser.revreadlines():
            line = line.strip()
            log.debug('find_xref: %r', line)
            if line == b'startxref':
                break
            if line:
                prev = line
        else:
            raise PDFNoValidXRef('Unexpected EOF')
        log.info('xref found: pos=%r', prev)
        try:
            return int(prev)
        except (TypeError, ValueError):
            # prev is None (nothing after startxref) or not numeric;
            # report it as a broken xref so the fallback can take over.
            raise PDFNoValidXRef('Invalid xref position: %r' % (prev,))

    # read xref table
    def read_xref_from(self, parser, start, xrefs):
        """Reads XRefs from the given location.

        The xref at offset *start* is appended to *xrefs*; the offsets in
        its trailer's /XRefStm and /Prev entries are then read
        recursively, in that order. An integer as first token selects a
        cross-reference stream, anything else a classic table.

        :raises PDFNoValidXRef: on EOF at *start* (the loaders may raise
            it as well).
        """
        parser.seek(start)
        parser.reset()
        try:
            (pos, token) = parser.nexttoken()
        except PSEOF:
            raise PDFNoValidXRef('Unexpected EOF')
        log.info('read_xref_from: start=%d, token=%r', start, token)
        if isinstance(token, int):
            # XRefStream: PDF-1.5
            parser.seek(pos)
            parser.reset()
            xref = PDFXRefStream()
            xref.load(parser)
        else:
            if token is parser.KEYWORD_XREF:
                parser.nextline()
            xref = PDFXRef()
            xref.load(parser)
        xrefs.append(xref)
        trailer = xref.get_trailer()
        log.info('trailer: %r', trailer)
        if 'XRefStm' in trailer:
            pos = int_value(trailer['XRefStm'])
            self.read_xref_from(parser, pos, xrefs)
        if 'Prev' in trailer:
            # find previous xref
            pos = int_value(trailer['Prev'])
            self.read_xref_from(parser, pos, xrefs)
832