import logging
import re
import struct
from hashlib import sha256, md5

from cryptography.hazmat.backends import default_backend
from cryptography.hazmat.primitives.ciphers import Cipher, algorithms, modes

from . import settings
from .arcfour import Arcfour
from .pdfparser import PDFSyntaxError, PDFStreamParser
from .pdftypes import PDFException, uint_value, PDFTypeError, PDFStream, \
    PDFObjectNotFound, decipher_all, int_value, str_value, list_value, \
    dict_value, stream_value
from .psparser import PSEOF, literal_name, LIT, KWD
from .utils import choplist, nunpack, decode_text

log = logging.getLogger(__name__)


class PDFNoValidXRef(PDFSyntaxError):
    """Raised when no usable cross-reference table or stream can be read."""


class PDFNoValidXRefWarning(SyntaxWarning):
    """Warning category for invalid cross-reference data.

    Not used by the code in this file.
    """


class PDFNoOutlines(PDFException):
    """Raised by PDFDocument.get_outlines when the catalog has no outlines."""


class PDFDestinationNotFound(PDFException):
    """Raised by PDFDocument.get_dest when a named destination is unknown."""


class PDFEncryptionError(PDFException):
    """Raised when the encryption dictionary cannot be handled."""


class PDFPasswordIncorrect(PDFEncryptionError):
    """Raised when the supplied password does not authenticate."""


class PDFTextExtractionNotAllowedWarning(UserWarning):
    """Warning category for documents that disallow text extraction.

    Not used by the code in this file.
    """


class PDFTextExtractionNotAllowed(PDFEncryptionError):
    """Error type for documents that disallow text extraction."""


class PDFTextExtractionNotAllowedError(PDFTextExtractionNotAllowed):
    """Deprecated alias of PDFTextExtractionNotAllowed.

    Instantiating it emits a DeprecationWarning.
    """

    def __init__(self, *args):
        from warnings import warn
        # stacklevel=2 attributes the warning to the code that instantiates
        # the exception instead of to this module, so the default warning
        # filters can actually show it to the caller.
        warn('PDFTextExtractionNotAllowedError will be removed in the future. '
             'Use PDFTextExtractionNotAllowed instead.', DeprecationWarning,
             stacklevel=2)
        super().__init__(*args)


# some predefined literals and keywords.
LITERAL_OBJSTM = LIT('ObjStm')
LITERAL_XREF = LIT('XRef')
LITERAL_CATALOG = LIT('Catalog')


class PDFBaseXRef:
    """Interface shared by the cross-reference implementations below."""

    def get_trailer(self):
        """Return the trailer dictionary; subclasses must override."""
        raise NotImplementedError

    def get_objids(self):
        """Return an iterable of the object ids known to this xref."""
        return []

    # Must return
    #     (strmid, index, genno)
    #     or (None, pos, genno)
    def get_pos(self, objid):
        """Locate ``objid``; raises KeyError when it is not listed."""
        raise KeyError(objid)


class PDFXRef(PDFBaseXRef):
    """Cross-reference table read from a textual ``xref`` section."""

    def __init__(self):
        # objid -> (None, file offset, generation number)
        self.offsets = {}
        self.trailer = {}

    def __repr__(self):
        return '<PDFXRef: offsets=%r>' % (self.offsets.keys())

    def load(self, parser):
        """Read xref subsections from ``parser`` up to the trailer.

        Each subsection starts with a ``start count`` header line followed
        by ``count`` entry lines of the form ``pos genno use``; only entries
        whose ``use`` field is ``n`` are recorded.

        :raises PDFNoValidXRef: on EOF or on a malformed header/entry line.
        """
        while True:
            try:
                (pos, line) = parser.nextline()
                if not line.strip():
                    continue
            except PSEOF:
                raise PDFNoValidXRef('Unexpected EOF - file corrupted?')
            if not line:
                raise PDFNoValidXRef('Premature eof: %r' % parser)
            if line.startswith(b'trailer'):
                # Rewind so load_trailer() sees the keyword itself.
                parser.seek(pos)
                break
            fields = line.strip().split(b' ')
            if len(fields) != 2:
                error_msg = 'Trailer not found: {!r}: line={!r}'\
                    .format(parser, line)
                raise PDFNoValidXRef(error_msg)
            try:
                (start, nobjs) = map(int, fields)
            except ValueError:
                error_msg = 'Invalid line: {!r}: line={!r}'\
                    .format(parser, line)
                raise PDFNoValidXRef(error_msg)
            for objid in range(start, start+nobjs):
                try:
                    (_, line) = parser.nextline()
                except PSEOF:
                    raise PDFNoValidXRef('Unexpected EOF - file corrupted?')
                fields = line.strip().split(b' ')
                if len(fields) != 3:
                    error_msg = 'Invalid XRef format: {!r}, line={!r}'\
                        .format(parser, line)
                    raise PDFNoValidXRef(error_msg)
                (pos, genno, use) = fields
                if use != b'n':
                    continue
                self.offsets[objid] = (None, int(pos), int(genno))
        log.info('xref objects: %r', self.offsets)
        self.load_trailer(parser)

    def load_trailer(self, parser):
        """Parse the ``trailer`` keyword and merge its dictionary.

        :raises PDFNoValidXRef: if EOF is hit and the parser has no pending
            object to use as the trailer dictionary.
        """
        try:
            (_, kwd) = parser.nexttoken()
            assert kwd is KWD(b'trailer'), str(kwd)
            (_, dic) = parser.nextobject()
        except PSEOF:
            # On EOF, take whatever object the parser had already collected.
            x = parser.pop(1)
            if not x:
                raise PDFNoValidXRef('Unexpected EOF - file corrupted')
            (_, dic) = x[0]
        self.trailer.update(dict_value(dic))
        log.debug('trailer=%r', self.trailer)

    def get_trailer(self):
        """Return the trailer dictionary collected by load_trailer()."""
        return self.trailer

    def get_objids(self):
        """Return the object ids that have a recorded position."""
        return self.offsets.keys()

    def get_pos(self, objid):
        """Return ``(strmid, index_or_pos, genno)``; KeyError if unknown."""
        return self.offsets[objid]


class PDFXRefFallback(PDFXRef):
    """Xref rebuilt by scanning the whole file for ``N G obj`` markers."""

    def __repr__(self):
        return '<PDFXRefFallback: offsets=%r>' % (self.offsets.keys())

    PDFOBJ_CUE = re.compile(r'^(\d+)\s+(\d+)\s+obj\b')

    def load(self, parser):
        """Scan ``parser`` from offset 0 and record every object found.

        Scanning stops at EOF or at the first line starting with
        ``trailer`` (whose dictionary is then loaded).  Objects that are
        object streams are expanded so their members are registered too.
        """
        parser.seek(0)
        while True:
            try:
                (pos, line) = parser.nextline()
            except PSEOF:
                break
            if line.startswith(b'trailer'):
                parser.seek(pos)
                self.load_trailer(parser)
                log.info('trailer: %r', self.trailer)
                break
            line = line.decode('latin-1')  # default pdf encoding
            m = self.PDFOBJ_CUE.match(line)
            if not m:
                continue
            (objid, genno) = m.groups()
            objid = int(objid)
            genno = int(genno)
            self.offsets[objid] = (None, pos, genno)
            # expand ObjStm.
            parser.seek(pos)
            (_, obj) = parser.nextobject()
            if isinstance(obj, PDFStream) \
                    and obj.get('Type') is LITERAL_OBJSTM:
                stream = stream_value(obj)
                try:
                    n = stream['N']
                except KeyError:
                    if settings.STRICT:
                        raise PDFSyntaxError('N is not defined: %r' % stream)
                    n = 0
                parser1 = PDFStreamParser(stream.get_data())
                objs = []
                try:
                    while True:
                        (_, obj) = parser1.nextobject()
                        objs.append(obj)
                except PSEOF:
                    pass
                # The parsed list is read in pairs, with the member object
                # id at every even position; never trust N beyond what was
                # actually parsed.
                n = min(n, len(objs)//2)
                for index in range(n):
                    objid1 = objs[index*2]
                    self.offsets[objid1] = (objid, index, 0)


class PDFXRefStream(PDFBaseXRef):
    """Cross-reference information stored in an XRef stream object."""

    def __init__(self):
        self.data = None
        self.entlen = None
        self.fl1 = self.fl2 = self.fl3 = None
        self.ranges = []
        # Set before load() so get_trailer() never raises AttributeError.
        self.trailer = {}

    def __repr__(self):
        return '<PDFXRefStream: ranges=%r>' % (self.ranges)

    def load(self, parser):
        """Read an XRef stream object from ``parser``.

        :raises PDFNoValidXRef: if the object is not a stream whose
            ``Type`` is ``XRef``.
        :raises PDFSyntaxError: if ``Index`` has an odd number of items.
        """
        (_, objid) = parser.nexttoken()  # ignored
        (_, genno) = parser.nexttoken()  # ignored
        (_, kwd) = parser.nexttoken()
        (_, stream) = parser.nextobject()
        # .get() instead of [] so a stream without /Type is reported as an
        # invalid xref rather than escaping as a KeyError.
        if not isinstance(stream, PDFStream) \
                or stream.get('Type') is not LITERAL_XREF:
            raise PDFNoValidXRef('Invalid PDF stream spec.')
        size = stream['Size']
        index_array = stream.get('Index', (0, size))
        if len(index_array) % 2 != 0:
            raise PDFSyntaxError('Invalid index number')
        self.ranges.extend(choplist(2, index_array))
        (self.fl1, self.fl2, self.fl3) = stream['W']
        self.data = stream.get_data()
        self.entlen = self.fl1+self.fl2+self.fl3
        self.trailer = stream.attrs
        log.info('xref stream: objid=%s, fields=%d,%d,%d',
                 ', '.join(map(repr, self.ranges)),
                 self.fl1, self.fl2, self.fl3)

    def get_trailer(self):
        """Return the stream attributes, which double as the trailer."""
        return self.trailer

    def get_objids(self):
        """Yield the ids of all entries whose type field is 1 or 2."""
        # Entries of all subsections are stored back to back in the stream
        # data, so the entry index must keep counting across ranges (this
        # mirrors the index computation in get_pos()).
        index = 0
        for (start, nobjs) in self.ranges:
            for i in range(nobjs):
                offset = self.entlen * (index + i)
                ent = self.data[offset:offset+self.entlen]
                f1 = nunpack(ent[:self.fl1], 1)
                if f1 == 1 or f1 == 2:
                    yield start+i
            index += nobjs

    def get_pos(self, objid):
        """Return the location tuple for ``objid``.

        :returns: ``(None, f2, f3)`` for type-1 entries and ``(f2, f3, 0)``
            for type-2 entries, where f2/f3 are the second and third
            fields of the entry.
        :raises KeyError: if ``objid`` is outside every range or its entry
            has any other type.
        """
        index = 0
        for (start, nobjs) in self.ranges:
            if start <= objid < start+nobjs:
                index += objid - start
                break
            index += nobjs
        else:
            raise KeyError(objid)
        offset = self.entlen * index
        ent = self.data[offset:offset+self.entlen]
        f1 = nunpack(ent[:self.fl1], 1)
        f2 = nunpack(ent[self.fl1:self.fl1+self.fl2])
        f3 = nunpack(ent[self.fl1+self.fl2:])
        if f1 == 1:
            return (None, f2, f3)
        elif f1 == 2:
            return (f2, f3, 0)
        else:
            # this is a free object
            raise KeyError(objid)


class PDFStandardSecurityHandler:
    """Password check and RC4 decryption for revisions 2 and 3."""

    PASSWORD_PADDING = (b'(\xbfN^Nu\x8aAd\x00NV\xff\xfa\x01\x08'
                        b'..\x00\xb6\xd0h>\x80/\x0c\xa9\xfedSiz')
    supported_revisions = (2, 3)

    def __init__(self, docid, param, password=''):
        """Store the inputs and authenticate immediately.

        :param docid: document id sequence; only ``docid[0]`` is read.
        :param param: the encryption dictionary.
        :param password: password as ``str``.
        :raises PDFEncryptionError: for an unsupported revision.
        :raises PDFPasswordIncorrect: if authentication fails.
        """
        self.docid = docid
        self.param = param
        self.password = password
        self.init()

    def init(self):
        """Read the parameters, check the revision and derive the key."""
        self.init_params()
        if self.r not in self.supported_revisions:
            error_msg = 'Unsupported revision: param=%r' % self.param
            raise PDFEncryptionError(error_msg)
        self.init_key()

    def init_params(self):
        """Extract V, R, P, O, U and Length from the encryption dict."""
        self.v = int_value(self.param.get('V', 0))
        self.r = int_value(self.param['R'])
        self.p = uint_value(self.param['P'], 32)
        self.o = str_value(self.param['O'])
        self.u = str_value(self.param['U'])
        self.length = int_value(self.param.get('Length', 40))

    def init_key(self):
        """Authenticate with the stored password and keep the key."""
        self.key = self.authenticate(self.password)
        if self.key is None:
            raise PDFPasswordIncorrect

    def is_printable(self):
        """Return True if bit value 4 of the P flags is set."""
        return bool(self.p & 4)

    def is_modifiable(self):
        """Return True if bit value 8 of the P flags is set."""
        return bool(self.p & 8)

    def is_extractable(self):
        """Return True if bit value 16 of the P flags is set."""
        return bool(self.p & 16)

    def compute_u(self, key):
        """Compute the U value that ``key`` would produce."""
        if self.r == 2:
            # Algorithm 3.4
            return Arcfour(key).encrypt(self.PASSWORD_PADDING)  # 2
        else:
            # Algorithm 3.5
            digest = md5(self.PASSWORD_PADDING)  # 2
            digest.update(self.docid[0])  # 3
            result = Arcfour(key).encrypt(digest.digest())  # 4
            for i in range(1, 20):  # 5
                k = bytes(c ^ i for c in key)
                result = Arcfour(k).encrypt(result)
            result += result  # 6
            return result

    def compute_encryption_key(self, password):
        """Derive the encryption key from a password given as bytes."""
        # Algorithm 3.2
        password = (password + self.PASSWORD_PADDING)[:32]  # 1
        digest = md5(password)  # 2
        digest.update(self.o)  # 3
        # See https://github.com/pdfminer/pdfminer.six/issues/186
        digest.update(struct.pack('<L', self.p))  # 4
        digest.update(self.docid[0])  # 5
        if self.r >= 4:
            # encrypt_metadata is only defined by the V4+ handlers, which
            # are also the only ones that accept revision >= 4.
            if not self.encrypt_metadata:
                digest.update(b'\xff\xff\xff\xff')
        result = digest.digest()
        n = 5
        if self.r >= 3:
            n = self.length // 8
            for _ in range(50):
                result = md5(result[:n]).digest()
        return result[:n]

    def authenticate(self, password):
        """Return the key for ``password`` or None if it is rejected.

        The password is tried as the user password first, then as the
        owner password.
        """
        password = password.encode("latin1")
        key = self.authenticate_user_password(password)
        if key is None:
            key = self.authenticate_owner_password(password)
        return key

    def authenticate_user_password(self, password):
        """Return the derived key if it verifies against U, else None."""
        key = self.compute_encryption_key(password)
        if self.verify_encryption_key(key):
            return key
        else:
            return None

    def verify_encryption_key(self, key):
        """Check ``key`` against the stored U value."""
        # Algorithm 3.6
        u = self.compute_u(key)
        if self.r == 2:
            return u == self.u
        return u[:16] == self.u[:16]

    def authenticate_owner_password(self, password):
        """Recover the user password from O and authenticate with it."""
        # Algorithm 3.7
        password = (password + self.PASSWORD_PADDING)[:32]
        digest = md5(password)
        if self.r >= 3:
            for _ in range(50):
                digest = md5(digest.digest())
        n = 5
        if self.r >= 3:
            n = self.length // 8
        key = digest.digest()[:n]
        if self.r == 2:
            user_password = Arcfour(key).decrypt(self.o)
        else:
            user_password = self.o
            for i in range(19, -1, -1):
                k = bytes(c ^ i for c in key)
                user_password = Arcfour(k).decrypt(user_password)
        return self.authenticate_user_password(user_password)

    def decrypt(self, objid, genno, data, attrs=None):
        """Decrypt ``data`` belonging to object ``objid``/``genno``."""
        return self.decrypt_rc4(objid, genno, data)

    def decrypt_rc4(self, objid, genno, data):
        """RC4-decrypt ``data`` with the per-object key."""
        key = self.key + struct.pack('<L', objid)[:3] \
            + struct.pack('<L', genno)[:2]
        digest = md5(key)
        key = digest.digest()[:min(len(key), 16)]
        return Arcfour(key).decrypt(data)


class PDFStandardSecurityHandlerV4(PDFStandardSecurityHandler):
    """Revision 4 handler: named crypt filters (RC4 or AES-128)."""

    supported_revisions = (4,)

    def init_params(self):
        """Read the crypt-filter parameters in addition to the base ones.

        :raises PDFEncryptionError: if StmF and StrF differ, a filter uses
            an unknown method, or StrF names an undefined filter.
        """
        super().init_params()
        self.length = 128
        self.cf = dict_value(self.param.get('CF'))
        self.stmf = literal_name(self.param['StmF'])
        self.strf = literal_name(self.param['StrF'])
        self.encrypt_metadata = bool(self.param.get('EncryptMetadata', True))
        if self.stmf != self.strf:
            error_msg = 'Unsupported crypt filter: param=%r' % self.param
            raise PDFEncryptionError(error_msg)
        # crypt filter name -> decrypt callable
        self.cfm = {}
        for k, v in self.cf.items():
            f = self.get_cfm(literal_name(v['CFM']))
            if f is None:
                error_msg = 'Unknown crypt filter method: param=%r' \
                            % self.param
                raise PDFEncryptionError(error_msg)
            self.cfm[k] = f
        self.cfm['Identity'] = self.decrypt_identity
        if self.strf not in self.cfm:
            error_msg = 'Undefined crypt filter: param=%r' % self.param
            raise PDFEncryptionError(error_msg)

    def get_cfm(self, name):
        """Map a CFM name to a decrypt method, or None if unsupported."""
        if name == 'V2':
            return self.decrypt_rc4
        elif name == 'AESV2':
            return self.decrypt_aes128
        else:
            return None

    def decrypt(self, objid, genno, data, attrs=None, name=None):
        """Decrypt ``data`` with the crypt filter ``name`` (default StrF).

        Data whose ``attrs`` declare Type ``Metadata`` is returned as is
        when EncryptMetadata is false.
        """
        if not self.encrypt_metadata and attrs is not None:
            t = attrs.get('Type')
            if t is not None and literal_name(t) == 'Metadata':
                return data
        if name is None:
            name = self.strf
        return self.cfm[name](objid, genno, data)

    def decrypt_identity(self, objid, genno, data):
        """Return ``data`` unchanged."""
        return data

    def decrypt_aes128(self, objid, genno, data):
        """AES-CBC decrypt ``data``; its first 16 bytes are the IV."""
        key = self.key + struct.pack('<L', objid)[:3] \
            + struct.pack('<L', genno)[:2] + b'sAlT'
        digest = md5(key)
        key = digest.digest()[:min(len(key), 16)]
        initialization_vector = data[:16]
        ciphertext = data[16:]
        cipher = Cipher(algorithms.AES(key),
                        modes.CBC(initialization_vector),
                        backend=default_backend())
        # NOTE(review): finalize() is never called and any block padding is
        # left in the returned plaintext -- confirm callers tolerate that.
        return cipher.decryptor().update(ciphertext)


class PDFStandardSecurityHandlerV5(PDFStandardSecurityHandlerV4):
    """Revision 5 handler: SHA-256 password check and AES-256."""

    supported_revisions = (5,)

    def init_params(self):
        """Read OE/UE and split O and U into hash and salt parts."""
        super().init_params()
        self.length = 256
        self.oe = str_value(self.param['OE'])
        self.ue = str_value(self.param['UE'])
        self.o_hash = self.o[:32]
        self.o_validation_salt = self.o[32:40]
        self.o_key_salt = self.o[40:]
        self.u_hash = self.u[:32]
        self.u_validation_salt = self.u[32:40]
        self.u_key_salt = self.u[40:]

    def get_cfm(self, name):
        """Only ``AESV3`` is supported by this handler."""
        if name == 'AESV3':
            return self.decrypt_aes256
        else:
            return None

    def authenticate(self, password):
        """Return the file key for ``password`` or None if rejected.

        The password is checked against the owner hash first, then the
        user hash; on a match the key is decrypted from OE or UE.
        """
        password = password.encode('utf-8')[:127]
        digest = sha256(password)
        digest.update(self.o_validation_salt)
        digest.update(self.u)
        if digest.digest() == self.o_hash:
            digest = sha256(password)
            digest.update(self.o_key_salt)
            digest.update(self.u)
            cipher = Cipher(algorithms.AES(digest.digest()),
                            modes.CBC(b'\0' * 16),
                            backend=default_backend())
            return cipher.decryptor().update(self.oe)
        digest = sha256(password)
        digest.update(self.u_validation_salt)
        if digest.digest() == self.u_hash:
            digest = sha256(password)
            digest.update(self.u_key_salt)
            cipher = Cipher(algorithms.AES(digest.digest()),
                            modes.CBC(b'\0' * 16),
                            backend=default_backend())
            return cipher.decryptor().update(self.ue)
        return None

    def decrypt_aes256(self, objid, genno, data):
        """AES-CBC decrypt ``data`` with the file key; IV is data[:16]."""
        initialization_vector = data[:16]
        ciphertext = data[16:]
        cipher = Cipher(algorithms.AES(self.key),
                        modes.CBC(initialization_vector),
                        backend=default_backend())
        # NOTE(review): as in decrypt_aes128, padding is not stripped.
        return cipher.decryptor().update(ciphertext)


class PDFDocument:
    """PDFDocument object represents a PDF document.

    Since a PDF file can be very big, normally it is not loaded at
    once. So PDF document has to cooperate with a PDF parser in order to
    dynamically import the data as processing goes.

    Typical usage:
      doc = PDFDocument(parser, password)
      obj = doc.getobj(objid)

    """

    # Value of /V in the encryption dictionary -> handler class.
    security_handler_registry = {
        1: PDFStandardSecurityHandler,
        2: PDFStandardSecurityHandler,
        4: PDFStandardSecurityHandlerV4,
        5: PDFStandardSecurityHandlerV5,
    }

    def __init__(self, parser, password='', caching=True, fallback=True):
        """Set the document to use a given PDFParser object.

        :param parser: parser positioned on the PDF data.
        :param password: password used if the document is encrypted.
        :param caching: keep resolved objects in memory when true.
        :param fallback: also build an xref by scanning the whole file.
        :raises PDFSyntaxError: if no trailer provides a /Root entry, or
            (in strict mode) if the catalog's Type is not ``Catalog``.
        """
        self.caching = caching
        self.xrefs = []
        self.info = []
        self.catalog = None
        self.encryption = None
        self.decipher = None
        self._cached_objs = {}
        self._parsed_objs = {}
        self._parser = parser
        self._parser.set_document(self)
        self.is_printable = self.is_modifiable = self.is_extractable = True
        # Retrieve the information of each header that was appended
        # (maybe multiple times) at the end of the document.
        try:
            pos = self.find_xref(parser)
            self.read_xref_from(parser, pos, self.xrefs)
        except PDFNoValidXRef:
            pass  # fallback = True
        # NOTE: the fallback scan runs whenever ``fallback`` is true, even
        # when the regular xref above was read successfully.
        if fallback:
            parser.fallback = True
            xref = PDFXRefFallback()
            xref.load(parser)
            self.xrefs.append(xref)
        for xref in self.xrefs:
            trailer = xref.get_trailer()
            if not trailer:
                continue
            # If there's an encryption info, remember it.
            if 'Encrypt' in trailer:
                if 'ID' in trailer:
                    id_value = list_value(trailer['ID'])
                else:
                    # Some encrypted documents have no /ID.  The security
                    # handlers only read docid[0], so two empty byte
                    # strings let authentication proceed instead of
                    # failing with a KeyError here.
                    id_value = (b'', b'')
                self.encryption = (id_value,
                                   dict_value(trailer['Encrypt']))
                self._initialize_password(password)
            if 'Info' in trailer:
                self.info.append(dict_value(trailer['Info']))
            if 'Root' in trailer:
                # Every PDF file must have exactly one /Root dictionary.
                self.catalog = dict_value(trailer['Root'])
                break
        else:
            raise PDFSyntaxError('No /Root object! - Is this really a PDF?')
        if self.catalog.get('Type') is not LITERAL_CATALOG:
            if settings.STRICT:
                raise PDFSyntaxError('Catalog not found!')

    KEYWORD_OBJ = KWD(b'obj')

    # _initialize_password(password=b'')
    #   Perform the initialization with a given password.
    def _initialize_password(self, password=''):
        """Create the security handler and copy its permission flags.

        :raises PDFEncryptionError: if the filter is not ``Standard`` or
            /V has no registered handler.
        """
        (docid, param) = self.encryption
        if literal_name(param.get('Filter')) != 'Standard':
            raise PDFEncryptionError('Unknown filter: param=%r' % param)
        v = int_value(param.get('V', 0))
        factory = self.security_handler_registry.get(v)
        if factory is None:
            raise PDFEncryptionError('Unknown algorithm: param=%r' % param)
        handler = factory(docid, param, password)
        self.decipher = handler.decrypt
        self.is_printable = handler.is_printable()
        self.is_modifiable = handler.is_modifiable()
        self.is_extractable = handler.is_extractable()
        self._parser.fallback = False  # need to read streams with exact length

    def _getobj_objstm(self, stream, index, objid):
        """Return the ``index``-th object stored in object stream."""
        if stream.objid in self._parsed_objs:
            (objs, n) = self._parsed_objs[stream.objid]
        else:
            (objs, n) = self._get_objects(stream)
            if self.caching:
                self._parsed_objs[stream.objid] = (objs, n)
        # The first n*2 parsed items are skipped; the member objects start
        # right after them.
        i = n*2+index
        try:
            obj = objs[i]
        except IndexError:
            raise PDFSyntaxError('index too big: %r' % index)
        return obj

    def _get_objects(self, stream):
        """Parse every object in ``stream``; return ``(objs, N)``.

        N defaults to 0 when missing (an error in strict mode).
        """
        if stream.get('Type') is not LITERAL_OBJSTM:
            if settings.STRICT:
                raise PDFSyntaxError('Not a stream object: %r' % stream)
        try:
            n = stream['N']
        except KeyError:
            if settings.STRICT:
                raise PDFSyntaxError('N is not defined: %r' % stream)
            n = 0
        parser = PDFStreamParser(stream.get_data())
        parser.set_document(self)
        objs = []
        try:
            while True:
                (_, obj) = parser.nextobject()
                objs.append(obj)
        except PSEOF:
            pass
        return (objs, n)

    def _getobj_parse(self, pos, objid):
        """Parse the indirect object located at file offset ``pos``.

        :raises PDFSyntaxError: on an object id mismatch or if the ``obj``
            keyword is missing.
        """
        self._parser.seek(pos)
        (_, objid1) = self._parser.nexttoken()  # objid
        (_, genno) = self._parser.nexttoken()  # genno
        (_, kwd) = self._parser.nexttoken()
        # hack around malformed pdf files
        # copied from https://github.com/jaepil/pdfminer3k/blob/master/
        # pdfminer/pdfparser.py#L399
        # to solve https://github.com/pdfminer/pdfminer.six/issues/56
        # assert objid1 == objid, str((objid1, objid))
        if objid1 != objid:
            x = []
            while kwd is not self.KEYWORD_OBJ:
                (_, kwd) = self._parser.nexttoken()
                x.append(kwd)
            if len(x) >= 2:
                objid1 = x[-2]
        # #### end hack around malformed pdf files
        if objid1 != objid:
            raise PDFSyntaxError('objid mismatch: {!r}={!r}'
                                 .format(objid1, objid))

        if kwd != KWD(b'obj'):
            raise PDFSyntaxError('Invalid object spec: offset=%r' % pos)
        (_, obj) = self._parser.nextobject()
        return obj

    # can raise PDFObjectNotFound
    def getobj(self, objid):
        """Get object from PDF

        :raises PDFException if PDFDocument is not initialized
        :raises PDFObjectNotFound if objid does not exist in PDF
        """
        if not self.xrefs:
            raise PDFException('PDFDocument is not initialized')
        log.debug('getobj: objid=%r', objid)
        if objid in self._cached_objs:
            (obj, genno) = self._cached_objs[objid]
        else:
            # Try each xref in order until one yields a parsable object.
            for xref in self.xrefs:
                try:
                    (strmid, index, genno) = xref.get_pos(objid)
                except KeyError:
                    continue
                try:
                    if strmid is not None:
                        stream = stream_value(self.getobj(strmid))
                        obj = self._getobj_objstm(stream, index, objid)
                    else:
                        obj = self._getobj_parse(index, objid)
                        # Only objects parsed directly from the file are
                        # deciphered here.
                        if self.decipher:
                            obj = decipher_all(self.decipher, objid, genno,
                                               obj)

                    if isinstance(obj, PDFStream):
                        obj.set_objid(objid, genno)
                    break
                except (PSEOF, PDFSyntaxError):
                    continue
            else:
                raise PDFObjectNotFound(objid)
            log.debug('register: objid=%r: %r', objid, obj)
            if self.caching:
                self._cached_objs[objid] = (obj, genno)
        return obj

    def get_outlines(self):
        """Return a generator of ``(level, title, dest, action, se)``.

        :raises PDFNoOutlines: if the catalog has no ``Outlines`` entry.
        """
        if 'Outlines' not in self.catalog:
            raise PDFNoOutlines

        def search(entry, level):
            entry = dict_value(entry)
            if 'Title' in entry:
                if 'A' in entry or 'Dest' in entry:
                    title = decode_text(str_value(entry['Title']))
                    dest = entry.get('Dest')
                    action = entry.get('A')
                    se = entry.get('SE')
                    yield (level, title, dest, action, se)
            if 'First' in entry and 'Last' in entry:
                yield from search(entry['First'], level+1)
            if 'Next' in entry:
                yield from search(entry['Next'], level)
        return search(self.catalog['Outlines'], 0)

    def lookup_name(self, cat, key):
        """Look up ``key`` in the name tree ``cat`` of the catalog.

        :raises KeyError: if the catalog has no usable ``Names`` entry,
            ``cat`` is missing, or ``key`` is not found.
        """
        try:
            names = dict_value(self.catalog['Names'])
        except (PDFTypeError, KeyError):
            raise KeyError((cat, key))
        # may raise KeyError
        d0 = dict_value(names[cat])

        def lookup(d):
            if 'Limits' in d:
                (k1, k2) = list_value(d['Limits'])
                if key < k1 or k2 < key:
                    return None
            if 'Names' in d:
                objs = list_value(d['Names'])
                names = dict(choplist(2, objs))
                return names[key]
            if 'Kids' in d:
                for c in list_value(d['Kids']):
                    v = lookup(dict_value(c))
                    if v:
                        return v
            raise KeyError((cat, key))
        return lookup(d0)

    def get_dest(self, name):
        """Return the destination object registered under ``name``.

        :raises PDFDestinationNotFound: if neither the name tree nor the
            catalog's ``Dests`` dictionary contains ``name``.
        """
        try:
            # PDF-1.2 or later
            obj = self.lookup_name('Dests', name)
        except KeyError:
            # PDF-1.1 or prior
            if 'Dests' not in self.catalog:
                raise PDFDestinationNotFound(name)
            d0 = dict_value(self.catalog['Dests'])
            if name not in d0:
                raise PDFDestinationNotFound(name)
            obj = d0[name]
        return obj

    # find_xref
    def find_xref(self, parser):
        """Internal function used to locate the first XRef.

        :returns: the integer offset found next to ``startxref``.
        :raises PDFNoValidXRef: if ``startxref`` is not found or the value
            next to it is missing, not an integer, or negative.
        """
        # search the last xref table by scanning the file backwards.
        prev = None
        for line in parser.revreadlines():
            line = line.strip()
            log.debug('find_xref: %r', line)
            if line == b'startxref':
                break
            if line:
                prev = line
        else:
            raise PDFNoValidXRef('Unexpected EOF')
        log.info('xref found: pos=%r', prev)
        # Report a bad offset as PDFNoValidXRef so the caller's fallback
        # handling applies instead of a bare TypeError/ValueError.
        try:
            start = int(prev)
        except (TypeError, ValueError):
            raise PDFNoValidXRef('Invalid xref position: %r' % (prev,))
        if start < 0:
            raise PDFNoValidXRef('Invalid negative xref position: %r'
                                 % (start,))
        return start

    # read xref table
    def read_xref_from(self, parser, start, xrefs):
        """Reads XRefs from the given location.

        The xref found at ``start`` is appended to ``xrefs``; the
        ``XRefStm`` and ``Prev`` entries of its trailer are then followed
        recursively.

        :raises PDFNoValidXRef: if EOF is reached at ``start``.
        """
        parser.seek(start)
        parser.reset()
        try:
            (pos, token) = parser.nexttoken()
        except PSEOF:
            raise PDFNoValidXRef('Unexpected EOF')
        log.info('read_xref_from: start=%d, token=%r', start, token)
        if isinstance(token, int):
            # XRefStream: PDF-1.5
            parser.seek(pos)
            parser.reset()
            xref = PDFXRefStream()
            xref.load(parser)
        else:
            if token is parser.KEYWORD_XREF:
                parser.nextline()
            xref = PDFXRef()
            xref.load(parser)
        xrefs.append(xref)
        trailer = xref.get_trailer()
        log.info('trailer: %r', trailer)
        if 'XRefStm' in trailer:
            pos = int_value(trailer['XRefStm'])
            self.read_xref_from(parser, pos, xrefs)
        if 'Prev' in trailer:
            # find previous xref
            pos = int_value(trailer['Prev'])
            self.read_xref_from(parser, pos, xrefs)