# A part of pdfrw (https://github.com/pmaupin/pdfrw)
# Copyright (C) 2006-2015 Patrick Maupin, Austin, Texas
# Copyright (C) 2012-2015 Nerijus Mika
# MIT license -- See LICENSE.txt for details

'''
The PdfReader class reads an entire PDF file into memory and
parses the top-level container objects.  (It does not parse
into streams.)  The object subclasses PdfDict, and the
document pages are stored in a list in the pages attribute
of the object.
'''
import gc
import binascii
import collections
import itertools

from .errors import PdfParseError, log
from .tokens import PdfTokens
from .objects import PdfDict, PdfArray, PdfName, PdfObject, PdfIndirect
from .uncompress import uncompress
from . import crypt
from .py23_diffs import convert_load, convert_store, iteritems


class PdfReader(PdfDict):
    ''' Reader for a complete PDF document.

        The constructor loads the file data, walks the chain of
        cross-reference sections, and copies the trailer entries
        into this dictionary.  Indirect objects are recorded as
        PdfIndirect placeholders and read on demand through
        loadindirect().
    '''

    def findindirect(self, objnum, gennum, PdfIndirect=PdfIndirect, int=int):
        ''' Return a previously loaded indirect object, or create
            a placeholder for it.

            objnum and gennum may be strings or integers; they are
            converted with int() to build the (objnum, gennum) key.
            A new placeholder is registered in deferred_objects and
            given loadindirect as its loader.
        '''
        key = int(objnum), int(gennum)
        result = self.indirect_objects.get(key)
        if result is None:
            self.indirect_objects[key] = result = PdfIndirect(key)
            self.deferred_objects.add(key)
            result._loader = self.loadindirect
        return result

    def readarray(self, source, PdfArray=PdfArray):
        ''' Found a [ token.  Parse the tokens after that.

            Consumes tokens from source up to the closing ']' and
            returns them as a PdfArray.  An 'R' token replaces the two
            preceding entries (object number, generation) with an
            indirect reference.  Tokens registered in self.special
            (e.g. nested arrays/dictionaries) are parsed recursively.
        '''
        specialget = self.special.get
        result = []
        pop = result.pop
        append = result.append

        for value in source:
            if value in ']R':
                if value == ']':
                    break
                # "objnum gennum R": the two numbers are already in result.
                generation = pop()
                value = self.findindirect(pop(), generation)
            else:
                func = specialget(value)
                if func is not None:
                    value = func(source)
            append(value)
        return PdfArray(result)

    def readdict(self, source, PdfDict=PdfDict):
        ''' Found a << token.  Parse the tokens after that.

            Reads "/name value" pairs until '>>' and returns them in
            a new PdfDict.  A value made of two integers followed by
            'R' is stored as an indirect reference.  Malformed entries
            are reported through source.error() and skipped.
        '''
        specialget = self.special.get
        result = PdfDict()
        next = source.next

        tok = next()
        while tok != '>>':
            if not tok.startswith('/'):
                source.error('Expected PDF /name object')
                tok = next()
                continue
            key = tok
            value = next()
            func = specialget(value)
            if func is not None:
                value = func(source)
                tok = next()
            else:
                # One token of lookahead is needed to tell a plain
                # integer value from an "objnum gennum R" reference.
                tok = next()
                if value.isdigit() and tok.isdigit():
                    tok2 = next()
                    if tok2 != 'R':
                        source.error('Expected "R" following two integers')
                        tok = tok2
                        continue
                    value = self.findindirect(value, tok)
                    tok = next()
            result[key] = value
        return result

    def empty_obj(self, source, PdfObject=PdfObject):
        ''' Some silly git put an empty object in the
            file.  Back up so the caller sees the endobj.

            Implicitly returns None, which becomes the parsed value.
        '''
        source.floc = source.tokstart

    def badtoken(self, source):
        ''' Didn't see that coming.

            Handler for delimiters that are not valid at the start of
            a value; reports them through source.exception().
        '''
        source.exception('Unexpected delimiter')

    def findstream(self, obj, tok, source, len=len):
        ''' Figure out if there is a content stream
            following an object, and return the start
            pointer to the content stream if so.

            (We can't read it yet, because we might not
            know how long it is, because Length might
            be an indirect object.)

            tok is the 'stream' keyword token just read from source.
            The returned offset is just past the keyword and its
            trailing \\r and/or \\n.
        '''

        fdata = source.fdata
        startstream = source.tokstart + len(tok)
        # Booleans are added as 0/1 to step over each line-end character.
        gotcr = fdata[startstream] == '\r'
        startstream += gotcr
        gotlf = fdata[startstream] == '\n'
        startstream += gotlf
        if not gotlf:
            if not gotcr:
                source.error(r'stream keyword not followed by \n')
            else:
                source.warning(r"stream keyword terminated "
                               r"by \r without \n")
        return startstream

    def readstream(self, obj, startstream, source, exact_required=False,
                   streamending='endstream endobj'.split(), int=int):
        ''' Read the stream data for obj, starting at offset startstream.

            The data is sliced according to obj.Length.  If the tokens
            following it are not 'endstream endobj', the method searches
            for the 'endstream' keyword and tries to work out the real
            length, reporting the problem through source.  With
            exact_required true, a mismatch is reported through
            source.exception() instead.

            The result is stored on obj (obj._stream, or obj.stream when
            the length had to be adjusted); nothing is returned.
        '''
        fdata = source.fdata
        length = int(obj.Length)
        source.floc = target_endstream = startstream + length
        endit = source.multiple(2)
        obj._stream = fdata[startstream:target_endstream]
        if endit == streamending:
            return

        if exact_required:
            source.exception('Expected endstream endobj')

        # The length attribute does not match the distance between the
        # stream and endstream keywords.

        # TODO:  Extract maxstream from dictionary of object offsets
        # and use rfind instead of find.
        maxstream = len(fdata) - 20
        endstream = fdata.find('endstream', startstream, maxstream)
        source.floc = startstream
        room = endstream - startstream
        if endstream < 0:
            source.error('Could not find endstream')
            return
        if (length == room + 1 and
                fdata[startstream - 2:startstream] == '\r\n'):
            # The \n consumed by findstream was actually stream data.
            source.warning(r"stream keyword terminated by \r without \n")
            obj._stream = fdata[startstream - 1:target_endstream - 1]
            return
        source.floc = endstream
        if length > room:
            source.error('stream /Length attribute (%d) appears to '
                         'be too big (size %d) -- adjusting',
                         length, room)
            obj.stream = fdata[startstream:endstream]
            return
        if fdata[target_endstream:endstream].rstrip():
            source.error('stream /Length attribute (%d) appears to '
                         'be too small (size %d) -- adjusting',
                         length, room)
            obj.stream = fdata[startstream:endstream]
            return
        endobj = fdata.find('endobj', endstream, maxstream)
        if endobj < 0:
            source.error('Could not find endobj after endstream')
            return
        if fdata[endstream:endobj].rstrip() != 'endstream':
            source.error('Unexpected data between endstream and endobj')
            return
        source.error('Illegal endstream/endobj combination')

    def loadindirect(self, key, PdfDict=PdfDict,
                     isinstance=isinstance):
        ''' Load and return the indirect object identified by key.

            key is an (objnum, gennum) tuple.  If the object was
            already loaded, the cached value is returned.  Otherwise
            the object is read from the offset recorded in
            source.obj_offsets, including its stream if it has one.
            Returns None (after a warning) when the object cannot
            be located.
        '''
        result = self.indirect_objects.get(key)
        if not isinstance(result, PdfIndirect):
            return result
        source = self.source
        offset = int(self.source.obj_offsets.get(key, '0'))
        if not offset:
            source.warning("Did not find PDF object %s", key)
            return None

        # Read the object header and validate it
        objnum, gennum = key
        source.floc = offset
        objid = source.multiple(3)
        ok = len(objid) == 3
        ok = ok and objid[0].isdigit() and int(objid[0]) == objnum
        ok = ok and objid[1].isdigit() and int(objid[1]) == gennum
        ok = ok and objid[2] == 'obj'
        if not ok:
            source.floc = offset
            source.next()
            objheader = '%d %d obj' % (objnum, gennum)
            fdata = source.fdata
            # The xref offset is wrong; look for the header text at the
            # start of a line.  The "+ 1" both skips the line-end
            # character and turns find()'s -1 into a false value.
            offset2 = (fdata.find('\n' + objheader) + 1 or
                       fdata.find('\r' + objheader) + 1)
            # Give up if the header is missing or appears more than once.
            if (not offset2 or
                    fdata.find(fdata[offset2 - 1] + objheader, offset2) > 0):
                source.warning("Expected indirect object '%s'", objheader)
                return None
            source.warning("Indirect object %s found at incorrect "
                           "offset %d (expected offset %d)",
                           objheader, offset2, offset)
            source.floc = offset2 + len(objheader)

        # Read the object, and call special code if it starts
        # an array or dictionary
        obj = source.next()
        func = self.special.get(obj)
        if func is not None:
            obj = func(source)

        self.indirect_objects[key] = obj
        self.deferred_objects.remove(key)

        # Mark the object as indirect, and
        # just return it if it is a simple object.
        # An empty object ("N G obj endobj") is parsed by empty_obj,
        # which yields None; there is nothing to mark in that case.
        if obj is not None:
            obj.indirect = key
        tok = source.next()
        if tok == 'endobj':
            return obj

        # Should be a stream.  Either that or it's broken.
        isdict = isinstance(obj, PdfDict)
        if isdict and tok == 'stream':
            self.readstream(obj, self.findstream(obj, tok, source), source)
            return obj

        # Houston, we have a problem, but let's see if it
        # is easily fixable.  Leaving out a space before endobj
        # is apparently an easy mistake to make on generation
        # (Because it won't be noticed unless you are specifically
        # generating an indirect object that doesn't end with any
        # sort of delimiter.)  It is so common that things like
        # okular just handle it.

        if isinstance(obj, PdfObject) and obj.endswith('endobj'):
            source.error('No space or delimiter before endobj')
            obj = PdfObject(obj[:-6])
        else:
            source.error("Expected 'endobj'%s token",
                         " or 'stream'" if isdict else '')
            obj = PdfObject('')

        obj.indirect = key
        self.indirect_objects[key] = obj
        return obj

    def read_all(self):
        ''' Load every indirect object that is still deferred.

            Loading an object can register further placeholders, so
            the loop repeats until a pass discovers no new keys.
        '''
        deferred = self.deferred_objects
        prev = set()
        while 1:
            new = deferred - prev
            if not new:
                break
            prev |= deferred
            for key in new:
                self.loadindirect(key)

    def decrypt_all(self):
        ''' Load all indirect objects and, if crypt filters were set
            up, pass them to crypt.decrypt_objects().
        '''
        self.read_all()

        if self.crypt_filters is not None:
            crypt.decrypt_objects(
                self.indirect_objects.values(), self.stream_crypt_filter,
                self.crypt_filters)

    def uncompress(self):
        ''' Load all indirect objects and pass them to the module-level
            uncompress() function.
        '''
        self.read_all()

        uncompress(self.indirect_objects.values())

    def load_stream_objects(self, object_streams):
        ''' Extract the objects stored inside object streams.

            object_streams is iterated for the object numbers of the
            /ObjStm streams (the mapping built by parse_xref_stream).
            Each stream is decrypted (when crypt filters are active),
            uncompressed and tokenized, and every contained object is
            registered in indirect_objects with generation 0.
        '''
        # read object streams
        objs = []
        for num in object_streams:
            obj = self.findindirect(num, 0).real_value()
            assert obj.Type == '/ObjStm'
            objs.append(obj)

        # read objects from stream
        if objs:
            # Decrypt
            if self.crypt_filters is not None:
                crypt.decrypt_objects(
                    objs, self.stream_crypt_filter, self.crypt_filters)

            # Decompress
            uncompress(objs)

            for obj in objs:
                objsource = PdfTokens(obj.stream, 0, False)
                next = objsource.next
                offsets = []
                firstoffset = int(obj.First)
                # The stream starts with "objnum offset" pairs; offsets
                # are relative to /First.
                while objsource.floc < firstoffset:
                    offsets.append((int(next()), firstoffset + int(next())))
                for num, offset in offsets:
                    # Read the object, and call special code if it starts
                    # an array or dictionary
                    objsource.floc = offset
                    sobj = next()
                    func = self.special.get(sobj)
                    if func is not None:
                        sobj = func(objsource)

                    key = (num, 0)
                    self.indirect_objects[key] = sobj
                    self.deferred_objects.discard(key)

                    # Mark the object as indirect, and
                    # add it to the list of streams if it starts a stream
                    sobj.indirect = key

    def findxref(self, fdata):
        ''' Find the cross reference section at the end of a file

            Returns (startloc, source): the offset of the last
            'startxref' keyword and a PdfTokens object positioned at
            the table location that follows it.  Raises PdfParseError
            if 'startxref' is absent.
        '''
        startloc = fdata.rfind('startxref')
        if startloc < 0:
            raise PdfParseError('Did not find "startxref" at end of file')
        source = PdfTokens(fdata, startloc, False, self.verbose)
        tok = source.next()
        assert tok == 'startxref'  # (We just checked this...)
        tableloc = source.next_default()
        if not tableloc.isdigit():
            source.exception('Expected table location')
        if source.next_default().rstrip().lstrip('%') != 'EOF':
            source.exception('Expected %%EOF')
        return startloc, PdfTokens(fdata, int(tableloc), True, self.verbose)

    def parse_xref_stream(self, source, int=int, range=range,
                          enumerate=enumerate, islice=itertools.islice,
                          defaultdict=collections.defaultdict,
                          hexlify=binascii.hexlify):
        ''' Parse (one of) the cross-reference file section(s)

            Handles the cross-reference *stream* form.  The caller has
            already consumed the object number.  Object offsets are
            added to source.obj_offsets (existing entries win), and
            objects living in object streams are collected in
            obj.private.object_streams.  Returns the stream dictionary,
            which the caller uses as the trailer.
        '''

        def readint(s, lengths):
            ''' Endlessly yield big-endian integers decoded from s,
                using the field widths in lengths cyclically.  A
                zero-width field yields None.
            '''
            offset = 0
            for length in itertools.cycle(lengths):
                end = offset + length
                yield int(hexlify(s[offset:end]), 16) if length else None
                offset = end

        setdefault = source.obj_offsets.setdefault
        next = source.next
        # check for xref stream object
        objid = source.multiple(3)
        ok = len(objid) == 3
        ok = ok and objid[0].isdigit()
        ok = ok and objid[1] == 'obj'
        ok = ok and objid[2] == '<<'
        if not ok:
            source.exception('Expected xref stream start')
        obj = self.readdict(source)
        if obj.Type != PdfName.XRef:
            source.exception('Expected dict type of /XRef')
        tok = next()
        self.readstream(obj, self.findstream(obj, tok, source), source, True)
        old_strm = obj.stream
        if not uncompress([obj], True):
            source.exception('Could not decompress Xref stream')
        stream = obj.stream
        # Fix for issue #76 -- goofy compressed xref stream
        # that is NOT ACTUALLY COMPRESSED
        stream = stream if stream is not old_strm else convert_store(old_strm)
        # /Index holds (first object number, count) pairs; the default
        # is a single section covering 0 .. /Size.
        num_pairs = obj.Index or PdfArray(['0', obj.Size])
        num_pairs = [int(x) for x in num_pairs]
        num_pairs = zip(num_pairs[0::2], num_pairs[1::2])
        entry_sizes = [int(x) for x in obj.W]
        if len(entry_sizes) != 3:
            source.exception('Invalid entry size')
        object_streams = defaultdict(list)
        get = readint(stream, entry_sizes)
        for objnum, size in num_pairs:
            for _ in range(size):
                xtype, p1, p2 = islice(get, 3)
                if xtype in (1, None):
                    # Type 1 (or omitted type field): p1 is the offset,
                    # p2 the generation.
                    if p1:
                        setdefault((objnum, p2 or 0), p1)
                elif xtype == 2:
                    # Type 2: object objnum is stored in object stream p1.
                    object_streams[p1].append((objnum, p2))
                objnum += 1

        obj.private.object_streams = object_streams
        return obj

    def parse_xref_table(self, source, int=int, range=range):
        ''' Parse (one of) the cross-reference file section(s)

            Handles the plain-text table form.  Offsets of in-use
            objects are added to source.obj_offsets (existing entries
            win).  Returns once the 'trailer' keyword is consumed.  If
            strict token-by-token parsing fails, a more lenient
            line-by-line parse is tried before the table is reported
            as invalid through source.exception().
        '''
        setdefault = source.obj_offsets.setdefault
        next = source.next
        # plain xref table
        start = source.floc
        try:
            while 1:
                tok = next()
                if tok == 'trailer':
                    return
                startobj = int(tok)
                for objnum in range(startobj, startobj + int(next())):
                    offset = int(next())
                    generation = int(next())
                    inuse = next()
                    if inuse == 'n':
                        if offset != 0:
                            setdefault((objnum, generation), offset)
                    elif inuse != 'f':
                        raise ValueError
        except Exception:
            # Any parse failure falls through to the lenient parser
            # below.  (Not a bare except, so KeyboardInterrupt and
            # SystemExit still propagate.)
            pass
        try:
            # Table formatted incorrectly.
            # See if we can figure it out anyway.
            end = source.fdata.rindex('trailer', start)
            table = source.fdata[start:end].splitlines()
            for line in table:
                tokens = line.split()
                if len(tokens) == 2:
                    # Subsection header: "first_objnum count"
                    objnum = int(tokens[0])
                elif len(tokens) == 3:
                    offset, generation, inuse = (int(tokens[0]),
                                                 int(tokens[1]), tokens[2])
                    if offset != 0 and inuse == 'n':
                        setdefault((objnum, generation), offset)
                    objnum += 1
                elif tokens:
                    log.error('Invalid line in xref table: %s' %
                              repr(line))
                    raise ValueError
            log.warning('Badly formatted xref table')
            source.floc = end
            next()
        except Exception:
            source.floc = start
            source.exception('Invalid table format')

    def parsexref(self, source):
        ''' Parse (one of) the cross-reference file section(s)

            Dispatches on the first token: a number starts a
            cross-reference stream object, 'xref' starts a plain table
            followed by a trailer dictionary.  Returns a tuple
            (trailer, is_stream).
        '''
        next = source.next
        try:
            tok = next()
        except StopIteration:
            tok = ''
        if tok.isdigit():
            return self.parse_xref_stream(source), True
        elif tok == 'xref':
            self.parse_xref_table(source)
            tok = next()
            if tok != '<<':
                source.exception('Expected "<<" starting catalog')
            return self.readdict(source), False
        else:
            source.exception('Expected "xref" keyword or xref stream object')

    def readpages(self, node):
        ''' Return the list of /Page dictionaries reachable from node.

            node may be a /Catalog, /Pages or /Page dictionary.  The
            tree is walked depth-first with an explicit stack, keeping
            the pages in document order.  An invalid tree is logged
            and yields an empty list.
        '''
        pagename = PdfName.Page
        pagesname = PdfName.Pages
        catalogname = PdfName.Catalog
        typename = PdfName.Type
        kidname = PdfName.Kids

        try:
            result = []
            stack = [node]
            append = result.append
            pop = stack.pop
            while stack:
                node = pop()
                nodetype = node[typename]
                if nodetype == pagename:
                    append(node)
                elif nodetype == pagesname:
                    # Reversed so that the first kid is popped first.
                    stack.extend(reversed(node[kidname]))
                elif nodetype == catalogname:
                    stack.append(node[pagesname])
                else:
                    log.error('Expected /Page or /Pages dictionary, got %s' %
                              repr(node))
            return result
        except (AttributeError, TypeError) as s:
            log.error('Invalid page tree: %s' % s)
            return []

    def _parse_encrypt_info(self, source, password, trailer):
        """Check password and initialize crypt filters."""
        # Create and check password key
        key = crypt.create_key(password, trailer)

        if not crypt.check_user_password(key, trailer):
            source.warning('User password does not validate')

        # Create default crypt filters
        private = self.private
        crypt_filters = self.crypt_filters
        version = int(trailer.Encrypt.V or 0)
        if version in (1, 2):
            crypt_filter = crypt.RC4CryptFilter(key)
            private.stream_crypt_filter = crypt_filter
            private.string_crypt_filter = crypt_filter
        elif version == 4:
            if PdfName.CF in trailer.Encrypt:
                for name, params in iteritems(trailer.Encrypt.CF):
                    if name == PdfName.Identity:
                        continue

                    cfm = params.CFM
                    if cfm == PdfName.AESV2:
                        crypt_filters[name] = crypt.AESCryptFilter(key)
                    elif cfm == PdfName.V2:
                        crypt_filters[name] = crypt.RC4CryptFilter(key)
                    else:
                        source.warning(
                            'Unsupported crypt filter: {}, {}'.format(
                                name, cfm))

            # Read default stream filter
            if PdfName.StmF in trailer.Encrypt:
                name = trailer.Encrypt.StmF
                if name in crypt_filters:
                    private.stream_crypt_filter = crypt_filters[name]
                else:
                    source.warning(
                        'Invalid crypt filter name in /StmF:'
                        ' {}'.format(name))

            # Read default string filter
            if PdfName.StrF in trailer.Encrypt:
                name = trailer.Encrypt.StrF
                if name in crypt_filters:
                    private.string_crypt_filter = crypt_filters[name]
                else:
                    source.warning(
                        'Invalid crypt filter name in /StrF:'
                        ' {}'.format(name))
        else:
            source.warning(
                'Unsupported Encrypt version: {}'.format(version))

    def __init__(self, fname=None, fdata=None, decompress=False,
                 decrypt=False, password='', disable_gc=True, verbose=True):
        ''' Read and parse a PDF document.

            fname      -- a file name, or an object with a read() method.
                          Must not be combined with fdata.
            fdata      -- the raw file contents, when fname is not given.
            decompress -- if true, uncompress all streams after loading.
            decrypt    -- if true, decrypt an encrypted document
                          (requires crypt.HAS_CRYPTO).
            password   -- password passed to crypt.create_key().
            disable_gc -- if true, garbage collection is switched off
                          while parsing and restored afterwards.
            verbose    -- stored as self.verbose and passed on to the
                          tokenizer.

            Raises PdfParseError when the file cannot be read or its
            header, EOF marker or cross-reference data is invalid.
        '''
        self.private.verbose = verbose

        # Runs a lot faster with GC off.
        disable_gc = disable_gc and gc.isenabled()
        if disable_gc:
            gc.disable()

        try:
            if fname is not None:
                assert fdata is None
                # Allow reading preexisting streams like pyPdf
                if hasattr(fname, 'read'):
                    fdata = fname.read()
                else:
                    try:
                        # 'with' closes the file even if read() fails.
                        with open(fname, 'rb') as f:
                            fdata = f.read()
                    except IOError:
                        raise PdfParseError('Could not read PDF file %s' %
                                            fname)

            assert fdata is not None
            fdata = convert_load(fdata)

            header_loc = 0
            if not fdata.startswith('%PDF-'):
                header_loc = fdata.find('%PDF-')
                if header_loc >= 0:
                    log.warning('PDF header not at beginning of file')
                else:
                    lines = fdata.lstrip().splitlines()
                    if not lines:
                        raise PdfParseError('Empty PDF file!')
                    raise PdfParseError('Invalid PDF header: %s' %
                                        repr(lines[0]))

            # The three characters after '%PDF-' (e.g. "1.4"), taken
            # relative to wherever the header was actually found.
            self.private.version = fdata[header_loc + 5:header_loc + 8]

            endloc = fdata.rfind('%EOF')
            if endloc < 0:
                raise PdfParseError('EOF mark not found: %s' %
                                    repr(fdata[-20:]))
            # Keep '%EOF' plus two more characters (presumably room for
            # the line ending); anything after that is junk.
            endloc += 6
            junk = fdata[endloc:]
            fdata = fdata[:endloc]
            if junk.rstrip('\00').strip():
                log.warning('Extra data at end of file')

            private = self.private
            private.indirect_objects = {}
            private.deferred_objects = set()
            # Tokens that need a dedicated parser when they start a value.
            private.special = {'<<': self.readdict,
                               '[': self.readarray,
                               'endobj': self.empty_obj,
                               }
            for tok in r'\ ( ) < > { } ] >> %'.split():
                self.special[tok] = self.badtoken

            startloc, source = self.findxref(fdata)
            private.source = source

            # Find all the xref tables/streams, and
            # then deal with them backwards.
            xref_list = []
            while 1:
                source.obj_offsets = {}
                trailer, is_stream = self.parsexref(source)
                prev = trailer.Prev
                if prev is None:
                    token = source.next()
                    if token != 'startxref' and not xref_list:
                        source.warning('Expected "startxref" '
                                       'at end of xref table')
                    break
                xref_list.append((source.obj_offsets, trailer, is_stream))
                source.floc = int(prev)

            # Handle document encryption
            private.crypt_filters = None
            if decrypt and PdfName.Encrypt in trailer:
                identity_filter = crypt.IdentityCryptFilter()
                crypt_filters = {
                    PdfName.Identity: identity_filter
                }
                private.crypt_filters = crypt_filters
                private.stream_crypt_filter = identity_filter
                private.string_crypt_filter = identity_filter

                if not crypt.HAS_CRYPTO:
                    raise PdfParseError(
                        'Install PyCrypto to enable encryption support')

                self._parse_encrypt_info(source, password, trailer)

            if is_stream:
                self.load_stream_objects(trailer.object_streams)

            # Apply the newer sections on top of the oldest one, so that
            # later entries override earlier ones.
            while xref_list:
                later_offsets, later_trailer, is_stream = xref_list.pop()
                source.obj_offsets.update(later_offsets)
                if is_stream:
                    trailer.update(later_trailer)
                    self.load_stream_objects(later_trailer.object_streams)
                else:
                    trailer = later_trailer

            trailer.Prev = None

            if (trailer.Version and
                    float(trailer.Version) > float(self.version)):
                self.private.version = trailer.Version

            if decrypt:
                self.decrypt_all()
                trailer.Encrypt = None

            if is_stream:
                # Only copy the trailer keys, not the xref stream's own
                # dictionary entries.
                self.Root = trailer.Root
                self.Info = trailer.Info
                self.ID = trailer.ID
                self.Size = trailer.Size
                self.Encrypt = trailer.Encrypt
            else:
                self.update(trailer)

            # self.read_all_indirect(source)
            private.pages = self.readpages(self.Root)
            if decompress:
                self.uncompress()

            # For compatibility with pyPdf
            private.numPages = len(self.pages)
        finally:
            if disable_gc:
                gc.enable()

    # For compatibility with pyPdf
    def getPage(self, pagenum):
        ''' Return the page at index pagenum of self.pages.
        '''
        return self.pages[pagenum]