1# -*- coding: utf-8 -*- 2# 3# vim: sw=4:expandtab:foldmethod=marker 4# 5# Copyright (c) 2006, Mathieu Fenniak 6# Copyright (c) 2007, Ashish Kulkarni <kulkarni.ashish@gmail.com> 7# 8# All rights reserved. 9# 10# Redistribution and use in source and binary forms, with or without 11# modification, are permitted provided that the following conditions are 12# met: 13# 14# * Redistributions of source code must retain the above copyright notice, 15# this list of conditions and the following disclaimer. 16# * Redistributions in binary form must reproduce the above copyright notice, 17# this list of conditions and the following disclaimer in the documentation 18# and/or other materials provided with the distribution. 19# * The name of the author may not be used to endorse or promote products 20# derived from this software without specific prior written permission. 21# 22# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 23# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 24# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 25# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE 26# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 27# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 28# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 29# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 30# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 31# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 32# POSSIBILITY OF SUCH DAMAGE. 33 34""" 35A pure-Python PDF library with an increasing number of capabilities. 36See README for links to FAQ, documentation, homepage, etc. 37""" 38 39__author__ = "Mathieu Fenniak" 40__author_email__ = "biziqe@mathieu.fenniak.net" 41 42__maintainer__ = "Phaseit, Inc." 43__maintainer_email = "PyPDF2@phaseit.net" 44 45import string 46import math 47import struct 48import sys 49import uuid 50from sys import version_info 51if version_info < ( 3, 0 ): 52 from cStringIO import StringIO 53else: 54 from io import StringIO 55 56if version_info < ( 3, 0 ): 57 BytesIO = StringIO 58else: 59 from io import BytesIO 60 61from . import filters 62from . import utils 63import warnings 64import codecs 65from .generic import * 66from .utils import readNonWhitespace, readUntilWhitespace, ConvertFunctionsToVirtualList 67from .utils import isString, b_, u_, ord_, chr_, str_, formatWarning 68 69if version_info < ( 2, 4 ): 70 from sets import ImmutableSet as frozenset 71 72if version_info < ( 2, 5 ): 73 from md5 import md5 74else: 75 from hashlib import md5 76import uuid 77 78 79class PdfFileWriter(object): 80 """ 81 This class supports writing PDF files out, given pages produced by another 82 class (typically :class:`PdfFileReader<PdfFileReader>`). 83 """ 84 def __init__(self): 85 self._header = b_("%PDF-1.3") 86 self._objects = [] # array of indirect objects 87 88 # The root of our page tree node. 89 pages = DictionaryObject() 90 pages.update({ 91 NameObject("/Type"): NameObject("/Pages"), 92 NameObject("/Count"): NumberObject(0), 93 NameObject("/Kids"): ArrayObject(), 94 }) 95 self._pages = self._addObject(pages) 96 97 # info object 98 info = DictionaryObject() 99 info.update({ 100 NameObject("/Producer"): createStringObject(codecs.BOM_UTF16_BE + u_("PyPDF2").encode('utf-16be')) 101 }) 102 self._info = self._addObject(info) 103 104 # root object 105 root = DictionaryObject() 106 root.update({ 107 NameObject("/Type"): NameObject("/Catalog"), 108 NameObject("/Pages"): self._pages, 109 }) 110 self._root = None 111 self._root_object = root 112 113 def _addObject(self, obj): 114 self._objects.append(obj) 115 return IndirectObject(len(self._objects), 0, self) 116 117 def getObject(self, ido): 118 if ido.pdf != self: 119 raise ValueError("pdf must be self") 120 return self._objects[ido.idnum - 1] 121 122 def _addPage(self, page, action): 123 assert page["/Type"] == "/Page" 124 page[NameObject("/Parent")] = self._pages 125 page = self._addObject(page) 126 pages = self.getObject(self._pages) 127 action(pages["/Kids"], page) 128 pages[NameObject("/Count")] = NumberObject(pages["/Count"] + 1) 129 130 def addPage(self, page): 131 """ 132 Adds a page to this PDF file. The page is usually acquired from a 133 :class:`PdfFileReader<PdfFileReader>` instance. 134 135 :param PageObject page: The page to add to the document. Should be 136 an instance of :class:`PageObject<PyPDF2.pdf.PageObject>` 137 """ 138 self._addPage(page, list.append) 139 140 def insertPage(self, page, index=0): 141 """ 142 Insert a page in this PDF file. The page is usually acquired from a 143 :class:`PdfFileReader<PdfFileReader>` instance. 144 145 :param PageObject page: The page to add to the document. This 146 argument should be an instance of :class:`PageObject<pdf.PageObject>`. 147 :param int index: Position at which the page will be inserted. 148 """ 149 self._addPage(page, lambda l, p: l.insert(index, p)) 150 151 def getPage(self, pageNumber): 152 """ 153 Retrieves a page by number from this PDF file. 154 155 :param int pageNumber: The page number to retrieve 156 (pages begin at zero) 157 :return: the page at the index given by *pageNumber* 158 :rtype: :class:`PageObject<pdf.PageObject>` 159 """ 160 pages = self.getObject(self._pages) 161 # XXX: crude hack 162 return pages["/Kids"][pageNumber].getObject() 163 164 def getNumPages(self): 165 """ 166 :return: the number of pages. 167 :rtype: int 168 """ 169 pages = self.getObject(self._pages) 170 return int(pages[NameObject("/Count")]) 171 172 def addBlankPage(self, width=None, height=None): 173 """ 174 Appends a blank page to this PDF file and returns it. If no page size 175 is specified, use the size of the last page. 176 177 :param float width: The width of the new page expressed in default user 178 space units. 179 :param float height: The height of the new page expressed in default 180 user space units. 181 :return: the newly appended page 182 :rtype: :class:`PageObject<PyPDF2.pdf.PageObject>` 183 :raises PageSizeNotDefinedError: if width and height are not defined 184 and previous page does not exist. 185 """ 186 page = PageObject.createBlankPage(self, width, height) 187 self.addPage(page) 188 return page 189 190 def insertBlankPage(self, width=None, height=None, index=0): 191 """ 192 Inserts a blank page to this PDF file and returns it. If no page size 193 is specified, use the size of the last page. 194 195 :param float width: The width of the new page expressed in default user 196 space units. 197 :param float height: The height of the new page expressed in default 198 user space units. 199 :param int index: Position to add the page. 200 :return: the newly appended page 201 :rtype: :class:`PageObject<PyPDF2.pdf.PageObject>` 202 :raises PageSizeNotDefinedError: if width and height are not defined 203 and previous page does not exist. 204 """ 205 if width is None or height is None and \ 206 (self.getNumPages() - 1) >= index: 207 oldpage = self.getPage(index) 208 width = oldpage.mediaBox.getWidth() 209 height = oldpage.mediaBox.getHeight() 210 page = PageObject.createBlankPage(self, width, height) 211 self.insertPage(page, index) 212 return page 213 214 def addJS(self, javascript): 215 """ 216 Add Javascript which will launch upon opening this PDF. 217 218 :param str javascript: Your Javascript. 219 220 >>> output.addJS("this.print({bUI:true,bSilent:false,bShrinkToFit:true});") 221 # Example: This will launch the print window when the PDF is opened. 222 """ 223 js = DictionaryObject() 224 js.update({ 225 NameObject("/Type"): NameObject("/Action"), 226 NameObject("/S"): NameObject("/JavaScript"), 227 NameObject("/JS"): NameObject("(%s)" % javascript) 228 }) 229 js_indirect_object = self._addObject(js) 230 231 # We need a name for parameterized javascript in the pdf file, but it can be anything. 232 js_string_name = str(uuid.uuid4()) 233 234 js_name_tree = DictionaryObject() 235 js_name_tree.update({ 236 NameObject("/JavaScript"): DictionaryObject({ 237 NameObject("/Names"): ArrayObject([createStringObject(js_string_name), js_indirect_object]) 238 }) 239 }) 240 self._addObject(js_name_tree) 241 242 self._root_object.update({ 243 NameObject("/OpenAction"): js_indirect_object, 244 NameObject("/Names"): js_name_tree 245 }) 246 247 def addAttachment(self, fname, fdata): 248 """ 249 Embed a file inside the PDF. 250 251 :param str fname: The filename to display. 252 :param str fdata: The data in the file. 253 254 Reference: 255 https://www.adobe.com/content/dam/Adobe/en/devnet/acrobat/pdfs/PDF32000_2008.pdf 256 Section 7.11.3 257 """ 258 259 # We need 3 entries: 260 # * The file's data 261 # * The /Filespec entry 262 # * The file's name, which goes in the Catalog 263 264 265 # The entry for the file 266 """ Sample: 267 8 0 obj 268 << 269 /Length 12 270 /Type /EmbeddedFile 271 >> 272 stream 273 Hello world! 274 endstream 275 endobj 276 """ 277 file_entry = DecodedStreamObject() 278 file_entry.setData(fdata) 279 file_entry.update({ 280 NameObject("/Type"): NameObject("/EmbeddedFile") 281 }) 282 283 # The Filespec entry 284 """ Sample: 285 7 0 obj 286 << 287 /Type /Filespec 288 /F (hello.txt) 289 /EF << /F 8 0 R >> 290 >> 291 """ 292 efEntry = DictionaryObject() 293 efEntry.update({ NameObject("/F"):file_entry }) 294 295 filespec = DictionaryObject() 296 filespec.update({ 297 NameObject("/Type"): NameObject("/Filespec"), 298 NameObject("/F"): createStringObject(fname), # Perhaps also try TextStringObject 299 NameObject("/EF"): efEntry 300 }) 301 302 # Then create the entry for the root, as it needs a reference to the Filespec 303 """ Sample: 304 1 0 obj 305 << 306 /Type /Catalog 307 /Outlines 2 0 R 308 /Pages 3 0 R 309 /Names << /EmbeddedFiles << /Names [(hello.txt) 7 0 R] >> >> 310 >> 311 endobj 312 313 """ 314 embeddedFilesNamesDictionary = DictionaryObject() 315 embeddedFilesNamesDictionary.update({ 316 NameObject("/Names"): ArrayObject([createStringObject(fname), filespec]) 317 }) 318 319 embeddedFilesDictionary = DictionaryObject() 320 embeddedFilesDictionary.update({ 321 NameObject("/EmbeddedFiles"): embeddedFilesNamesDictionary 322 }) 323 # Update the root 324 self._root_object.update({ 325 NameObject("/Names"): embeddedFilesDictionary 326 }) 327 328 def appendPagesFromReader(self, reader, after_page_append=None): 329 """ 330 Copy pages from reader to writer. Includes an optional callback parameter 331 which is invoked after pages are appended to the writer. 332 333 :param reader: a PdfFileReader object from which to copy page 334 annotations to this writer object. The writer's annots 335 will then be updated 336 :callback after_page_append (function): Callback function that is invoked after 337 each page is appended to the writer. Callback signature: 338 339 :param writer_pageref (PDF page reference): Reference to the page 340 appended to the writer. 341 """ 342 # Get page count from writer and reader 343 reader_num_pages = reader.getNumPages() 344 writer_num_pages = self.getNumPages() 345 346 # Copy pages from reader to writer 347 for rpagenum in range(0, reader_num_pages): 348 reader_page = reader.getPage(rpagenum) 349 self.addPage(reader_page) 350 writer_page = self.getPage(writer_num_pages+rpagenum) 351 # Trigger callback, pass writer page as parameter 352 if callable(after_page_append): after_page_append(writer_page) 353 354 def updatePageFormFieldValues(self, page, fields): 355 ''' 356 Update the form field values for a given page from a fields dictionary. 357 Copy field texts and values from fields to page. 358 359 :param page: Page reference from PDF writer where the annotations 360 and field data will be updated. 361 :param fields: a Python dictionary of field names (/T) and text 362 values (/V) 363 ''' 364 # Iterate through pages, update field values 365 for j in range(0, len(page['/Annots'])): 366 writer_annot = page['/Annots'][j].getObject() 367 for field in fields: 368 if writer_annot.get('/T') == field: 369 writer_annot.update({ 370 NameObject("/V"): TextStringObject(fields[field]) 371 }) 372 373 def cloneReaderDocumentRoot(self, reader): 374 ''' 375 Copy the reader document root to the writer. 376 377 :param reader: PdfFileReader from the document root should be copied. 378 :callback after_page_append 379 ''' 380 self._root_object = reader.trailer['/Root'] 381 382 def cloneDocumentFromReader(self, reader, after_page_append=None): 383 ''' 384 Create a copy (clone) of a document from a PDF file reader 385 386 :param reader: PDF file reader instance from which the clone 387 should be created. 388 :callback after_page_append (function): Callback function that is invoked after 389 each page is appended to the writer. Signature includes a reference to the 390 appended page (delegates to appendPagesFromReader). Callback signature: 391 392 :param writer_pageref (PDF page reference): Reference to the page just 393 appended to the document. 394 ''' 395 self.cloneReaderDocumentRoot(reader) 396 self.appendPagesFromReader(reader, after_page_append) 397 398 def encrypt(self, user_pwd, owner_pwd = None, use_128bit = True): 399 """ 400 Encrypt this PDF file with the PDF Standard encryption handler. 401 402 :param str user_pwd: The "user password", which allows for opening 403 and reading the PDF file with the restrictions provided. 404 :param str owner_pwd: The "owner password", which allows for 405 opening the PDF files without any restrictions. By default, 406 the owner password is the same as the user password. 407 :param bool use_128bit: flag as to whether to use 128bit 408 encryption. When false, 40bit encryption will be used. By default, 409 this flag is on. 410 """ 411 import time, random 412 if owner_pwd == None: 413 owner_pwd = user_pwd 414 if use_128bit: 415 V = 2 416 rev = 3 417 keylen = int(128 / 8) 418 else: 419 V = 1 420 rev = 2 421 keylen = int(40 / 8) 422 # permit everything: 423 P = -1 424 O = ByteStringObject(_alg33(owner_pwd, user_pwd, rev, keylen)) 425 ID_1 = ByteStringObject(md5(b_(repr(time.time()))).digest()) 426 ID_2 = ByteStringObject(md5(b_(repr(random.random()))).digest()) 427 self._ID = ArrayObject((ID_1, ID_2)) 428 if rev == 2: 429 U, key = _alg34(user_pwd, O, P, ID_1) 430 else: 431 assert rev == 3 432 U, key = _alg35(user_pwd, rev, keylen, O, P, ID_1, False) 433 encrypt = DictionaryObject() 434 encrypt[NameObject("/Filter")] = NameObject("/Standard") 435 encrypt[NameObject("/V")] = NumberObject(V) 436 if V == 2: 437 encrypt[NameObject("/Length")] = NumberObject(keylen * 8) 438 encrypt[NameObject("/R")] = NumberObject(rev) 439 encrypt[NameObject("/O")] = ByteStringObject(O) 440 encrypt[NameObject("/U")] = ByteStringObject(U) 441 encrypt[NameObject("/P")] = NumberObject(P) 442 self._encrypt = self._addObject(encrypt) 443 self._encrypt_key = key 444 445 def write(self, stream): 446 """ 447 Writes the collection of pages added to this object out as a PDF file. 448 449 :param stream: An object to write the file to. The object must support 450 the write method and the tell method, similar to a file object. 451 """ 452 if hasattr(stream, 'mode') and 'b' not in stream.mode: 453 warnings.warn("File <%s> to write to is not in binary mode. It may not be written to correctly." % stream.name) 454 debug = False 455 import struct 456 457 if not self._root: 458 self._root = self._addObject(self._root_object) 459 460 externalReferenceMap = {} 461 462 # PDF objects sometimes have circular references to their /Page objects 463 # inside their object tree (for example, annotations). Those will be 464 # indirect references to objects that we've recreated in this PDF. To 465 # address this problem, PageObject's store their original object 466 # reference number, and we add it to the external reference map before 467 # we sweep for indirect references. This forces self-page-referencing 468 # trees to reference the correct new object location, rather than 469 # copying in a new copy of the page object. 470 for objIndex in range(len(self._objects)): 471 obj = self._objects[objIndex] 472 if isinstance(obj, PageObject) and obj.indirectRef != None: 473 data = obj.indirectRef 474 if data.pdf not in externalReferenceMap: 475 externalReferenceMap[data.pdf] = {} 476 if data.generation not in externalReferenceMap[data.pdf]: 477 externalReferenceMap[data.pdf][data.generation] = {} 478 externalReferenceMap[data.pdf][data.generation][data.idnum] = IndirectObject(objIndex + 1, 0, self) 479 480 self.stack = [] 481 if debug: print(("ERM:", externalReferenceMap, "root:", self._root)) 482 self._sweepIndirectReferences(externalReferenceMap, self._root) 483 del self.stack 484 485 # Begin writing: 486 object_positions = [] 487 stream.write(self._header + b_("\n")) 488 for i in range(len(self._objects)): 489 idnum = (i + 1) 490 obj = self._objects[i] 491 object_positions.append(stream.tell()) 492 stream.write(b_(str(idnum) + " 0 obj\n")) 493 key = None 494 if hasattr(self, "_encrypt") and idnum != self._encrypt.idnum: 495 pack1 = struct.pack("<i", i + 1)[:3] 496 pack2 = struct.pack("<i", 0)[:2] 497 key = self._encrypt_key + pack1 + pack2 498 assert len(key) == (len(self._encrypt_key) + 5) 499 md5_hash = md5(key).digest() 500 key = md5_hash[:min(16, len(self._encrypt_key) + 5)] 501 obj.writeToStream(stream, key) 502 stream.write(b_("\nendobj\n")) 503 504 # xref table 505 xref_location = stream.tell() 506 stream.write(b_("xref\n")) 507 stream.write(b_("0 %s\n" % (len(self._objects) + 1))) 508 stream.write(b_("%010d %05d f \n" % (0, 65535))) 509 for offset in object_positions: 510 stream.write(b_("%010d %05d n \n" % (offset, 0))) 511 512 # trailer 513 stream.write(b_("trailer\n")) 514 trailer = DictionaryObject() 515 trailer.update({ 516 NameObject("/Size"): NumberObject(len(self._objects) + 1), 517 NameObject("/Root"): self._root, 518 NameObject("/Info"): self._info, 519 }) 520 if hasattr(self, "_ID"): 521 trailer[NameObject("/ID")] = self._ID 522 if hasattr(self, "_encrypt"): 523 trailer[NameObject("/Encrypt")] = self._encrypt 524 trailer.writeToStream(stream, None) 525 526 # eof 527 stream.write(b_("\nstartxref\n%s\n%%%%EOF\n" % (xref_location))) 528 529 def addMetadata(self, infos): 530 """ 531 Add custom metadata to the output. 532 533 :param dict infos: a Python dictionary where each key is a field 534 and each value is your new metadata. 535 """ 536 args = {} 537 for key, value in list(infos.items()): 538 args[NameObject(key)] = createStringObject(value) 539 self.getObject(self._info).update(args) 540 541 def _sweepIndirectReferences(self, externMap, data): 542 debug = False 543 if debug: print((data, "TYPE", data.__class__.__name__)) 544 if isinstance(data, DictionaryObject): 545 for key, value in list(data.items()): 546 origvalue = value 547 value = self._sweepIndirectReferences(externMap, value) 548 if isinstance(value, StreamObject): 549 # a dictionary value is a stream. streams must be indirect 550 # objects, so we need to change this value. 551 value = self._addObject(value) 552 data[key] = value 553 return data 554 elif isinstance(data, ArrayObject): 555 for i in range(len(data)): 556 value = self._sweepIndirectReferences(externMap, data[i]) 557 if isinstance(value, StreamObject): 558 # an array value is a stream. streams must be indirect 559 # objects, so we need to change this value 560 value = self._addObject(value) 561 data[i] = value 562 return data 563 elif isinstance(data, IndirectObject): 564 # internal indirect references are fine 565 if data.pdf == self: 566 if data.idnum in self.stack: 567 return data 568 else: 569 self.stack.append(data.idnum) 570 realdata = self.getObject(data) 571 self._sweepIndirectReferences(externMap, realdata) 572 return data 573 else: 574 newobj = externMap.get(data.pdf, {}).get(data.generation, {}).get(data.idnum, None) 575 if newobj == None: 576 try: 577 newobj = data.pdf.getObject(data) 578 self._objects.append(None) # placeholder 579 idnum = len(self._objects) 580 newobj_ido = IndirectObject(idnum, 0, self) 581 if data.pdf not in externMap: 582 externMap[data.pdf] = {} 583 if data.generation not in externMap[data.pdf]: 584 externMap[data.pdf][data.generation] = {} 585 externMap[data.pdf][data.generation][data.idnum] = newobj_ido 586 newobj = self._sweepIndirectReferences(externMap, newobj) 587 self._objects[idnum-1] = newobj 588 return newobj_ido 589 except ValueError: 590 # Unable to resolve the Object, returning NullObject instead. 591 return NullObject() 592 return newobj 593 else: 594 return data 595 596 def getReference(self, obj): 597 idnum = self._objects.index(obj) + 1 598 ref = IndirectObject(idnum, 0, self) 599 assert ref.getObject() == obj 600 return ref 601 602 def getOutlineRoot(self): 603 if '/Outlines' in self._root_object: 604 outline = self._root_object['/Outlines'] 605 idnum = self._objects.index(outline) + 1 606 outlineRef = IndirectObject(idnum, 0, self) 607 assert outlineRef.getObject() == outline 608 else: 609 outline = TreeObject() 610 outline.update({ }) 611 outlineRef = self._addObject(outline) 612 self._root_object[NameObject('/Outlines')] = outlineRef 613 614 return outline 615 616 def getNamedDestRoot(self): 617 if '/Names' in self._root_object and isinstance(self._root_object['/Names'], DictionaryObject): 618 names = self._root_object['/Names'] 619 idnum = self._objects.index(names) + 1 620 namesRef = IndirectObject(idnum, 0, self) 621 assert namesRef.getObject() == names 622 if '/Dests' in names and isinstance(names['/Dests'], DictionaryObject): 623 dests = names['/Dests'] 624 idnum = self._objects.index(dests) + 1 625 destsRef = IndirectObject(idnum, 0, self) 626 assert destsRef.getObject() == dests 627 if '/Names' in dests: 628 nd = dests['/Names'] 629 else: 630 nd = ArrayObject() 631 dests[NameObject('/Names')] = nd 632 else: 633 dests = DictionaryObject() 634 destsRef = self._addObject(dests) 635 names[NameObject('/Dests')] = destsRef 636 nd = ArrayObject() 637 dests[NameObject('/Names')] = nd 638 639 else: 640 names = DictionaryObject() 641 namesRef = self._addObject(names) 642 self._root_object[NameObject('/Names')] = namesRef 643 dests = DictionaryObject() 644 destsRef = self._addObject(dests) 645 names[NameObject('/Dests')] = destsRef 646 nd = ArrayObject() 647 dests[NameObject('/Names')] = nd 648 649 return nd 650 651 def addBookmarkDestination(self, dest, parent=None): 652 destRef = self._addObject(dest) 653 654 outlineRef = self.getOutlineRoot() 655 656 if parent == None: 657 parent = outlineRef 658 659 parent = parent.getObject() 660 #print parent.__class__.__name__ 661 parent.addChild(destRef, self) 662 663 return destRef 664 665 def addBookmarkDict(self, bookmark, parent=None): 666 bookmarkObj = TreeObject() 667 for k, v in list(bookmark.items()): 668 bookmarkObj[NameObject(str(k))] = v 669 bookmarkObj.update(bookmark) 670 671 if '/A' in bookmark: 672 action = DictionaryObject() 673 for k, v in list(bookmark['/A'].items()): 674 action[NameObject(str(k))] = v 675 actionRef = self._addObject(action) 676 bookmarkObj[NameObject('/A')] = actionRef 677 678 bookmarkRef = self._addObject(bookmarkObj) 679 680 outlineRef = self.getOutlineRoot() 681 682 if parent == None: 683 parent = outlineRef 684 685 parent = parent.getObject() 686 parent.addChild(bookmarkRef, self) 687 688 return bookmarkRef 689 690 def addBookmark(self, title, pagenum, parent=None, color=None, bold=False, italic=False, fit='/Fit', *args): 691 """ 692 Add a bookmark to this PDF file. 693 694 :param str title: Title to use for this bookmark. 695 :param int pagenum: Page number this bookmark will point to. 696 :param parent: A reference to a parent bookmark to create nested 697 bookmarks. 698 :param tuple color: Color of the bookmark as a red, green, blue tuple 699 from 0.0 to 1.0 700 :param bool bold: Bookmark is bold 701 :param bool italic: Bookmark is italic 702 :param str fit: The fit of the destination page. See 703 :meth:`addLink()<addLink>` for details. 704 """ 705 pageRef = self.getObject(self._pages)['/Kids'][pagenum] 706 action = DictionaryObject() 707 zoomArgs = [] 708 for a in args: 709 if a is not None: 710 zoomArgs.append(NumberObject(a)) 711 else: 712 zoomArgs.append(NullObject()) 713 dest = Destination(NameObject("/"+title + " bookmark"), pageRef, NameObject(fit), *zoomArgs) 714 destArray = dest.getDestArray() 715 action.update({ 716 NameObject('/D') : destArray, 717 NameObject('/S') : NameObject('/GoTo') 718 }) 719 actionRef = self._addObject(action) 720 721 outlineRef = self.getOutlineRoot() 722 723 if parent == None: 724 parent = outlineRef 725 726 bookmark = TreeObject() 727 728 bookmark.update({ 729 NameObject('/A'): actionRef, 730 NameObject('/Title'): createStringObject(title), 731 }) 732 733 if color is not None: 734 bookmark.update({NameObject('/C'): ArrayObject([FloatObject(c) for c in color])}) 735 736 format = 0 737 if italic: 738 format += 1 739 if bold: 740 format += 2 741 if format: 742 bookmark.update({NameObject('/F'): NumberObject(format)}) 743 744 bookmarkRef = self._addObject(bookmark) 745 746 parent = parent.getObject() 747 parent.addChild(bookmarkRef, self) 748 749 return bookmarkRef 750 751 def addNamedDestinationObject(self, dest): 752 destRef = self._addObject(dest) 753 754 nd = self.getNamedDestRoot() 755 nd.extend([dest['/Title'], destRef]) 756 757 return destRef 758 759 def addNamedDestination(self, title, pagenum): 760 pageRef = self.getObject(self._pages)['/Kids'][pagenum] 761 dest = DictionaryObject() 762 dest.update({ 763 NameObject('/D') : ArrayObject([pageRef, NameObject('/FitH'), NumberObject(826)]), 764 NameObject('/S') : NameObject('/GoTo') 765 }) 766 767 destRef = self._addObject(dest) 768 nd = self.getNamedDestRoot() 769 770 nd.extend([title, destRef]) 771 772 return destRef 773 774 def removeLinks(self): 775 """ 776 Removes links and annotations from this output. 777 """ 778 pages = self.getObject(self._pages)['/Kids'] 779 for page in pages: 780 pageRef = self.getObject(page) 781 if "/Annots" in pageRef: 782 del pageRef['/Annots'] 783 784 def removeImages(self, ignoreByteStringObject=False): 785 """ 786 Removes images from this output. 787 788 :param bool ignoreByteStringObject: optional parameter 789 to ignore ByteString Objects. 790 """ 791 pages = self.getObject(self._pages)['/Kids'] 792 for j in range(len(pages)): 793 page = pages[j] 794 pageRef = self.getObject(page) 795 content = pageRef['/Contents'].getObject() 796 if not isinstance(content, ContentStream): 797 content = ContentStream(content, pageRef) 798 799 _operations = [] 800 seq_graphics = False 801 for operands, operator in content.operations: 802 if operator == b_('Tj'): 803 text = operands[0] 804 if ignoreByteStringObject: 805 if not isinstance(text, TextStringObject): 806 operands[0] = TextStringObject() 807 elif operator == b_("'"): 808 text = operands[0] 809 if ignoreByteStringObject: 810 if not isinstance(text, TextStringObject): 811 operands[0] = TextStringObject() 812 elif operator == b_('"'): 813 text = operands[2] 814 if ignoreByteStringObject: 815 if not isinstance(text, TextStringObject): 816 operands[2] = TextStringObject() 817 elif operator == b_("TJ"): 818 for i in range(len(operands[0])): 819 if ignoreByteStringObject: 820 if not isinstance(operands[0][i], TextStringObject): 821 operands[0][i] = TextStringObject() 822 823 if operator == b_('q'): 824 seq_graphics = True 825 if operator == b_('Q'): 826 seq_graphics = False 827 if seq_graphics: 828 if operator in [b_('cm'), b_('w'), b_('J'), b_('j'), b_('M'), b_('d'), b_('ri'), b_('i'), 829 b_('gs'), b_('W'), b_('b'), b_('s'), b_('S'), b_('f'), b_('F'), b_('n'), b_('m'), b_('l'), 830 b_('c'), b_('v'), b_('y'), b_('h'), b_('B'), b_('Do'), b_('sh')]: 831 continue 832 if operator == b_('re'): 833 continue 834 _operations.append((operands, operator)) 835 836 content.operations = _operations 837 pageRef.__setitem__(NameObject('/Contents'), content) 838 839 def removeText(self, ignoreByteStringObject=False): 840 """ 841 Removes images from this output. 842 843 :param bool ignoreByteStringObject: optional parameter 844 to ignore ByteString Objects. 845 """ 846 pages = self.getObject(self._pages)['/Kids'] 847 for j in range(len(pages)): 848 page = pages[j] 849 pageRef = self.getObject(page) 850 content = pageRef['/Contents'].getObject() 851 if not isinstance(content, ContentStream): 852 content = ContentStream(content, pageRef) 853 for operands,operator in content.operations: 854 if operator == b_('Tj'): 855 text = operands[0] 856 if not ignoreByteStringObject: 857 if isinstance(text, TextStringObject): 858 operands[0] = TextStringObject() 859 else: 860 if isinstance(text, TextStringObject) or \ 861 isinstance(text, ByteStringObject): 862 operands[0] = TextStringObject() 863 elif operator == b_("'"): 864 text = operands[0] 865 if not ignoreByteStringObject: 866 if isinstance(text, TextStringObject): 867 operands[0] = TextStringObject() 868 else: 869 if isinstance(text, TextStringObject) or \ 870 isinstance(text, ByteStringObject): 871 operands[0] = TextStringObject() 872 elif operator == b_('"'): 873 text = operands[2] 874 if not ignoreByteStringObject: 875 if isinstance(text, TextStringObject): 876 operands[2] = TextStringObject() 877 else: 878 if isinstance(text, TextStringObject) or \ 879 isinstance(text, ByteStringObject): 880 operands[2] = TextStringObject() 881 elif operator == b_("TJ"): 882 for i in range(len(operands[0])): 883 if not ignoreByteStringObject: 884 if isinstance(operands[0][i], TextStringObject): 885 operands[0][i] = TextStringObject() 886 else: 887 if isinstance(operands[0][i], TextStringObject) or \ 888 isinstance(operands[0][i], ByteStringObject): 889 operands[0][i] = TextStringObject() 890 891 pageRef.__setitem__(NameObject('/Contents'), content) 892 893 def addLink(self, pagenum, pagedest, rect, border=None, fit='/Fit', *args): 894 """ 895 Add an internal link from a rectangular area to the specified page. 896 897 :param int pagenum: index of the page on which to place the link. 898 :param int pagedest: index of the page to which the link should go. 899 :param rect: :class:`RectangleObject<PyPDF2.generic.RectangleObject>` or array of four 900 integers specifying the clickable rectangular area 901 ``[xLL, yLL, xUR, yUR]``, or string in the form ``"[ xLL yLL xUR yUR ]"``. 902 :param border: if provided, an array describing border-drawing 903 properties. See the PDF spec for details. No border will be 904 drawn if this argument is omitted. 905 :param str fit: Page fit or 'zoom' option (see below). Additional arguments may need 906 to be supplied. Passing ``None`` will be read as a null value for that coordinate. 907 908 Valid zoom arguments (see Table 8.2 of the PDF 1.7 reference for details): 909 /Fit No additional arguments 910 /XYZ [left] [top] [zoomFactor] 911 /FitH [top] 912 /FitV [left] 913 /FitR [left] [bottom] [right] [top] 914 /FitB No additional arguments 915 /FitBH [top] 916 /FitBV [left] 917 """ 918 919 pageLink = self.getObject(self._pages)['/Kids'][pagenum] 920 pageDest = self.getObject(self._pages)['/Kids'][pagedest] #TODO: switch for external link 921 pageRef = self.getObject(pageLink) 922 923 if border is not None: 924 borderArr = [NameObject(n) for n in border[:3]] 925 if len(border) == 4: 926 dashPattern = ArrayObject([NameObject(n) for n in border[3]]) 927 borderArr.append(dashPattern) 928 else: 929 borderArr = [NumberObject(0)] * 3 930 931 if isString(rect): 932 rect = NameObject(rect) 933 elif isinstance(rect, RectangleObject): 934 pass 935 else: 936 rect = RectangleObject(rect) 937 938 zoomArgs = [] 939 for a in args: 940 if a is not None: 941 zoomArgs.append(NumberObject(a)) 942 else: 943 zoomArgs.append(NullObject()) 944 dest = Destination(NameObject("/LinkName"), pageDest, NameObject(fit), *zoomArgs) #TODO: create a better name for the link 945 destArray = dest.getDestArray() 946 947 lnk = DictionaryObject() 948 lnk.update({ 949 NameObject('/Type'): NameObject('/Annot'), 950 NameObject('/Subtype'): NameObject('/Link'), 951 NameObject('/P'): pageLink, 952 NameObject('/Rect'): rect, 953 NameObject('/Border'): ArrayObject(borderArr), 954 NameObject('/Dest'): destArray 955 }) 956 lnkRef = self._addObject(lnk) 957 958 if "/Annots" in pageRef: 959 pageRef['/Annots'].append(lnkRef) 960 else: 961 pageRef[NameObject('/Annots')] = ArrayObject([lnkRef]) 962 963 _valid_layouts = ['/NoLayout', '/SinglePage', '/OneColumn', '/TwoColumnLeft', '/TwoColumnRight', '/TwoPageLeft', '/TwoPageRight'] 964 965 def getPageLayout(self): 966 """ 967 Get the page layout. 968 See :meth:`setPageLayout()<PdfFileWriter.setPageLayout>` for a description of valid layouts. 969 970 :return: Page layout currently being used. 971 :rtype: str, None if not specified 972 """ 973 try: 974 return self._root_object['/PageLayout'] 975 except KeyError: 976 return None 977 978 def setPageLayout(self, layout): 979 """ 980 Set the page layout 981 982 :param str layout: The page layout to be used 983 984 Valid layouts are: 985 /NoLayout Layout explicitly not specified 986 /SinglePage Show one page at a time 987 /OneColumn Show one column at a time 988 /TwoColumnLeft Show pages in two columns, odd-numbered pages on the left 989 /TwoColumnRight Show pages in two columns, odd-numbered pages on the right 990 /TwoPageLeft Show two pages at a time, odd-numbered pages on the left 991 /TwoPageRight Show two pages at a time, odd-numbered pages on the right 992 """ 993 if not isinstance(layout, NameObject): 994 if layout not in self._valid_layouts: 995 warnings.warn("Layout should be one of: {}".format(', '.join(self._valid_layouts))) 996 layout = NameObject(layout) 997 self._root_object.update({NameObject('/PageLayout'): layout}) 998 999 pageLayout = property(getPageLayout, setPageLayout) 1000 """Read and write property accessing the :meth:`getPageLayout()<PdfFileWriter.getPageLayout>` 1001 and :meth:`setPageLayout()<PdfFileWriter.setPageLayout>` methods.""" 1002 1003 _valid_modes = ['/UseNone', '/UseOutlines', '/UseThumbs', '/FullScreen', '/UseOC', '/UseAttachments'] 1004 1005 def getPageMode(self): 1006 """ 1007 Get the page mode. 1008 See :meth:`setPageMode()<PdfFileWriter.setPageMode>` for a description 1009 of valid modes. 1010 1011 :return: Page mode currently being used. 1012 :rtype: str, None if not specified 1013 """ 1014 try: 1015 return self._root_object['/PageMode'] 1016 except KeyError: 1017 return None 1018 1019 def setPageMode(self, mode): 1020 """ 1021 Set the page mode. 1022 1023 :param str mode: The page mode to use. 1024 1025 Valid modes are: 1026 /UseNone Do not show outlines or thumbnails panels 1027 /UseOutlines Show outlines (aka bookmarks) panel 1028 /UseThumbs Show page thumbnails panel 1029 /FullScreen Fullscreen view 1030 /UseOC Show Optional Content Group (OCG) panel 1031 /UseAttachments Show attachments panel 1032 """ 1033 if not isinstance(mode, NameObject): 1034 if mode not in self._valid_modes: 1035 warnings.warn("Mode should be one of: {}".format(', '.join(self._valid_modes))) 1036 mode = NameObject(mode) 1037 self._root_object.update({NameObject('/PageMode'): mode}) 1038 1039 pageMode = property(getPageMode, setPageMode) 1040 """Read and write property accessing the :meth:`getPageMode()<PdfFileWriter.getPageMode>` 1041 and :meth:`setPageMode()<PdfFileWriter.setPageMode>` methods.""" 1042 1043 1044class PdfFileReader(object): 1045 """ 1046 Initializes a PdfFileReader object. This operation can take some time, as 1047 the PDF stream's cross-reference tables are read into memory. 1048 1049 :param stream: A File object or an object that supports the standard read 1050 and seek methods similar to a File object. Could also be a 1051 string representing a path to a PDF file. 1052 :param bool strict: Determines whether user should be warned of all 1053 problems and also causes some correctable problems to be fatal. 1054 Defaults to ``True``. 1055 :param warndest: Destination for logging warnings (defaults to 1056 ``sys.stderr``). 1057 :param bool overwriteWarnings: Determines whether to override Python's 1058 ``warnings.py`` module with a custom implementation (defaults to 1059 ``True``). 1060 """ 1061 def __init__(self, stream, strict=True, warndest = None, overwriteWarnings = True): 1062 if overwriteWarnings: 1063 # have to dynamically override the default showwarning since there are no 1064 # public methods that specify the 'file' parameter 1065 def _showwarning(message, category, filename, lineno, file=warndest, line=None): 1066 if file is None: 1067 file = sys.stderr 1068 try: 1069 file.write(formatWarning(message, category, filename, lineno, line)) 1070 except IOError: 1071 pass 1072 warnings.showwarning = _showwarning 1073 self.strict = strict 1074 self.flattenedPages = None 1075 self.resolvedObjects = {} 1076 self.xrefIndex = 0 1077 self._pageId2Num = None # map page IndirectRef number to Page Number 1078 if hasattr(stream, 'mode') and 'b' not in stream.mode: 1079 warnings.warn("PdfFileReader stream/file object is not in binary mode. It may not be read correctly.", utils.PdfReadWarning) 1080 if isString(stream): 1081 fileobj = open(stream, 'rb') 1082 stream = BytesIO(b_(fileobj.read())) 1083 fileobj.close() 1084 self.read(stream) 1085 self.stream = stream 1086 1087 self._override_encryption = False 1088 1089 def getDocumentInfo(self): 1090 """ 1091 Retrieves the PDF file's document information dictionary, if it exists. 1092 Note that some PDF files use metadata streams instead of docinfo 1093 dictionaries, and these metadata streams will not be accessed by this 1094 function. 1095 1096 :return: the document information of this PDF file 1097 :rtype: :class:`DocumentInformation<pdf.DocumentInformation>` or ``None`` if none exists. 1098 """ 1099 if "/Info" not in self.trailer: 1100 return None 1101 obj = self.trailer['/Info'] 1102 retval = DocumentInformation() 1103 retval.update(obj) 1104 return retval 1105 1106 documentInfo = property(lambda self: self.getDocumentInfo(), None, None) 1107 """Read-only property that accesses the :meth:`getDocumentInfo()<PdfFileReader.getDocumentInfo>` function.""" 1108 1109 def getXmpMetadata(self): 1110 """ 1111 Retrieves XMP (Extensible Metadata Platform) data from the PDF document 1112 root. 1113 1114 :return: a :class:`XmpInformation<xmp.XmpInformation>` 1115 instance that can be used to access XMP metadata from the document. 1116 :rtype: :class:`XmpInformation<xmp.XmpInformation>` or 1117 ``None`` if no metadata was found on the document root. 1118 """ 1119 try: 1120 self._override_encryption = True 1121 return self.trailer["/Root"].getXmpMetadata() 1122 finally: 1123 self._override_encryption = False 1124 1125 xmpMetadata = property(lambda self: self.getXmpMetadata(), None, None) 1126 """ 1127 Read-only property that accesses the 1128 :meth:`getXmpMetadata()<PdfFileReader.getXmpMetadata>` function. 1129 """ 1130 1131 def getNumPages(self): 1132 """ 1133 Calculates the number of pages in this PDF file. 1134 1135 :return: number of pages 1136 :rtype: int 1137 :raises PdfReadError: if file is encrypted and restrictions prevent 1138 this action. 1139 """ 1140 1141 # Flattened pages will not work on an Encrypted PDF; 1142 # the PDF file's page count is used in this case. Otherwise, 1143 # the original method (flattened page count) is used. 1144 if self.isEncrypted: 1145 try: 1146 self._override_encryption = True 1147 self.decrypt('') 1148 return self.trailer["/Root"]["/Pages"]["/Count"] 1149 except: 1150 raise utils.PdfReadError("File has not been decrypted") 1151 finally: 1152 self._override_encryption = False 1153 else: 1154 if self.flattenedPages == None: 1155 self._flatten() 1156 return len(self.flattenedPages) 1157 1158 numPages = property(lambda self: self.getNumPages(), None, None) 1159 """ 1160 Read-only property that accesses the 1161 :meth:`getNumPages()<PdfFileReader.getNumPages>` function. 1162 """ 1163 1164 def getPage(self, pageNumber): 1165 """ 1166 Retrieves a page by number from this PDF file. 1167 1168 :param int pageNumber: The page number to retrieve 1169 (pages begin at zero) 1170 :return: a :class:`PageObject<pdf.PageObject>` instance. 1171 :rtype: :class:`PageObject<pdf.PageObject>` 1172 """ 1173 ## ensure that we're not trying to access an encrypted PDF 1174 #assert not self.trailer.has_key("/Encrypt") 1175 if self.flattenedPages == None: 1176 self._flatten() 1177 return self.flattenedPages[pageNumber] 1178 1179 namedDestinations = property(lambda self: 1180 self.getNamedDestinations(), None, None) 1181 """ 1182 Read-only property that accesses the 1183 :meth:`getNamedDestinations()<PdfFileReader.getNamedDestinations>` function. 1184 """ 1185 1186 # A select group of relevant field attributes. For the complete list, 1187 # see section 8.6.2 of the PDF 1.7 reference. 1188 1189 def getFields(self, tree = None, retval = None, fileobj = None): 1190 """ 1191 Extracts field data if this PDF contains interactive form fields. 1192 The *tree* and *retval* parameters are for recursive use. 1193 1194 :param fileobj: A file object (usually a text file) to write 1195 a report to on all interactive form fields found. 1196 :return: A dictionary where each key is a field name, and each 1197 value is a :class:`Field<PyPDF2.generic.Field>` object. By 1198 default, the mapping name is used for keys. 1199 :rtype: dict, or ``None`` if form data could not be located. 1200 """ 1201 fieldAttributes = {"/FT" : "Field Type", "/Parent" : "Parent", 1202 "/T" : "Field Name", "/TU" : "Alternate Field Name", 1203 "/TM" : "Mapping Name", "/Ff" : "Field Flags", 1204 "/V" : "Value", "/DV" : "Default Value"} 1205 if retval == None: 1206 retval = {} 1207 catalog = self.trailer["/Root"] 1208 # get the AcroForm tree 1209 if "/AcroForm" in catalog: 1210 tree = catalog["/AcroForm"] 1211 else: 1212 return None 1213 if tree == None: 1214 return retval 1215 1216 self._checkKids(tree, retval, fileobj) 1217 for attr in fieldAttributes: 1218 if attr in tree: 1219 # Tree is a field 1220 self._buildField(tree, retval, fileobj, fieldAttributes) 1221 break 1222 1223 if "/Fields" in tree: 1224 fields = tree["/Fields"] 1225 for f in fields: 1226 field = f.getObject() 1227 self._buildField(field, retval, fileobj, fieldAttributes) 1228 1229 return retval 1230 1231 def _buildField(self, field, retval, fileobj, fieldAttributes): 1232 self._checkKids(field, retval, fileobj) 1233 try: 1234 key = field["/TM"] 1235 except KeyError: 1236 try: 1237 key = field["/T"] 1238 except KeyError: 1239 # Ignore no-name field for now 1240 return 1241 if fileobj: 1242 self._writeField(fileobj, field, fieldAttributes) 1243 fileobj.write("\n") 1244 retval[key] = Field(field) 1245 1246 def _checkKids(self, tree, retval, fileobj): 1247 if "/Kids" in tree: 1248 # recurse down the tree 1249 for kid in tree["/Kids"]: 1250 self.getFields(kid.getObject(), retval, fileobj) 1251 1252 def _writeField(self, fileobj, field, fieldAttributes): 1253 order = ["/TM", "/T", "/FT", "/Parent", "/TU", "/Ff", "/V", "/DV"] 1254 for attr in order: 1255 attrName = fieldAttributes[attr] 1256 try: 1257 if attr == "/FT": 1258 # Make the field type value more clear 1259 types = {"/Btn":"Button", "/Tx":"Text", "/Ch": "Choice", 1260 "/Sig":"Signature"} 1261 if field[attr] in types: 1262 fileobj.write(attrName + ": " + types[field[attr]] + "\n") 1263 elif attr == "/Parent": 1264 # Let's just write the name of the parent 1265 try: 1266 name = field["/Parent"]["/TM"] 1267 except KeyError: 1268 name = field["/Parent"]["/T"] 1269 fileobj.write(attrName + ": " + name + "\n") 1270 else: 1271 fileobj.write(attrName + ": " + str(field[attr]) + "\n") 1272 except KeyError: 1273 # Field attribute is N/A or unknown, so don't write anything 1274 pass 1275 1276 def getFormTextFields(self): 1277 ''' Retrieves form fields from the document with textual data (inputs, dropdowns) 1278 ''' 1279 # Retrieve document form fields 1280 formfields = self.getFields() 1281 return dict( 1282 (formfields[field]['/T'], formfields[field].get('/V')) for field in formfields \ 1283 if formfields[field].get('/FT') == '/Tx' 1284 ) 1285 1286 def getNamedDestinations(self, tree=None, retval=None): 1287 """ 1288 Retrieves the named destinations present in the document. 1289 1290 :return: a dictionary which maps names to 1291 :class:`Destinations<PyPDF2.generic.Destination>`. 1292 :rtype: dict 1293 """ 1294 if retval == None: 1295 retval = {} 1296 catalog = self.trailer["/Root"] 1297 1298 # get the name tree 1299 if "/Dests" in catalog: 1300 tree = catalog["/Dests"] 1301 elif "/Names" in catalog: 1302 names = catalog['/Names'] 1303 if "/Dests" in names: 1304 tree = names['/Dests'] 1305 1306 if tree == None: 1307 return retval 1308 1309 if "/Kids" in tree: 1310 # recurse down the tree 1311 for kid in tree["/Kids"]: 1312 self.getNamedDestinations(kid.getObject(), retval) 1313 1314 if "/Names" in tree: 1315 names = tree["/Names"] 1316 for i in range(0, len(names), 2): 1317 key = names[i].getObject() 1318 val = names[i+1].getObject() 1319 if isinstance(val, DictionaryObject) and '/D' in val: 1320 val = val['/D'] 1321 dest = self._buildDestination(key, val) 1322 if dest != None: 1323 retval[key] = dest 1324 1325 return retval 1326 1327 outlines = property(lambda self: self.getOutlines(), None, None) 1328 """ 1329 Read-only property that accesses the 1330 :meth:`getOutlines()<PdfFileReader.getOutlines>` function. 1331 """ 1332 1333 def getOutlines(self, node=None, outlines=None): 1334 """ 1335 Retrieves the document outline present in the document. 1336 1337 :return: a nested list of :class:`Destinations<PyPDF2.generic.Destination>`. 1338 """ 1339 if outlines == None: 1340 outlines = [] 1341 catalog = self.trailer["/Root"] 1342 1343 # get the outline dictionary and named destinations 1344 if "/Outlines" in catalog: 1345 try: 1346 lines = catalog["/Outlines"] 1347 except utils.PdfReadError: 1348 # this occurs if the /Outlines object reference is incorrect 1349 # for an example of such a file, see https://unglueit-files.s3.amazonaws.com/ebf/7552c42e9280b4476e59e77acc0bc812.pdf 1350 # so continue to load the file without the Bookmarks 1351 return outlines 1352 1353 if "/First" in lines: 1354 node = lines["/First"] 1355 self._namedDests = self.getNamedDestinations() 1356 1357 if node == None: 1358 return outlines 1359 1360 # see if there are any more outlines 1361 while True: 1362 outline = self._buildOutline(node) 1363 if outline: 1364 outlines.append(outline) 1365 1366 # check for sub-outlines 1367 if "/First" in node: 1368 subOutlines = [] 1369 self.getOutlines(node["/First"], subOutlines) 1370 if subOutlines: 1371 outlines.append(subOutlines) 1372 1373 if "/Next" not in node: 1374 break 1375 node = node["/Next"] 1376 1377 return outlines 1378 1379 def _getPageNumberByIndirect(self, indirectRef): 1380 """Generate _pageId2Num""" 1381 if self._pageId2Num is None: 1382 id2num = {} 1383 for i, x in enumerate(self.pages): 1384 id2num[x.indirectRef.idnum] = i 1385 self._pageId2Num = id2num 1386 1387 if isinstance(indirectRef, int): 1388 idnum = indirectRef 1389 else: 1390 idnum = indirectRef.idnum 1391 1392 ret = self._pageId2Num.get(idnum, -1) 1393 return ret 1394 1395 def getPageNumber(self, page): 1396 """ 1397 Retrieve page number of a given PageObject 1398 1399 :param PageObject page: The page to get page number. Should be 1400 an instance of :class:`PageObject<PyPDF2.pdf.PageObject>` 1401 :return: the page number or -1 if page not found 1402 :rtype: int 1403 """ 1404 indirectRef = page.indirectRef 1405 ret = self._getPageNumberByIndirect(indirectRef) 1406 return ret 1407 1408 def getDestinationPageNumber(self, destination): 1409 """ 1410 Retrieve page number of a given Destination object 1411 1412 :param Destination destination: The destination to get page number. 1413 Should be an instance of 1414 :class:`Destination<PyPDF2.pdf.Destination>` 1415 :return: the page number or -1 if page not found 1416 :rtype: int 1417 """ 1418 indirectRef = destination.page 1419 ret = self._getPageNumberByIndirect(indirectRef) 1420 return ret 1421 1422 def _buildDestination(self, title, array): 1423 page, typ = array[0:2] 1424 array = array[2:] 1425 return Destination(title, page, typ, *array) 1426 1427 def _buildOutline(self, node): 1428 dest, title, outline = None, None, None 1429 1430 if "/A" in node and "/Title" in node: 1431 # Action, section 8.5 (only type GoTo supported) 1432 title = node["/Title"] 1433 action = node["/A"] 1434 if action["/S"] == "/GoTo": 1435 dest = action["/D"] 1436 elif "/Dest" in node and "/Title" in node: 1437 # Destination, section 8.2.1 1438 title = node["/Title"] 1439 dest = node["/Dest"] 1440 1441 # if destination found, then create outline 1442 if dest: 1443 if isinstance(dest, ArrayObject): 1444 outline = self._buildDestination(title, dest) 1445 elif isString(dest) and dest in self._namedDests: 1446 outline = self._namedDests[dest] 1447 outline[NameObject("/Title")] = title 1448 else: 1449 raise utils.PdfReadError("Unexpected destination %r" % dest) 1450 return outline 1451 1452 pages = property(lambda self: ConvertFunctionsToVirtualList(self.getNumPages, self.getPage), 1453 None, None) 1454 """ 1455 Read-only property that emulates a list based upon the 1456 :meth:`getNumPages()<PdfFileReader.getNumPages>` and 1457 :meth:`getPage()<PdfFileReader.getPage>` methods. 1458 """ 1459 1460 def getPageLayout(self): 1461 """ 1462 Get the page layout. 1463 See :meth:`setPageLayout()<PdfFileWriter.setPageLayout>` 1464 for a description of valid layouts. 1465 1466 :return: Page layout currently being used. 1467 :rtype: ``str``, ``None`` if not specified 1468 """ 1469 try: 1470 return self.trailer['/Root']['/PageLayout'] 1471 except KeyError: 1472 return None 1473 1474 pageLayout = property(getPageLayout) 1475 """Read-only property accessing the 1476 :meth:`getPageLayout()<PdfFileReader.getPageLayout>` method.""" 1477 1478 def getPageMode(self): 1479 """ 1480 Get the page mode. 1481 See :meth:`setPageMode()<PdfFileWriter.setPageMode>` 1482 for a description of valid modes. 1483 1484 :return: Page mode currently being used. 1485 :rtype: ``str``, ``None`` if not specified 1486 """ 1487 try: 1488 return self.trailer['/Root']['/PageMode'] 1489 except KeyError: 1490 return None 1491 1492 pageMode = property(getPageMode) 1493 """Read-only property accessing the 1494 :meth:`getPageMode()<PdfFileReader.getPageMode>` method.""" 1495 1496 def _flatten(self, pages=None, inherit=None, indirectRef=None): 1497 inheritablePageAttributes = ( 1498 NameObject("/Resources"), NameObject("/MediaBox"), 1499 NameObject("/CropBox"), NameObject("/Rotate") 1500 ) 1501 if inherit == None: 1502 inherit = dict() 1503 if pages == None: 1504 self.flattenedPages = [] 1505 catalog = self.trailer["/Root"].getObject() 1506 pages = catalog["/Pages"].getObject() 1507 1508 t = "/Pages" 1509 if "/Type" in pages: 1510 t = pages["/Type"] 1511 1512 if t == "/Pages": 1513 for attr in inheritablePageAttributes: 1514 if attr in pages: 1515 inherit[attr] = pages[attr] 1516 for page in pages["/Kids"]: 1517 addt = {} 1518 if isinstance(page, IndirectObject): 1519 addt["indirectRef"] = page 1520 self._flatten(page.getObject(), inherit, **addt) 1521 elif t == "/Page": 1522 for attr, value in list(inherit.items()): 1523 # if the page has it's own value, it does not inherit the 1524 # parent's value: 1525 if attr not in pages: 1526 pages[attr] = value 1527 pageObj = PageObject(self, indirectRef) 1528 pageObj.update(pages) 1529 self.flattenedPages.append(pageObj) 1530 1531 def _getObjectFromStream(self, indirectReference): 1532 # indirect reference to object in object stream 1533 # read the entire object stream into memory 1534 debug = False 1535 stmnum, idx = self.xref_objStm[indirectReference.idnum] 1536 if debug: print(("Here1: %s %s"%(stmnum, idx))) 1537 objStm = IndirectObject(stmnum, 0, self).getObject() 1538 if debug: print(("Here2: objStm=%s.. stmnum=%s data=%s"%(objStm, stmnum, objStm.getData()))) 1539 # This is an xref to a stream, so its type better be a stream 1540 assert objStm['/Type'] == '/ObjStm' 1541 # /N is the number of indirect objects in the stream 1542 assert idx < objStm['/N'] 1543 streamData = BytesIO(b_(objStm.getData())) 1544 for i in range(objStm['/N']): 1545 readNonWhitespace(streamData) 1546 streamData.seek(-1, 1) 1547 objnum = NumberObject.readFromStream(streamData) 1548 readNonWhitespace(streamData) 1549 streamData.seek(-1, 1) 1550 offset = NumberObject.readFromStream(streamData) 1551 readNonWhitespace(streamData) 1552 streamData.seek(-1, 1) 1553 if objnum != indirectReference.idnum: 1554 # We're only interested in one object 1555 continue 1556 if self.strict and idx != i: 1557 raise utils.PdfReadError("Object is in wrong index.") 1558 streamData.seek(objStm['/First']+offset, 0) 1559 if debug: 1560 pos = streamData.tell() 1561 streamData.seek(0, 0) 1562 lines = streamData.readlines() 1563 for i in range(0, len(lines)): 1564 print((lines[i])) 1565 streamData.seek(pos, 0) 1566 try: 1567 obj = readObject(streamData, self) 1568 except utils.PdfStreamError as e: 1569 # Stream object cannot be read. Normally, a critical error, but 1570 # Adobe Reader doesn't complain, so continue (in strict mode?) 1571 e = sys.exc_info()[1] 1572 warnings.warn("Invalid stream (index %d) within object %d %d: %s" % \ 1573 (i, indirectReference.idnum, indirectReference.generation, e), utils.PdfReadWarning) 1574 1575 if self.strict: 1576 raise utils.PdfReadError("Can't read object stream: %s"%e) 1577 # Replace with null. Hopefully it's nothing important. 1578 obj = NullObject() 1579 return obj 1580 1581 if self.strict: raise utils.PdfReadError("This is a fatal error in strict mode.") 1582 return NullObject() 1583 1584 def getObject(self, indirectReference): 1585 debug = False 1586 if debug: print(("looking at:", indirectReference.idnum, indirectReference.generation)) 1587 retval = self.cacheGetIndirectObject(indirectReference.generation, 1588 indirectReference.idnum) 1589 if retval != None: 1590 return retval 1591 if indirectReference.generation == 0 and \ 1592 indirectReference.idnum in self.xref_objStm: 1593 retval = self._getObjectFromStream(indirectReference) 1594 elif indirectReference.generation in self.xref and \ 1595 indirectReference.idnum in self.xref[indirectReference.generation]: 1596 start = self.xref[indirectReference.generation][indirectReference.idnum] 1597 if debug: print((" Uncompressed Object", indirectReference.idnum, indirectReference.generation, ":", start)) 1598 self.stream.seek(start, 0) 1599 idnum, generation = self.readObjectHeader(self.stream) 1600 if idnum != indirectReference.idnum and self.xrefIndex: 1601 # Xref table probably had bad indexes due to not being zero-indexed 1602 if self.strict: 1603 raise utils.PdfReadError("Expected object ID (%d %d) does not match actual (%d %d); xref table not zero-indexed." \ 1604 % (indirectReference.idnum, indirectReference.generation, idnum, generation)) 1605 else: pass # xref table is corrected in non-strict mode 1606 elif idnum != indirectReference.idnum: 1607 # some other problem 1608 raise utils.PdfReadError("Expected object ID (%d %d) does not match actual (%d %d)." \ 1609 % (indirectReference.idnum, indirectReference.generation, idnum, generation)) 1610 assert generation == indirectReference.generation 1611 retval = readObject(self.stream, self) 1612 1613 # override encryption is used for the /Encrypt dictionary 1614 if not self._override_encryption and self.isEncrypted: 1615 # if we don't have the encryption key: 1616 if not hasattr(self, '_decryption_key'): 1617 raise utils.PdfReadError("file has not been decrypted") 1618 # otherwise, decrypt here... 1619 import struct 1620 pack1 = struct.pack("<i", indirectReference.idnum)[:3] 1621 pack2 = struct.pack("<i", indirectReference.generation)[:2] 1622 key = self._decryption_key + pack1 + pack2 1623 assert len(key) == (len(self._decryption_key) + 5) 1624 md5_hash = md5(key).digest() 1625 key = md5_hash[:min(16, len(self._decryption_key) + 5)] 1626 retval = self._decryptObject(retval, key) 1627 else: 1628 warnings.warn("Object %d %d not defined."%(indirectReference.idnum, 1629 indirectReference.generation), utils.PdfReadWarning) 1630 #if self.strict: 1631 raise utils.PdfReadError("Could not find object.") 1632 self.cacheIndirectObject(indirectReference.generation, 1633 indirectReference.idnum, retval) 1634 return retval 1635 1636 def _decryptObject(self, obj, key): 1637 if isinstance(obj, ByteStringObject) or isinstance(obj, TextStringObject): 1638 obj = createStringObject(utils.RC4_encrypt(key, obj.original_bytes)) 1639 elif isinstance(obj, StreamObject): 1640 obj._data = utils.RC4_encrypt(key, obj._data) 1641 elif isinstance(obj, DictionaryObject): 1642 for dictkey, value in list(obj.items()): 1643 obj[dictkey] = self._decryptObject(value, key) 1644 elif isinstance(obj, ArrayObject): 1645 for i in range(len(obj)): 1646 obj[i] = self._decryptObject(obj[i], key) 1647 return obj 1648 1649 def readObjectHeader(self, stream): 1650 # Should never be necessary to read out whitespace, since the 1651 # cross-reference table should put us in the right spot to read the 1652 # object header. In reality... some files have stupid cross reference 1653 # tables that are off by whitespace bytes. 1654 extra = False 1655 utils.skipOverComment(stream) 1656 extra |= utils.skipOverWhitespace(stream); stream.seek(-1, 1) 1657 idnum = readUntilWhitespace(stream) 1658 extra |= utils.skipOverWhitespace(stream); stream.seek(-1, 1) 1659 generation = readUntilWhitespace(stream) 1660 obj = stream.read(3) 1661 readNonWhitespace(stream) 1662 stream.seek(-1, 1) 1663 if (extra and self.strict): 1664 #not a fatal error 1665 warnings.warn("Superfluous whitespace found in object header %s %s" % \ 1666 (idnum, generation), utils.PdfReadWarning) 1667 return int(idnum), int(generation) 1668 1669 def cacheGetIndirectObject(self, generation, idnum): 1670 debug = False 1671 out = self.resolvedObjects.get((generation, idnum)) 1672 if debug and out: print(("cache hit: %d %d"%(idnum, generation))) 1673 elif debug: print(("cache miss: %d %d"%(idnum, generation))) 1674 return out 1675 1676 def cacheIndirectObject(self, generation, idnum, obj): 1677 # return None # Sometimes we want to turn off cache for debugging. 1678 if (generation, idnum) in self.resolvedObjects: 1679 msg = "Overwriting cache for %s %s"%(generation, idnum) 1680 if self.strict: raise utils.PdfReadError(msg) 1681 else: warnings.warn(msg) 1682 self.resolvedObjects[(generation, idnum)] = obj 1683 return obj 1684 1685 def read(self, stream): 1686 debug = False 1687 if debug: print(">>read", stream) 1688 # start at the end: 1689 stream.seek(-1, 2) 1690 if not stream.tell(): 1691 raise utils.PdfReadError('Cannot read an empty file') 1692 last1K = stream.tell() - 1024 + 1 # offset of last 1024 bytes of stream 1693 line = b_('') 1694 while line[:5] != b_("%%EOF"): 1695 if stream.tell() < last1K: 1696 raise utils.PdfReadError("EOF marker not found") 1697 line = self.readNextEndLine(stream) 1698 if debug: print(" line:",line) 1699 1700 # find startxref entry - the location of the xref table 1701 line = self.readNextEndLine(stream) 1702 try: 1703 startxref = int(line) 1704 except ValueError: 1705 # 'startxref' may be on the same line as the location 1706 if not line.startswith(b_("startxref")): 1707 raise utils.PdfReadError("startxref not found") 1708 startxref = int(line[9:].strip()) 1709 warnings.warn("startxref on same line as offset") 1710 else: 1711 line = self.readNextEndLine(stream) 1712 if line[:9] != b_("startxref"): 1713 raise utils.PdfReadError("startxref not found") 1714 1715 # read all cross reference tables and their trailers 1716 self.xref = {} 1717 self.xref_objStm = {} 1718 self.trailer = DictionaryObject() 1719 while True: 1720 # load the xref table 1721 stream.seek(startxref, 0) 1722 x = stream.read(1) 1723 if x == b_("x"): 1724 # standard cross-reference table 1725 ref = stream.read(4) 1726 if ref[:3] != b_("ref"): 1727 raise utils.PdfReadError("xref table read error") 1728 readNonWhitespace(stream) 1729 stream.seek(-1, 1) 1730 firsttime = True; # check if the first time looking at the xref table 1731 while True: 1732 num = readObject(stream, self) 1733 if firsttime and num != 0: 1734 self.xrefIndex = num 1735 if self.strict: 1736 warnings.warn("Xref table not zero-indexed. ID numbers for objects will be corrected.", utils.PdfReadWarning) 1737 #if table not zero indexed, could be due to error from when PDF was created 1738 #which will lead to mismatched indices later on, only warned and corrected if self.strict=True 1739 firsttime = False 1740 readNonWhitespace(stream) 1741 stream.seek(-1, 1) 1742 size = readObject(stream, self) 1743 readNonWhitespace(stream) 1744 stream.seek(-1, 1) 1745 cnt = 0 1746 while cnt < size: 1747 line = stream.read(20) 1748 1749 # It's very clear in section 3.4.3 of the PDF spec 1750 # that all cross-reference table lines are a fixed 1751 # 20 bytes (as of PDF 1.7). However, some files have 1752 # 21-byte entries (or more) due to the use of \r\n 1753 # (CRLF) EOL's. Detect that case, and adjust the line 1754 # until it does not begin with a \r (CR) or \n (LF). 1755 while line[0] in b_("\x0D\x0A"): 1756 stream.seek(-20 + 1, 1) 1757 line = stream.read(20) 1758 1759 # On the other hand, some malformed PDF files 1760 # use a single character EOL without a preceeding 1761 # space. Detect that case, and seek the stream 1762 # back one character. (0-9 means we've bled into 1763 # the next xref entry, t means we've bled into the 1764 # text "trailer"): 1765 if line[-1] in b_("0123456789t"): 1766 stream.seek(-1, 1) 1767 1768 offset, generation = line[:16].split(b_(" ")) 1769 offset, generation = int(offset), int(generation) 1770 if generation not in self.xref: 1771 self.xref[generation] = {} 1772 if num in self.xref[generation]: 1773 # It really seems like we should allow the last 1774 # xref table in the file to override previous 1775 # ones. Since we read the file backwards, assume 1776 # any existing key is already set correctly. 1777 pass 1778 else: 1779 self.xref[generation][num] = offset 1780 cnt += 1 1781 num += 1 1782 readNonWhitespace(stream) 1783 stream.seek(-1, 1) 1784 trailertag = stream.read(7) 1785 if trailertag != b_("trailer"): 1786 # more xrefs! 1787 stream.seek(-7, 1) 1788 else: 1789 break 1790 readNonWhitespace(stream) 1791 stream.seek(-1, 1) 1792 newTrailer = readObject(stream, self) 1793 for key, value in list(newTrailer.items()): 1794 if key not in self.trailer: 1795 self.trailer[key] = value 1796 if "/Prev" in newTrailer: 1797 startxref = newTrailer["/Prev"] 1798 else: 1799 break 1800 elif x.isdigit(): 1801 # PDF 1.5+ Cross-Reference Stream 1802 stream.seek(-1, 1) 1803 idnum, generation = self.readObjectHeader(stream) 1804 xrefstream = readObject(stream, self) 1805 assert xrefstream["/Type"] == "/XRef" 1806 self.cacheIndirectObject(generation, idnum, xrefstream) 1807 streamData = BytesIO(b_(xrefstream.getData())) 1808 # Index pairs specify the subsections in the dictionary. If 1809 # none create one subsection that spans everything. 1810 idx_pairs = xrefstream.get("/Index", [0, xrefstream.get("/Size")]) 1811 if debug: print(("read idx_pairs=%s"%list(self._pairs(idx_pairs)))) 1812 entrySizes = xrefstream.get("/W") 1813 assert len(entrySizes) >= 3 1814 if self.strict and len(entrySizes) > 3: 1815 raise utils.PdfReadError("Too many entry sizes: %s" %entrySizes) 1816 1817 def getEntry(i): 1818 # Reads the correct number of bytes for each entry. See the 1819 # discussion of the W parameter in PDF spec table 17. 1820 if entrySizes[i] > 0: 1821 d = streamData.read(entrySizes[i]) 1822 return convertToInt(d, entrySizes[i]) 1823 1824 # PDF Spec Table 17: A value of zero for an element in the 1825 # W array indicates...the default value shall be used 1826 if i == 0: return 1 # First value defaults to 1 1827 else: return 0 1828 1829 def used_before(num, generation): 1830 # We move backwards through the xrefs, don't replace any. 1831 return num in self.xref.get(generation, []) or \ 1832 num in self.xref_objStm 1833 1834 # Iterate through each subsection 1835 last_end = 0 1836 for start, size in self._pairs(idx_pairs): 1837 # The subsections must increase 1838 assert start >= last_end 1839 last_end = start + size 1840 for num in range(start, start+size): 1841 # The first entry is the type 1842 xref_type = getEntry(0) 1843 # The rest of the elements depend on the xref_type 1844 if xref_type == 0: 1845 # linked list of free objects 1846 next_free_object = getEntry(1) 1847 next_generation = getEntry(2) 1848 elif xref_type == 1: 1849 # objects that are in use but are not compressed 1850 byte_offset = getEntry(1) 1851 generation = getEntry(2) 1852 if generation not in self.xref: 1853 self.xref[generation] = {} 1854 if not used_before(num, generation): 1855 self.xref[generation][num] = byte_offset 1856 if debug: print(("XREF Uncompressed: %s %s"%( 1857 num, generation))) 1858 elif xref_type == 2: 1859 # compressed objects 1860 objstr_num = getEntry(1) 1861 obstr_idx = getEntry(2) 1862 generation = 0 # PDF spec table 18, generation is 0 1863 if not used_before(num, generation): 1864 if debug: print(("XREF Compressed: %s %s %s"%( 1865 num, objstr_num, obstr_idx))) 1866 self.xref_objStm[num] = (objstr_num, obstr_idx) 1867 elif self.strict: 1868 raise utils.PdfReadError("Unknown xref type: %s"% 1869 xref_type) 1870 1871 trailerKeys = "/Root", "/Encrypt", "/Info", "/ID" 1872 for key in trailerKeys: 1873 if key in xrefstream and key not in self.trailer: 1874 self.trailer[NameObject(key)] = xrefstream.raw_get(key) 1875 if "/Prev" in xrefstream: 1876 startxref = xrefstream["/Prev"] 1877 else: 1878 break 1879 else: 1880 # bad xref character at startxref. Let's see if we can find 1881 # the xref table nearby, as we've observed this error with an 1882 # off-by-one before. 1883 stream.seek(-11, 1) 1884 tmp = stream.read(20) 1885 xref_loc = tmp.find(b_("xref")) 1886 if xref_loc != -1: 1887 startxref -= (10 - xref_loc) 1888 continue 1889 # No explicit xref table, try finding a cross-reference stream. 1890 stream.seek(startxref, 0) 1891 found = False 1892 for look in range(5): 1893 if stream.read(1).isdigit(): 1894 # This is not a standard PDF, consider adding a warning 1895 startxref += look 1896 found = True 1897 break 1898 if found: 1899 continue 1900 # no xref table found at specified location 1901 raise utils.PdfReadError("Could not find xref table at specified location") 1902 #if not zero-indexed, verify that the table is correct; change it if necessary 1903 if self.xrefIndex and not self.strict: 1904 loc = stream.tell() 1905 for gen in self.xref: 1906 if gen == 65535: continue 1907 for id in self.xref[gen]: 1908 stream.seek(self.xref[gen][id], 0) 1909 try: 1910 pid, pgen = self.readObjectHeader(stream) 1911 except ValueError: 1912 break 1913 if pid == id - self.xrefIndex: 1914 self._zeroXref(gen) 1915 break 1916 #if not, then either it's just plain wrong, or the non-zero-index is actually correct 1917 stream.seek(loc, 0) #return to where it was 1918 1919 def _zeroXref(self, generation): 1920 self.xref[generation] = dict( (k-self.xrefIndex, v) for (k, v) in list(self.xref[generation].items()) ) 1921 1922 def _pairs(self, array): 1923 i = 0 1924 while True: 1925 yield array[i], array[i+1] 1926 i += 2 1927 if (i+1) >= len(array): 1928 break 1929 1930 def readNextEndLine(self, stream): 1931 debug = False 1932 if debug: print(">>readNextEndLine") 1933 line = b_("") 1934 while True: 1935 # Prevent infinite loops in malformed PDFs 1936 if stream.tell() == 0: 1937 raise utils.PdfReadError("Could not read malformed PDF file") 1938 x = stream.read(1) 1939 if debug: print((" x:", x, "%x"%ord(x))) 1940 if stream.tell() < 2: 1941 raise utils.PdfReadError("EOL marker not found") 1942 stream.seek(-2, 1) 1943 if x == b_('\n') or x == b_('\r'): ## \n = LF; \r = CR 1944 crlf = False 1945 while x == b_('\n') or x == b_('\r'): 1946 if debug: 1947 if ord(x) == 0x0D: print(" x is CR 0D") 1948 elif ord(x) == 0x0A: print(" x is LF 0A") 1949 x = stream.read(1) 1950 if x == b_('\n') or x == b_('\r'): # account for CR+LF 1951 stream.seek(-1, 1) 1952 crlf = True 1953 if stream.tell() < 2: 1954 raise utils.PdfReadError("EOL marker not found") 1955 stream.seek(-2, 1) 1956 stream.seek(2 if crlf else 1, 1) #if using CR+LF, go back 2 bytes, else 1 1957 break 1958 else: 1959 if debug: print(" x is neither") 1960 line = x + line 1961 if debug: print((" RNEL line:", line)) 1962 if debug: print("leaving RNEL") 1963 return line 1964 1965 def decrypt(self, password): 1966 """ 1967 When using an encrypted / secured PDF file with the PDF Standard 1968 encryption handler, this function will allow the file to be decrypted. 1969 It checks the given password against the document's user password and 1970 owner password, and then stores the resulting decryption key if either 1971 password is correct. 1972 1973 It does not matter which password was matched. Both passwords provide 1974 the correct decryption key that will allow the document to be used with 1975 this library. 1976 1977 :param str password: The password to match. 1978 :return: ``0`` if the password failed, ``1`` if the password matched the user 1979 password, and ``2`` if the password matched the owner password. 1980 :rtype: int 1981 :raises NotImplementedError: if document uses an unsupported encryption 1982 method. 1983 """ 1984 1985 self._override_encryption = True 1986 try: 1987 return self._decrypt(password) 1988 finally: 1989 self._override_encryption = False 1990 1991 def _decrypt(self, password): 1992 encrypt = self.trailer['/Encrypt'].getObject() 1993 if encrypt['/Filter'] != '/Standard': 1994 raise NotImplementedError("only Standard PDF encryption handler is available") 1995 if not (encrypt['/V'] in (1, 2)): 1996 raise NotImplementedError("only algorithm code 1 and 2 are supported") 1997 user_password, key = self._authenticateUserPassword(password) 1998 if user_password: 1999 self._decryption_key = key 2000 return 1 2001 else: 2002 rev = encrypt['/R'].getObject() 2003 if rev == 2: 2004 keylen = 5 2005 else: 2006 keylen = encrypt['/Length'].getObject() // 8 2007 key = _alg33_1(password, rev, keylen) 2008 real_O = encrypt["/O"].getObject() 2009 if rev == 2: 2010 userpass = utils.RC4_encrypt(key, real_O) 2011 else: 2012 val = real_O 2013 for i in range(19, -1, -1): 2014 new_key = b_('') 2015 for l in range(len(key)): 2016 new_key += b_(chr(utils.ord_(key[l]) ^ i)) 2017 val = utils.RC4_encrypt(new_key, val) 2018 userpass = val 2019 owner_password, key = self._authenticateUserPassword(userpass) 2020 if owner_password: 2021 self._decryption_key = key 2022 return 2 2023 return 0 2024 2025 def _authenticateUserPassword(self, password): 2026 encrypt = self.trailer['/Encrypt'].getObject() 2027 rev = encrypt['/R'].getObject() 2028 owner_entry = encrypt['/O'].getObject() 2029 p_entry = encrypt['/P'].getObject() 2030 id_entry = self.trailer['/ID'].getObject() 2031 id1_entry = id_entry[0].getObject() 2032 real_U = encrypt['/U'].getObject().original_bytes 2033 if rev == 2: 2034 U, key = _alg34(password, owner_entry, p_entry, id1_entry) 2035 elif rev >= 3: 2036 U, key = _alg35(password, rev, 2037 encrypt["/Length"].getObject() // 8, owner_entry, 2038 p_entry, id1_entry, 2039 encrypt.get("/EncryptMetadata", BooleanObject(False)).getObject()) 2040 U, real_U = U[:16], real_U[:16] 2041 return U == real_U, key 2042 2043 def getIsEncrypted(self): 2044 return "/Encrypt" in self.trailer 2045 2046 isEncrypted = property(lambda self: self.getIsEncrypted(), None, None) 2047 """ 2048 Read-only boolean property showing whether this PDF file is encrypted. 2049 Note that this property, if true, will remain true even after the 2050 :meth:`decrypt()<PdfFileReader.decrypt>` method is called. 2051 """ 2052 2053 2054def getRectangle(self, name, defaults): 2055 retval = self.get(name) 2056 if isinstance(retval, RectangleObject): 2057 return retval 2058 if retval == None: 2059 for d in defaults: 2060 retval = self.get(d) 2061 if retval != None: 2062 break 2063 if isinstance(retval, IndirectObject): 2064 retval = self.pdf.getObject(retval) 2065 retval = RectangleObject(retval) 2066 setRectangle(self, name, retval) 2067 return retval 2068 2069 2070def setRectangle(self, name, value): 2071 if not isinstance(name, NameObject): 2072 name = NameObject(name) 2073 self[name] = value 2074 2075 2076def deleteRectangle(self, name): 2077 del self[name] 2078 2079 2080def createRectangleAccessor(name, fallback): 2081 return \ 2082 property( 2083 lambda self: getRectangle(self, name, fallback), 2084 lambda self, value: setRectangle(self, name, value), 2085 lambda self: deleteRectangle(self, name) 2086 ) 2087 2088 2089class PageObject(DictionaryObject): 2090 """ 2091 This class represents a single page within a PDF file. Typically this 2092 object will be created by accessing the 2093 :meth:`getPage()<PyPDF2.PdfFileReader.getPage>` method of the 2094 :class:`PdfFileReader<PyPDF2.PdfFileReader>` class, but it is 2095 also possible to create an empty page with the 2096 :meth:`createBlankPage()<PageObject.createBlankPage>` static method. 2097 2098 :param pdf: PDF file the page belongs to. 2099 :param indirectRef: Stores the original indirect reference to 2100 this object in its source PDF 2101 """ 2102 def __init__(self, pdf=None, indirectRef=None): 2103 DictionaryObject.__init__(self) 2104 self.pdf = pdf 2105 self.indirectRef = indirectRef 2106 2107 def createBlankPage(pdf=None, width=None, height=None): 2108 """ 2109 Returns a new blank page. 2110 If ``width`` or ``height`` is ``None``, try to get the page size 2111 from the last page of *pdf*. 2112 2113 :param pdf: PDF file the page belongs to 2114 :param float width: The width of the new page expressed in default user 2115 space units. 2116 :param float height: The height of the new page expressed in default user 2117 space units. 2118 :return: the new blank page: 2119 :rtype: :class:`PageObject<PageObject>` 2120 :raises PageSizeNotDefinedError: if ``pdf`` is ``None`` or contains 2121 no page 2122 """ 2123 page = PageObject(pdf) 2124 2125 # Creates a new page (cf PDF Reference 7.7.3.3) 2126 page.__setitem__(NameObject('/Type'), NameObject('/Page')) 2127 page.__setitem__(NameObject('/Parent'), NullObject()) 2128 page.__setitem__(NameObject('/Resources'), DictionaryObject()) 2129 if width is None or height is None: 2130 if pdf is not None and pdf.getNumPages() > 0: 2131 lastpage = pdf.getPage(pdf.getNumPages() - 1) 2132 width = lastpage.mediaBox.getWidth() 2133 height = lastpage.mediaBox.getHeight() 2134 else: 2135 raise utils.PageSizeNotDefinedError() 2136 page.__setitem__(NameObject('/MediaBox'), 2137 RectangleObject([0, 0, width, height])) 2138 2139 return page 2140 createBlankPage = staticmethod(createBlankPage) 2141 2142 def rotateClockwise(self, angle): 2143 """ 2144 Rotates a page clockwise by increments of 90 degrees. 2145 2146 :param int angle: Angle to rotate the page. Must be an increment 2147 of 90 deg. 2148 """ 2149 assert angle % 90 == 0 2150 self._rotate(angle) 2151 return self 2152 2153 def rotateCounterClockwise(self, angle): 2154 """ 2155 Rotates a page counter-clockwise by increments of 90 degrees. 2156 2157 :param int angle: Angle to rotate the page. Must be an increment 2158 of 90 deg. 2159 """ 2160 assert angle % 90 == 0 2161 self._rotate(-angle) 2162 return self 2163 2164 def _rotate(self, angle): 2165 currentAngle = self.get("/Rotate", 0) 2166 self[NameObject("/Rotate")] = NumberObject(currentAngle + angle) 2167 2168 def _mergeResources(res1, res2, resource): 2169 newRes = DictionaryObject() 2170 newRes.update(res1.get(resource, DictionaryObject()).getObject()) 2171 page2Res = res2.get(resource, DictionaryObject()).getObject() 2172 renameRes = {} 2173 for key in list(page2Res.keys()): 2174 if key in newRes and newRes.raw_get(key) != page2Res.raw_get(key): 2175 newname = NameObject(key + str(uuid.uuid4())) 2176 renameRes[key] = newname 2177 newRes[newname] = page2Res[key] 2178 elif key not in newRes: 2179 newRes[key] = page2Res.raw_get(key) 2180 return newRes, renameRes 2181 _mergeResources = staticmethod(_mergeResources) 2182 2183 def _contentStreamRename(stream, rename, pdf): 2184 if not rename: 2185 return stream 2186 stream = ContentStream(stream, pdf) 2187 for operands, operator in stream.operations: 2188 for i in range(len(operands)): 2189 op = operands[i] 2190 if isinstance(op, NameObject): 2191 operands[i] = rename.get(op,op) 2192 return stream 2193 _contentStreamRename = staticmethod(_contentStreamRename) 2194 2195 def _pushPopGS(contents, pdf): 2196 # adds a graphics state "push" and "pop" to the beginning and end 2197 # of a content stream. This isolates it from changes such as 2198 # transformation matricies. 2199 stream = ContentStream(contents, pdf) 2200 stream.operations.insert(0, [[], "q"]) 2201 stream.operations.append([[], "Q"]) 2202 return stream 2203 _pushPopGS = staticmethod(_pushPopGS) 2204 2205 def _addTransformationMatrix(contents, pdf, ctm): 2206 # adds transformation matrix at the beginning of the given 2207 # contents stream. 2208 a, b, c, d, e, f = ctm 2209 contents = ContentStream(contents, pdf) 2210 contents.operations.insert(0, [[FloatObject(a), FloatObject(b), 2211 FloatObject(c), FloatObject(d), FloatObject(e), 2212 FloatObject(f)], " cm"]) 2213 return contents 2214 _addTransformationMatrix = staticmethod(_addTransformationMatrix) 2215 2216 def getContents(self): 2217 """ 2218 Accesses the page contents. 2219 2220 :return: the ``/Contents`` object, or ``None`` if it doesn't exist. 2221 ``/Contents`` is optional, as described in PDF Reference 7.7.3.3 2222 """ 2223 if "/Contents" in self: 2224 return self["/Contents"].getObject() 2225 else: 2226 return None 2227 2228 def mergePage(self, page2): 2229 """ 2230 Merges the content streams of two pages into one. Resource references 2231 (i.e. fonts) are maintained from both pages. The mediabox/cropbox/etc 2232 of this page are not altered. The parameter page's content stream will 2233 be added to the end of this page's content stream, meaning that it will 2234 be drawn after, or "on top" of this page. 2235 2236 :param PageObject page2: The page to be merged into this one. Should be 2237 an instance of :class:`PageObject<PageObject>`. 2238 """ 2239 self._mergePage(page2) 2240 2241 def _mergePage(self, page2, page2transformation=None, ctm=None, expand=False): 2242 # First we work on merging the resource dictionaries. This allows us 2243 # to find out what symbols in the content streams we might need to 2244 # rename. 2245 2246 newResources = DictionaryObject() 2247 rename = {} 2248 originalResources = self["/Resources"].getObject() 2249 page2Resources = page2["/Resources"].getObject() 2250 newAnnots = ArrayObject() 2251 2252 for page in (self, page2): 2253 if "/Annots" in page: 2254 annots = page["/Annots"] 2255 if isinstance(annots, ArrayObject): 2256 for ref in annots: 2257 newAnnots.append(ref) 2258 2259 for res in "/ExtGState", "/Font", "/XObject", "/ColorSpace", "/Pattern", "/Shading", "/Properties": 2260 new, newrename = PageObject._mergeResources(originalResources, page2Resources, res) 2261 if new: 2262 newResources[NameObject(res)] = new 2263 rename.update(newrename) 2264 2265 # Combine /ProcSet sets. 2266 newResources[NameObject("/ProcSet")] = ArrayObject( 2267 frozenset(originalResources.get("/ProcSet", ArrayObject()).getObject()).union( 2268 frozenset(page2Resources.get("/ProcSet", ArrayObject()).getObject()) 2269 ) 2270 ) 2271 2272 newContentArray = ArrayObject() 2273 2274 originalContent = self.getContents() 2275 if originalContent is not None: 2276 newContentArray.append(PageObject._pushPopGS( 2277 originalContent, self.pdf)) 2278 2279 page2Content = page2.getContents() 2280 if page2Content is not None: 2281 if page2transformation is not None: 2282 page2Content = page2transformation(page2Content) 2283 page2Content = PageObject._contentStreamRename( 2284 page2Content, rename, self.pdf) 2285 page2Content = PageObject._pushPopGS(page2Content, self.pdf) 2286 newContentArray.append(page2Content) 2287 2288 # if expanding the page to fit a new page, calculate the new media box size 2289 if expand: 2290 corners1 = [self.mediaBox.getLowerLeft_x().as_numeric(), self.mediaBox.getLowerLeft_y().as_numeric(), 2291 self.mediaBox.getUpperRight_x().as_numeric(), self.mediaBox.getUpperRight_y().as_numeric()] 2292 corners2 = [page2.mediaBox.getLowerLeft_x().as_numeric(), page2.mediaBox.getLowerLeft_y().as_numeric(), 2293 page2.mediaBox.getUpperLeft_x().as_numeric(), page2.mediaBox.getUpperLeft_y().as_numeric(), 2294 page2.mediaBox.getUpperRight_x().as_numeric(), page2.mediaBox.getUpperRight_y().as_numeric(), 2295 page2.mediaBox.getLowerRight_x().as_numeric(), page2.mediaBox.getLowerRight_y().as_numeric()] 2296 if ctm is not None: 2297 ctm = [float(x) for x in ctm] 2298 new_x = [ctm[0]*corners2[i] + ctm[2]*corners2[i+1] + ctm[4] for i in range(0, 8, 2)] 2299 new_y = [ctm[1]*corners2[i] + ctm[3]*corners2[i+1] + ctm[5] for i in range(0, 8, 2)] 2300 else: 2301 new_x = corners2[0:8:2] 2302 new_y = corners2[1:8:2] 2303 lowerleft = [min(new_x), min(new_y)] 2304 upperright = [max(new_x), max(new_y)] 2305 lowerleft = [min(corners1[0], lowerleft[0]), min(corners1[1], lowerleft[1])] 2306 upperright = [max(corners1[2], upperright[0]), max(corners1[3], upperright[1])] 2307 2308 self.mediaBox.setLowerLeft(lowerleft) 2309 self.mediaBox.setUpperRight(upperright) 2310 2311 self[NameObject('/Contents')] = ContentStream(newContentArray, self.pdf) 2312 self[NameObject('/Resources')] = newResources 2313 self[NameObject('/Annots')] = newAnnots 2314 2315 def mergeTransformedPage(self, page2, ctm, expand=False): 2316 """ 2317 This is similar to mergePage, but a transformation matrix is 2318 applied to the merged stream. 2319 2320 :param PageObject page2: The page to be merged into this one. Should be 2321 an instance of :class:`PageObject<PageObject>`. 2322 :param tuple ctm: a 6-element tuple containing the operands of the 2323 transformation matrix 2324 :param bool expand: Whether the page should be expanded to fit the dimensions 2325 of the page to be merged. 2326 """ 2327 self._mergePage(page2, lambda page2Content: 2328 PageObject._addTransformationMatrix(page2Content, page2.pdf, ctm), ctm, expand) 2329 2330 def mergeScaledPage(self, page2, scale, expand=False): 2331 """ 2332 This is similar to mergePage, but the stream to be merged is scaled 2333 by appling a transformation matrix. 2334 2335 :param PageObject page2: The page to be merged into this one. Should be 2336 an instance of :class:`PageObject<PageObject>`. 2337 :param float scale: The scaling factor 2338 :param bool expand: Whether the page should be expanded to fit the 2339 dimensions of the page to be merged. 2340 """ 2341 # CTM to scale : [ sx 0 0 sy 0 0 ] 2342 return self.mergeTransformedPage(page2, [scale, 0, 2343 0, scale, 2344 0, 0], expand) 2345 2346 def mergeRotatedPage(self, page2, rotation, expand=False): 2347 """ 2348 This is similar to mergePage, but the stream to be merged is rotated 2349 by appling a transformation matrix. 2350 2351 :param PageObject page2: the page to be merged into this one. Should be 2352 an instance of :class:`PageObject<PageObject>`. 2353 :param float rotation: The angle of the rotation, in degrees 2354 :param bool expand: Whether the page should be expanded to fit the 2355 dimensions of the page to be merged. 2356 """ 2357 rotation = math.radians(rotation) 2358 return self.mergeTransformedPage(page2, 2359 [math.cos(rotation), math.sin(rotation), 2360 -math.sin(rotation), math.cos(rotation), 2361 0, 0], expand) 2362 2363 def mergeTranslatedPage(self, page2, tx, ty, expand=False): 2364 """ 2365 This is similar to mergePage, but the stream to be merged is translated 2366 by appling a transformation matrix. 2367 2368 :param PageObject page2: the page to be merged into this one. Should be 2369 an instance of :class:`PageObject<PageObject>`. 2370 :param float tx: The translation on X axis 2371 :param float ty: The translation on Y axis 2372 :param bool expand: Whether the page should be expanded to fit the 2373 dimensions of the page to be merged. 2374 """ 2375 return self.mergeTransformedPage(page2, [1, 0, 2376 0, 1, 2377 tx, ty], expand) 2378 2379 def mergeRotatedTranslatedPage(self, page2, rotation, tx, ty, expand=False): 2380 """ 2381 This is similar to mergePage, but the stream to be merged is rotated 2382 and translated by appling a transformation matrix. 2383 2384 :param PageObject page2: the page to be merged into this one. Should be 2385 an instance of :class:`PageObject<PageObject>`. 2386 :param float tx: The translation on X axis 2387 :param float ty: The translation on Y axis 2388 :param float rotation: The angle of the rotation, in degrees 2389 :param bool expand: Whether the page should be expanded to fit the 2390 dimensions of the page to be merged. 2391 """ 2392 2393 translation = [[1, 0, 0], 2394 [0, 1, 0], 2395 [-tx, -ty, 1]] 2396 rotation = math.radians(rotation) 2397 rotating = [[math.cos(rotation), math.sin(rotation), 0], 2398 [-math.sin(rotation), math.cos(rotation), 0], 2399 [0, 0, 1]] 2400 rtranslation = [[1, 0, 0], 2401 [0, 1, 0], 2402 [tx, ty, 1]] 2403 ctm = utils.matrixMultiply(translation, rotating) 2404 ctm = utils.matrixMultiply(ctm, rtranslation) 2405 2406 return self.mergeTransformedPage(page2, [ctm[0][0], ctm[0][1], 2407 ctm[1][0], ctm[1][1], 2408 ctm[2][0], ctm[2][1]], expand) 2409 2410 def mergeRotatedScaledPage(self, page2, rotation, scale, expand=False): 2411 """ 2412 This is similar to mergePage, but the stream to be merged is rotated 2413 and scaled by appling a transformation matrix. 2414 2415 :param PageObject page2: the page to be merged into this one. Should be 2416 an instance of :class:`PageObject<PageObject>`. 2417 :param float rotation: The angle of the rotation, in degrees 2418 :param float scale: The scaling factor 2419 :param bool expand: Whether the page should be expanded to fit the 2420 dimensions of the page to be merged. 2421 """ 2422 rotation = math.radians(rotation) 2423 rotating = [[math.cos(rotation), math.sin(rotation), 0], 2424 [-math.sin(rotation), math.cos(rotation), 0], 2425 [0, 0, 1]] 2426 scaling = [[scale, 0, 0], 2427 [0, scale, 0], 2428 [0, 0, 1]] 2429 ctm = utils.matrixMultiply(rotating, scaling) 2430 2431 return self.mergeTransformedPage(page2, 2432 [ctm[0][0], ctm[0][1], 2433 ctm[1][0], ctm[1][1], 2434 ctm[2][0], ctm[2][1]], expand) 2435 2436 def mergeScaledTranslatedPage(self, page2, scale, tx, ty, expand=False): 2437 """ 2438 This is similar to mergePage, but the stream to be merged is translated 2439 and scaled by appling a transformation matrix. 2440 2441 :param PageObject page2: the page to be merged into this one. Should be 2442 an instance of :class:`PageObject<PageObject>`. 2443 :param float scale: The scaling factor 2444 :param float tx: The translation on X axis 2445 :param float ty: The translation on Y axis 2446 :param bool expand: Whether the page should be expanded to fit the 2447 dimensions of the page to be merged. 2448 """ 2449 2450 translation = [[1, 0, 0], 2451 [0, 1, 0], 2452 [tx, ty, 1]] 2453 scaling = [[scale, 0, 0], 2454 [0, scale, 0], 2455 [0, 0, 1]] 2456 ctm = utils.matrixMultiply(scaling, translation) 2457 2458 return self.mergeTransformedPage(page2, [ctm[0][0], ctm[0][1], 2459 ctm[1][0], ctm[1][1], 2460 ctm[2][0], ctm[2][1]], expand) 2461 2462 def mergeRotatedScaledTranslatedPage(self, page2, rotation, scale, tx, ty, expand=False): 2463 """ 2464 This is similar to mergePage, but the stream to be merged is translated, 2465 rotated and scaled by appling a transformation matrix. 2466 2467 :param PageObject page2: the page to be merged into this one. Should be 2468 an instance of :class:`PageObject<PageObject>`. 2469 :param float tx: The translation on X axis 2470 :param float ty: The translation on Y axis 2471 :param float rotation: The angle of the rotation, in degrees 2472 :param float scale: The scaling factor 2473 :param bool expand: Whether the page should be expanded to fit the 2474 dimensions of the page to be merged. 2475 """ 2476 translation = [[1, 0, 0], 2477 [0, 1, 0], 2478 [tx, ty, 1]] 2479 rotation = math.radians(rotation) 2480 rotating = [[math.cos(rotation), math.sin(rotation), 0], 2481 [-math.sin(rotation), math.cos(rotation), 0], 2482 [0, 0, 1]] 2483 scaling = [[scale, 0, 0], 2484 [0, scale, 0], 2485 [0, 0, 1]] 2486 ctm = utils.matrixMultiply(rotating, scaling) 2487 ctm = utils.matrixMultiply(ctm, translation) 2488 2489 return self.mergeTransformedPage(page2, [ctm[0][0], ctm[0][1], 2490 ctm[1][0], ctm[1][1], 2491 ctm[2][0], ctm[2][1]], expand) 2492 2493 ## 2494 # Applys a transformation matrix the page. 2495 # 2496 # @param ctm A 6 elements tuple containing the operands of the 2497 # transformation matrix 2498 def addTransformation(self, ctm): 2499 """ 2500 Applies a transformation matrix to the page. 2501 2502 :param tuple ctm: A 6-element tuple containing the operands of the 2503 transformation matrix. 2504 """ 2505 originalContent = self.getContents() 2506 if originalContent is not None: 2507 newContent = PageObject._addTransformationMatrix( 2508 originalContent, self.pdf, ctm) 2509 newContent = PageObject._pushPopGS(newContent, self.pdf) 2510 self[NameObject('/Contents')] = newContent 2511 2512 def scale(self, sx, sy): 2513 """ 2514 Scales a page by the given factors by appling a transformation 2515 matrix to its content and updating the page size. 2516 2517 :param float sx: The scaling factor on horizontal axis. 2518 :param float sy: The scaling factor on vertical axis. 2519 """ 2520 self.addTransformation([sx, 0, 2521 0, sy, 2522 0, 0]) 2523 self.mediaBox = RectangleObject([ 2524 float(self.mediaBox.getLowerLeft_x()) * sx, 2525 float(self.mediaBox.getLowerLeft_y()) * sy, 2526 float(self.mediaBox.getUpperRight_x()) * sx, 2527 float(self.mediaBox.getUpperRight_y()) * sy]) 2528 if "/VP" in self: 2529 viewport = self["/VP"] 2530 if isinstance(viewport, ArrayObject): 2531 bbox = viewport[0]["/BBox"] 2532 else: 2533 bbox = viewport["/BBox"] 2534 scaled_bbox = RectangleObject([ 2535 float(bbox[0]) * sx, 2536 float(bbox[1]) * sy, 2537 float(bbox[2]) * sx, 2538 float(bbox[3]) * sy]) 2539 if isinstance(viewport, ArrayObject): 2540 self[NameObject("/VP")][NumberObject(0)][NameObject("/BBox")] = scaled_bbox 2541 else: 2542 self[NameObject("/VP")][NameObject("/BBox")] = scaled_bbox 2543 2544 def scaleBy(self, factor): 2545 """ 2546 Scales a page by the given factor by appling a transformation 2547 matrix to its content and updating the page size. 2548 2549 :param float factor: The scaling factor (for both X and Y axis). 2550 """ 2551 self.scale(factor, factor) 2552 2553 def scaleTo(self, width, height): 2554 """ 2555 Scales a page to the specified dimentions by appling a 2556 transformation matrix to its content and updating the page size. 2557 2558 :param float width: The new width. 2559 :param float height: The new heigth. 2560 """ 2561 sx = width / float(self.mediaBox.getUpperRight_x() - 2562 self.mediaBox.getLowerLeft_x ()) 2563 sy = height / float(self.mediaBox.getUpperRight_y() - 2564 self.mediaBox.getLowerLeft_y ()) 2565 self.scale(sx, sy) 2566 2567 def compressContentStreams(self): 2568 """ 2569 Compresses the size of this page by joining all content streams and 2570 applying a FlateDecode filter. 2571 2572 However, it is possible that this function will perform no action if 2573 content stream compression becomes "automatic" for some reason. 2574 """ 2575 content = self.getContents() 2576 if content is not None: 2577 if not isinstance(content, ContentStream): 2578 content = ContentStream(content, self.pdf) 2579 self[NameObject("/Contents")] = content.flateEncode() 2580 2581 def extractText(self): 2582 """ 2583 Locate all text drawing commands, in the order they are provided in the 2584 content stream, and extract the text. This works well for some PDF 2585 files, but poorly for others, depending on the generator used. This will 2586 be refined in the future. Do not rely on the order of text coming out of 2587 this function, as it will change if this function is made more 2588 sophisticated. 2589 2590 :return: a unicode string object. 2591 """ 2592 text = u_("") 2593 content = self["/Contents"].getObject() 2594 if not isinstance(content, ContentStream): 2595 content = ContentStream(content, self.pdf) 2596 # Note: we check all strings are TextStringObjects. ByteStringObjects 2597 # are strings where the byte->string encoding was unknown, so adding 2598 # them to the text here would be gibberish. 2599 for operands, operator in content.operations: 2600 if operator == b_("Tj"): 2601 _text = operands[0] 2602 if isinstance(_text, TextStringObject): 2603 text += _text 2604 elif operator == b_("T*"): 2605 text += "\n" 2606 elif operator == b_("'"): 2607 text += "\n" 2608 _text = operands[0] 2609 if isinstance(_text, TextStringObject): 2610 text += operands[0] 2611 elif operator == b_('"'): 2612 _text = operands[2] 2613 if isinstance(_text, TextStringObject): 2614 text += "\n" 2615 text += _text 2616 elif operator == b_("TJ"): 2617 for i in operands[0]: 2618 if isinstance(i, TextStringObject): 2619 text += i 2620 text += "\n" 2621 return text 2622 2623 mediaBox = createRectangleAccessor("/MediaBox", ()) 2624 """ 2625 A :class:`RectangleObject<PyPDF2.generic.RectangleObject>`, expressed in default user space units, 2626 defining the boundaries of the physical medium on which the page is 2627 intended to be displayed or printed. 2628 """ 2629 2630 cropBox = createRectangleAccessor("/CropBox", ("/MediaBox",)) 2631 """ 2632 A :class:`RectangleObject<PyPDF2.generic.RectangleObject>`, expressed in default user space units, 2633 defining the visible region of default user space. When the page is 2634 displayed or printed, its contents are to be clipped (cropped) to this 2635 rectangle and then imposed on the output medium in some 2636 implementation-defined manner. Default value: same as :attr:`mediaBox<mediaBox>`. 2637 """ 2638 2639 bleedBox = createRectangleAccessor("/BleedBox", ("/CropBox", "/MediaBox")) 2640 """ 2641 A :class:`RectangleObject<PyPDF2.generic.RectangleObject>`, expressed in default user space units, 2642 defining the region to which the contents of the page should be clipped 2643 when output in a production enviroment. 2644 """ 2645 2646 trimBox = createRectangleAccessor("/TrimBox", ("/CropBox", "/MediaBox")) 2647 """ 2648 A :class:`RectangleObject<PyPDF2.generic.RectangleObject>`, expressed in default user space units, 2649 defining the intended dimensions of the finished page after trimming. 2650 """ 2651 2652 artBox = createRectangleAccessor("/ArtBox", ("/CropBox", "/MediaBox")) 2653 """ 2654 A :class:`RectangleObject<PyPDF2.generic.RectangleObject>`, expressed in default user space units, 2655 defining the extent of the page's meaningful content as intended by the 2656 page's creator. 2657 """ 2658 2659 2660class ContentStream(DecodedStreamObject): 2661 def __init__(self, stream, pdf): 2662 self.pdf = pdf 2663 self.operations = [] 2664 # stream may be a StreamObject or an ArrayObject containing 2665 # multiple StreamObjects to be cat'd together. 2666 stream = stream.getObject() 2667 if isinstance(stream, ArrayObject): 2668 data = b_("") 2669 for s in stream: 2670 data += s.getObject().getData() 2671 stream = BytesIO(b_(data)) 2672 else: 2673 stream = BytesIO(b_(stream.getData())) 2674 self.__parseContentStream(stream) 2675 2676 def __parseContentStream(self, stream): 2677 # file("f:\\tmp.txt", "w").write(stream.read()) 2678 stream.seek(0, 0) 2679 operands = [] 2680 while True: 2681 peek = readNonWhitespace(stream) 2682 if peek == b_('') or ord_(peek) == 0: 2683 break 2684 stream.seek(-1, 1) 2685 if peek.isalpha() or peek == b_("'") or peek == b_('"'): 2686 operator = utils.readUntilRegex(stream, 2687 NameObject.delimiterPattern, True) 2688 if operator == b_("BI"): 2689 # begin inline image - a completely different parsing 2690 # mechanism is required, of course... thanks buddy... 2691 assert operands == [] 2692 ii = self._readInlineImage(stream) 2693 self.operations.append((ii, b_("INLINE IMAGE"))) 2694 else: 2695 self.operations.append((operands, operator)) 2696 operands = [] 2697 elif peek == b_('%'): 2698 # If we encounter a comment in the content stream, we have to 2699 # handle it here. Typically, readObject will handle 2700 # encountering a comment -- but readObject assumes that 2701 # following the comment must be the object we're trying to 2702 # read. In this case, it could be an operator instead. 2703 while peek not in (b_('\r'), b_('\n')): 2704 peek = stream.read(1) 2705 else: 2706 operands.append(readObject(stream, None)) 2707 2708 def _readInlineImage(self, stream): 2709 # begin reading just after the "BI" - begin image 2710 # first read the dictionary of settings. 2711 settings = DictionaryObject() 2712 while True: 2713 tok = readNonWhitespace(stream) 2714 stream.seek(-1, 1) 2715 if tok == b_("I"): 2716 # "ID" - begin of image data 2717 break 2718 key = readObject(stream, self.pdf) 2719 tok = readNonWhitespace(stream) 2720 stream.seek(-1, 1) 2721 value = readObject(stream, self.pdf) 2722 settings[key] = value 2723 # left at beginning of ID 2724 tmp = stream.read(3) 2725 assert tmp[:2] == b_("ID") 2726 data = b_("") 2727 while True: 2728 # Read the inline image, while checking for EI (End Image) operator. 2729 tok = stream.read(1) 2730 if tok == b_("E"): 2731 # Check for End Image 2732 tok2 = stream.read(1) 2733 if tok2 == b_("I"): 2734 # Data can contain EI, so check for the Q operator. 2735 tok3 = stream.read(1) 2736 info = tok + tok2 2737 # We need to find whitespace between EI and Q. 2738 has_q_whitespace = False 2739 while tok3 in utils.WHITESPACES: 2740 has_q_whitespace = True 2741 info += tok3 2742 tok3 = stream.read(1) 2743 if tok3 == b_("Q") and has_q_whitespace: 2744 stream.seek(-1, 1) 2745 break 2746 else: 2747 stream.seek(-1,1) 2748 data += info 2749 else: 2750 stream.seek(-1, 1) 2751 data += tok 2752 else: 2753 data += tok 2754 return {"settings": settings, "data": data} 2755 2756 def _getData(self): 2757 newdata = BytesIO() 2758 for operands, operator in self.operations: 2759 if operator == b_("INLINE IMAGE"): 2760 newdata.write(b_("BI")) 2761 dicttext = BytesIO() 2762 operands["settings"].writeToStream(dicttext, None) 2763 newdata.write(dicttext.getvalue()[2:-2]) 2764 newdata.write(b_("ID ")) 2765 newdata.write(operands["data"]) 2766 newdata.write(b_("EI")) 2767 else: 2768 for op in operands: 2769 op.writeToStream(newdata, None) 2770 newdata.write(b_(" ")) 2771 newdata.write(b_(operator)) 2772 newdata.write(b_("\n")) 2773 return newdata.getvalue() 2774 2775 def _setData(self, value): 2776 self.__parseContentStream(BytesIO(b_(value))) 2777 2778 _data = property(_getData, _setData) 2779 2780 2781class DocumentInformation(DictionaryObject): 2782 """ 2783 A class representing the basic document metadata provided in a PDF File. 2784 This class is accessible through 2785 :meth:`getDocumentInfo()<PyPDF2.PdfFileReader.getDocumentInfo()>` 2786 2787 All text properties of the document metadata have 2788 *two* properties, eg. author and author_raw. The non-raw property will 2789 always return a ``TextStringObject``, making it ideal for a case where 2790 the metadata is being displayed. The raw property can sometimes return 2791 a ``ByteStringObject``, if PyPDF2 was unable to decode the string's 2792 text encoding; this requires additional safety in the caller and 2793 therefore is not as commonly accessed. 2794 """ 2795 2796 def __init__(self): 2797 DictionaryObject.__init__(self) 2798 2799 def getText(self, key): 2800 retval = self.get(key, None) 2801 if isinstance(retval, TextStringObject): 2802 return retval 2803 return None 2804 2805 title = property(lambda self: self.getText("/Title")) 2806 """Read-only property accessing the document's **title**. 2807 Returns a unicode string (``TextStringObject``) or ``None`` 2808 if the title is not specified.""" 2809 title_raw = property(lambda self: self.get("/Title")) 2810 """The "raw" version of title; can return a ``ByteStringObject``.""" 2811 2812 author = property(lambda self: self.getText("/Author")) 2813 """Read-only property accessing the document's **author**. 2814 Returns a unicode string (``TextStringObject``) or ``None`` 2815 if the author is not specified.""" 2816 author_raw = property(lambda self: self.get("/Author")) 2817 """The "raw" version of author; can return a ``ByteStringObject``.""" 2818 2819 subject = property(lambda self: self.getText("/Subject")) 2820 """Read-only property accessing the document's **subject**. 2821 Returns a unicode string (``TextStringObject``) or ``None`` 2822 if the subject is not specified.""" 2823 subject_raw = property(lambda self: self.get("/Subject")) 2824 """The "raw" version of subject; can return a ``ByteStringObject``.""" 2825 2826 creator = property(lambda self: self.getText("/Creator")) 2827 """Read-only property accessing the document's **creator**. If the 2828 document was converted to PDF from another format, this is the name of the 2829 application (e.g. OpenOffice) that created the original document from 2830 which it was converted. Returns a unicode string (``TextStringObject``) 2831 or ``None`` if the creator is not specified.""" 2832 creator_raw = property(lambda self: self.get("/Creator")) 2833 """The "raw" version of creator; can return a ``ByteStringObject``.""" 2834 2835 producer = property(lambda self: self.getText("/Producer")) 2836 """Read-only property accessing the document's **producer**. 2837 If the document was converted to PDF from another format, this is 2838 the name of the application (for example, OSX Quartz) that converted 2839 it to PDF. Returns a unicode string (``TextStringObject``) 2840 or ``None`` if the producer is not specified.""" 2841 producer_raw = property(lambda self: self.get("/Producer")) 2842 """The "raw" version of producer; can return a ``ByteStringObject``.""" 2843 2844 2845def convertToInt(d, size): 2846 if size > 8: 2847 raise utils.PdfReadError("invalid size in convertToInt") 2848 d = b_("\x00\x00\x00\x00\x00\x00\x00\x00") + b_(d) 2849 d = d[-8:] 2850 return struct.unpack(">q", d)[0] 2851 2852# ref: pdf1.8 spec section 3.5.2 algorithm 3.2 2853_encryption_padding = b_('\x28\xbf\x4e\x5e\x4e\x75\x8a\x41\x64\x00\x4e\x56') + \ 2854 b_('\xff\xfa\x01\x08\x2e\x2e\x00\xb6\xd0\x68\x3e\x80\x2f\x0c') + \ 2855 b_('\xa9\xfe\x64\x53\x69\x7a') 2856 2857 2858# Implementation of algorithm 3.2 of the PDF standard security handler, 2859# section 3.5.2 of the PDF 1.6 reference. 2860def _alg32(password, rev, keylen, owner_entry, p_entry, id1_entry, metadata_encrypt=True): 2861 # 1. Pad or truncate the password string to exactly 32 bytes. If the 2862 # password string is more than 32 bytes long, use only its first 32 bytes; 2863 # if it is less than 32 bytes long, pad it by appending the required number 2864 # of additional bytes from the beginning of the padding string 2865 # (_encryption_padding). 2866 password = b_((str_(password) + str_(_encryption_padding))[:32]) 2867 # 2. Initialize the MD5 hash function and pass the result of step 1 as 2868 # input to this function. 2869 import struct 2870 m = md5(password) 2871 # 3. Pass the value of the encryption dictionary's /O entry to the MD5 hash 2872 # function. 2873 m.update(owner_entry.original_bytes) 2874 # 4. Treat the value of the /P entry as an unsigned 4-byte integer and pass 2875 # these bytes to the MD5 hash function, low-order byte first. 2876 p_entry = struct.pack('<i', p_entry) 2877 m.update(p_entry) 2878 # 5. Pass the first element of the file's file identifier array to the MD5 2879 # hash function. 2880 m.update(id1_entry.original_bytes) 2881 # 6. (Revision 3 or greater) If document metadata is not being encrypted, 2882 # pass 4 bytes with the value 0xFFFFFFFF to the MD5 hash function. 2883 if rev >= 3 and not metadata_encrypt: 2884 m.update(b_("\xff\xff\xff\xff")) 2885 # 7. Finish the hash. 2886 md5_hash = m.digest() 2887 # 8. (Revision 3 or greater) Do the following 50 times: Take the output 2888 # from the previous MD5 hash and pass the first n bytes of the output as 2889 # input into a new MD5 hash, where n is the number of bytes of the 2890 # encryption key as defined by the value of the encryption dictionary's 2891 # /Length entry. 2892 if rev >= 3: 2893 for i in range(50): 2894 md5_hash = md5(md5_hash[:keylen]).digest() 2895 # 9. Set the encryption key to the first n bytes of the output from the 2896 # final MD5 hash, where n is always 5 for revision 2 but, for revision 3 or 2897 # greater, depends on the value of the encryption dictionary's /Length 2898 # entry. 2899 return md5_hash[:keylen] 2900 2901 2902# Implementation of algorithm 3.3 of the PDF standard security handler, 2903# section 3.5.2 of the PDF 1.6 reference. 2904def _alg33(owner_pwd, user_pwd, rev, keylen): 2905 # steps 1 - 4 2906 key = _alg33_1(owner_pwd, rev, keylen) 2907 # 5. Pad or truncate the user password string as described in step 1 of 2908 # algorithm 3.2. 2909 user_pwd = b_((user_pwd + str_(_encryption_padding))[:32]) 2910 # 6. Encrypt the result of step 5, using an RC4 encryption function with 2911 # the encryption key obtained in step 4. 2912 val = utils.RC4_encrypt(key, user_pwd) 2913 # 7. (Revision 3 or greater) Do the following 19 times: Take the output 2914 # from the previous invocation of the RC4 function and pass it as input to 2915 # a new invocation of the function; use an encryption key generated by 2916 # taking each byte of the encryption key obtained in step 4 and performing 2917 # an XOR operation between that byte and the single-byte value of the 2918 # iteration counter (from 1 to 19). 2919 if rev >= 3: 2920 for i in range(1, 20): 2921 new_key = '' 2922 for l in range(len(key)): 2923 new_key += chr(ord_(key[l]) ^ i) 2924 val = utils.RC4_encrypt(new_key, val) 2925 # 8. Store the output from the final invocation of the RC4 as the value of 2926 # the /O entry in the encryption dictionary. 2927 return val 2928 2929 2930# Steps 1-4 of algorithm 3.3 2931def _alg33_1(password, rev, keylen): 2932 # 1. Pad or truncate the owner password string as described in step 1 of 2933 # algorithm 3.2. If there is no owner password, use the user password 2934 # instead. 2935 password = b_((password + str_(_encryption_padding))[:32]) 2936 # 2. Initialize the MD5 hash function and pass the result of step 1 as 2937 # input to this function. 2938 m = md5(password) 2939 # 3. (Revision 3 or greater) Do the following 50 times: Take the output 2940 # from the previous MD5 hash and pass it as input into a new MD5 hash. 2941 md5_hash = m.digest() 2942 if rev >= 3: 2943 for i in range(50): 2944 md5_hash = md5(md5_hash).digest() 2945 # 4. Create an RC4 encryption key using the first n bytes of the output 2946 # from the final MD5 hash, where n is always 5 for revision 2 but, for 2947 # revision 3 or greater, depends on the value of the encryption 2948 # dictionary's /Length entry. 2949 key = md5_hash[:keylen] 2950 return key 2951 2952 2953# Implementation of algorithm 3.4 of the PDF standard security handler, 2954# section 3.5.2 of the PDF 1.6 reference. 2955def _alg34(password, owner_entry, p_entry, id1_entry): 2956 # 1. Create an encryption key based on the user password string, as 2957 # described in algorithm 3.2. 2958 key = _alg32(password, 2, 5, owner_entry, p_entry, id1_entry) 2959 # 2. Encrypt the 32-byte padding string shown in step 1 of algorithm 3.2, 2960 # using an RC4 encryption function with the encryption key from the 2961 # preceding step. 2962 U = utils.RC4_encrypt(key, _encryption_padding) 2963 # 3. Store the result of step 2 as the value of the /U entry in the 2964 # encryption dictionary. 2965 return U, key 2966 2967 2968# Implementation of algorithm 3.4 of the PDF standard security handler, 2969# section 3.5.2 of the PDF 1.6 reference. 2970def _alg35(password, rev, keylen, owner_entry, p_entry, id1_entry, metadata_encrypt): 2971 # 1. Create an encryption key based on the user password string, as 2972 # described in Algorithm 3.2. 2973 key = _alg32(password, rev, keylen, owner_entry, p_entry, id1_entry) 2974 # 2. Initialize the MD5 hash function and pass the 32-byte padding string 2975 # shown in step 1 of Algorithm 3.2 as input to this function. 2976 m = md5() 2977 m.update(_encryption_padding) 2978 # 3. Pass the first element of the file's file identifier array (the value 2979 # of the ID entry in the document's trailer dictionary; see Table 3.13 on 2980 # page 73) to the hash function and finish the hash. (See implementation 2981 # note 25 in Appendix H.) 2982 m.update(id1_entry.original_bytes) 2983 md5_hash = m.digest() 2984 # 4. Encrypt the 16-byte result of the hash, using an RC4 encryption 2985 # function with the encryption key from step 1. 2986 val = utils.RC4_encrypt(key, md5_hash) 2987 # 5. Do the following 19 times: Take the output from the previous 2988 # invocation of the RC4 function and pass it as input to a new invocation 2989 # of the function; use an encryption key generated by taking each byte of 2990 # the original encryption key (obtained in step 2) and performing an XOR 2991 # operation between that byte and the single-byte value of the iteration 2992 # counter (from 1 to 19). 2993 for i in range(1, 20): 2994 new_key = b_('') 2995 for l in range(len(key)): 2996 new_key += b_(chr(ord_(key[l]) ^ i)) 2997 val = utils.RC4_encrypt(new_key, val) 2998 # 6. Append 16 bytes of arbitrary padding to the output from the final 2999 # invocation of the RC4 function and store the 32-byte result as the value 3000 # of the U entry in the encryption dictionary. 3001 # (implementator note: I don't know what "arbitrary padding" is supposed to 3002 # mean, so I have used null bytes. This seems to match a few other 3003 # people's implementations) 3004 return val + (b_('\x00') * 16), key 3005