1# -*- coding: utf-8 -*- 2# 3# vim: sw=4:expandtab:foldmethod=marker 4# 5# Copyright (c) 2006, Mathieu Fenniak 6# Copyright (c) 2007, Ashish Kulkarni <kulkarni.ashish@gmail.com> 7# 8# All rights reserved. 9# 10# Redistribution and use in source and binary forms, with or without 11# modification, are permitted provided that the following conditions are 12# met: 13# 14# * Redistributions of source code must retain the above copyright notice, 15# this list of conditions and the following disclaimer. 16# * Redistributions in binary form must reproduce the above copyright notice, 17# this list of conditions and the following disclaimer in the documentation 18# and/or other materials provided with the distribution. 19# * The name of the author may not be used to endorse or promote products 20# derived from this software without specific prior written permission. 21# 22# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 23# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 24# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 25# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE 26# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 27# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 28# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 29# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 30# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 31# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 32# POSSIBILITY OF SUCH DAMAGE. 33 34 35""" 36A pure-Python PDF library with very minimal capabilities. It was designed to 37be able to split and merge PDF files by page, and that's about all it can do. 38It may be a solid base for future PDF file work in Python. 39""" 40__author__ = "Mathieu Fenniak" 41__author_email__ = "biziqe@mathieu.fenniak.net" 42 43import math 44import struct 45from sys import version_info 46try: 47 from cStringIO import StringIO 48except ImportError: 49 from StringIO import StringIO 50 51import filters 52import utils 53import warnings 54from generic import * 55from utils import readNonWhitespace, readUntilWhitespace, ConvertFunctionsToVirtualList 56 57if version_info < ( 2, 4 ): 58 from sets import ImmutableSet as frozenset 59 60if version_info < ( 2, 5 ): 61 from md5 import md5 62else: 63 from hashlib import md5 64 65## 66# This class supports writing PDF files out, given pages produced by another 67# class (typically {@link #PdfFileReader PdfFileReader}). 68class PdfFileWriter(object): 69 def __init__(self): 70 self._header = "%PDF-1.3" 71 self._objects = [] # array of indirect objects 72 73 # The root of our page tree node. 74 pages = DictionaryObject() 75 pages.update({ 76 NameObject("/Type"): NameObject("/Pages"), 77 NameObject("/Count"): NumberObject(0), 78 NameObject("/Kids"): ArrayObject(), 79 }) 80 self._pages = self._addObject(pages) 81 82 # info object 83 info = DictionaryObject() 84 info.update({ 85 NameObject("/Producer"): createStringObject(u"Python PDF Library - http://pybrary.net/pyPdf/") 86 }) 87 self._info = self._addObject(info) 88 89 # root object 90 root = DictionaryObject() 91 root.update({ 92 NameObject("/Type"): NameObject("/Catalog"), 93 NameObject("/Pages"): self._pages, 94 }) 95 self._root = self._addObject(root) 96 97 def _addObject(self, obj): 98 self._objects.append(obj) 99 return IndirectObject(len(self._objects), 0, self) 100 101 def getObject(self, ido): 102 if ido.pdf != self: 103 raise ValueError("pdf must be self") 104 return self._objects[ido.idnum - 1] 105 106 ## 107 # Common method for inserting or adding a page to this PDF file. 108 # 109 # @param page The page to add to the document. This argument should be 110 # an instance of {@link #PageObject PageObject}. 111 # @param action The function which will insert the page in the dictionnary. 112 # Takes: page list, page to add. 113 def _addPage(self, page, action): 114 assert page["/Type"] == "/Page" 115 page[NameObject("/Parent")] = self._pages 116 page = self._addObject(page) 117 pages = self.getObject(self._pages) 118 action(pages["/Kids"], page) 119 pages[NameObject("/Count")] = NumberObject(pages["/Count"] + 1) 120 121 ## 122 # Adds a page to this PDF file. The page is usually acquired from a 123 # {@link #PdfFileReader PdfFileReader} instance. 124 # <p> 125 # Stability: Added in v1.0, will exist for all v1.x releases. 126 # 127 # @param page The page to add to the document. This argument should be 128 # an instance of {@link #PageObject PageObject}. 129 def addPage(self, page): 130 self._addPage(page, list.append) 131 132 ## 133 # Insert a page in this PDF file. The page is usually acquired from a 134 # {@link #PdfFileReader PdfFileReader} instance. 135 # 136 # @param page The page to add to the document. This argument should be 137 # an instance of {@link #PageObject PageObject}. 138 # @param index Position at which the page will be inserted. 139 def insertPage(self, page, index=0): 140 self._addPage(page, lambda l, p: l.insert(index, p)) 141 142 ## 143 # Retrieves a page by number from this PDF file. 144 # @return Returns a {@link #PageObject PageObject} instance. 145 def getPage(self, pageNumber): 146 pages = self.getObject(self._pages) 147 # XXX: crude hack 148 return pages["/Kids"][pageNumber].getObject() 149 150 ## 151 # Return the number of pages. 152 # @return The number of pages. 153 def getNumPages(self): 154 pages = self.getObject(self._pages) 155 return int(pages[NameObject("/Count")]) 156 157 ## 158 # Append a blank page to this PDF file and returns it. If no page size 159 # is specified, use the size of the last page; throw 160 # PageSizeNotDefinedError if it doesn't exist. 161 # @param width The width of the new page expressed in default user 162 # space units. 163 # @param height The height of the new page expressed in default user 164 # space units. 165 def addBlankPage(self, width=None, height=None): 166 page = PageObject.createBlankPage(self, width, height) 167 self.addPage(page) 168 return page 169 170 ## 171 # Insert a blank page to this PDF file and returns it. If no page size 172 # is specified, use the size of the page in the given index; throw 173 # PageSizeNotDefinedError if it doesn't exist. 174 # @param width The width of the new page expressed in default user 175 # space units. 176 # @param height The height of the new page expressed in default user 177 # space units. 178 # @param index Position to add the page. 179 def insertBlankPage(self, width=None, height=None, index=0): 180 if width is None or height is None and \ 181 (self.getNumPages() - 1) >= index: 182 oldpage = self.getPage(index) 183 width = oldpage.mediaBox.getWidth() 184 height = oldpage.mediaBox.getHeight() 185 page = PageObject.createBlankPage(self, width, height) 186 self.insertPage(page, index) 187 return page 188 189 ## 190 # Encrypt this PDF file with the PDF Standard encryption handler. 191 # @param user_pwd The "user password", which allows for opening and reading 192 # the PDF file with the restrictions provided. 193 # @param owner_pwd The "owner password", which allows for opening the PDF 194 # files without any restrictions. By default, the owner password is the 195 # same as the user password. 196 # @param use_128bit Boolean argument as to whether to use 128bit 197 # encryption. When false, 40bit encryption will be used. By default, this 198 # flag is on. 199 def encrypt(self, user_pwd, owner_pwd = None, use_128bit = True): 200 import time, random 201 if owner_pwd == None: 202 owner_pwd = user_pwd 203 if use_128bit: 204 V = 2 205 rev = 3 206 keylen = 128 / 8 207 else: 208 V = 1 209 rev = 2 210 keylen = 40 / 8 211 # permit everything: 212 P = -1 213 O = ByteStringObject(_alg33(owner_pwd, user_pwd, rev, keylen)) 214 ID_1 = md5(repr(time.time())).digest() 215 ID_2 = md5(repr(random.random())).digest() 216 self._ID = ArrayObject((ByteStringObject(ID_1), ByteStringObject(ID_2))) 217 if rev == 2: 218 U, key = _alg34(user_pwd, O, P, ID_1) 219 else: 220 assert rev == 3 221 U, key = _alg35(user_pwd, rev, keylen, O, P, ID_1, False) 222 encrypt = DictionaryObject() 223 encrypt[NameObject("/Filter")] = NameObject("/Standard") 224 encrypt[NameObject("/V")] = NumberObject(V) 225 if V == 2: 226 encrypt[NameObject("/Length")] = NumberObject(keylen * 8) 227 encrypt[NameObject("/R")] = NumberObject(rev) 228 encrypt[NameObject("/O")] = ByteStringObject(O) 229 encrypt[NameObject("/U")] = ByteStringObject(U) 230 encrypt[NameObject("/P")] = NumberObject(P) 231 self._encrypt = self._addObject(encrypt) 232 self._encrypt_key = key 233 234 ## 235 # Writes the collection of pages added to this object out as a PDF file. 236 # <p> 237 # Stability: Added in v1.0, will exist for all v1.x releases. 238 # @param stream An object to write the file to. The object must support 239 # the write method, and the tell method, similar to a file object. 240 def write(self, stream): 241 import struct 242 243 externalReferenceMap = {} 244 245 # PDF objects sometimes have circular references to their /Page objects 246 # inside their object tree (for example, annotations). Those will be 247 # indirect references to objects that we've recreated in this PDF. To 248 # address this problem, PageObject's store their original object 249 # reference number, and we add it to the external reference map before 250 # we sweep for indirect references. This forces self-page-referencing 251 # trees to reference the correct new object location, rather than 252 # copying in a new copy of the page object. 253 for objIndex in xrange(len(self._objects)): 254 obj = self._objects[objIndex] 255 if isinstance(obj, PageObject) and obj.indirectRef != None: 256 data = obj.indirectRef 257 if not externalReferenceMap.has_key(data.pdf): 258 externalReferenceMap[data.pdf] = {} 259 if not externalReferenceMap[data.pdf].has_key(data.generation): 260 externalReferenceMap[data.pdf][data.generation] = {} 261 externalReferenceMap[data.pdf][data.generation][data.idnum] = IndirectObject(objIndex + 1, 0, self) 262 263 self.stack = [] 264 self._sweepIndirectReferences(externalReferenceMap, self._root) 265 del self.stack 266 267 # Begin writing: 268 object_positions = [] 269 stream.write(self._header + "\n") 270 for i in range(len(self._objects)): 271 idnum = (i + 1) 272 obj = self._objects[i] 273 object_positions.append(stream.tell()) 274 stream.write(str(idnum) + " 0 obj\n") 275 key = None 276 if hasattr(self, "_encrypt") and idnum != self._encrypt.idnum: 277 pack1 = struct.pack("<i", i + 1)[:3] 278 pack2 = struct.pack("<i", 0)[:2] 279 key = self._encrypt_key + pack1 + pack2 280 assert len(key) == (len(self._encrypt_key) + 5) 281 md5_hash = md5(key).digest() 282 key = md5_hash[:min(16, len(self._encrypt_key) + 5)] 283 obj.writeToStream(stream, key) 284 stream.write("\nendobj\n") 285 286 # xref table 287 xref_location = stream.tell() 288 stream.write("xref\n") 289 stream.write("0 %s\n" % (len(self._objects) + 1)) 290 stream.write("%010d %05d f \n" % (0, 65535)) 291 for offset in object_positions: 292 stream.write("%010d %05d n \n" % (offset, 0)) 293 294 # trailer 295 stream.write("trailer\n") 296 trailer = DictionaryObject() 297 trailer.update({ 298 NameObject("/Size"): NumberObject(len(self._objects) + 1), 299 NameObject("/Root"): self._root, 300 NameObject("/Info"): self._info, 301 }) 302 if hasattr(self, "_ID"): 303 trailer[NameObject("/ID")] = self._ID 304 if hasattr(self, "_encrypt"): 305 trailer[NameObject("/Encrypt")] = self._encrypt 306 trailer.writeToStream(stream, None) 307 308 # eof 309 stream.write("\nstartxref\n%s\n%%%%EOF\n" % (xref_location)) 310 311 def _sweepIndirectReferences(self, externMap, data): 312 if isinstance(data, DictionaryObject): 313 for key, value in data.items(): 314 origvalue = value 315 value = self._sweepIndirectReferences(externMap, value) 316 if isinstance(value, StreamObject): 317 # a dictionary value is a stream. streams must be indirect 318 # objects, so we need to change this value. 319 value = self._addObject(value) 320 data[key] = value 321 return data 322 elif isinstance(data, ArrayObject): 323 for i in range(len(data)): 324 value = self._sweepIndirectReferences(externMap, data[i]) 325 if isinstance(value, StreamObject): 326 # an array value is a stream. streams must be indirect 327 # objects, so we need to change this value 328 value = self._addObject(value) 329 data[i] = value 330 return data 331 elif isinstance(data, IndirectObject): 332 # internal indirect references are fine 333 if data.pdf == self: 334 if data.idnum in self.stack: 335 return data 336 else: 337 self.stack.append(data.idnum) 338 realdata = self.getObject(data) 339 self._sweepIndirectReferences(externMap, realdata) 340 self.stack.pop() 341 return data 342 else: 343 newobj = externMap.get(data.pdf, {}).get(data.generation, {}).get(data.idnum, None) 344 if newobj == None: 345 newobj = data.pdf.getObject(data) 346 self._objects.append(None) # placeholder 347 idnum = len(self._objects) 348 newobj_ido = IndirectObject(idnum, 0, self) 349 if not externMap.has_key(data.pdf): 350 externMap[data.pdf] = {} 351 if not externMap[data.pdf].has_key(data.generation): 352 externMap[data.pdf][data.generation] = {} 353 externMap[data.pdf][data.generation][data.idnum] = newobj_ido 354 newobj = self._sweepIndirectReferences(externMap, newobj) 355 self._objects[idnum-1] = newobj 356 return newobj_ido 357 return newobj 358 else: 359 return data 360 361 362## 363# Initializes a PdfFileReader object. This operation can take some time, as 364# the PDF stream's cross-reference tables are read into memory. 365# <p> 366# Stability: Added in v1.0, will exist for all v1.x releases. 367# 368# @param stream An object that supports the standard read and seek methods 369# similar to a file object. 370class PdfFileReader(object): 371 def __init__(self, stream): 372 self.flattenedPages = None 373 self.resolvedObjects = {} 374 self.read(stream) 375 self.stream = stream 376 self._override_encryption = False 377 378 ## 379 # Retrieves the PDF file's document information dictionary, if it exists. 380 # Note that some PDF files use metadata streams instead of docinfo 381 # dictionaries, and these metadata streams will not be accessed by this 382 # function. 383 # <p> 384 # Stability: Added in v1.6, will exist for all future v1.x releases. 385 # @return Returns a {@link #DocumentInformation DocumentInformation} 386 # instance, or None if none exists. 387 def getDocumentInfo(self): 388 if not self.trailer.has_key("/Info"): 389 return None 390 obj = self.trailer['/Info'] 391 retval = DocumentInformation() 392 retval.update(obj) 393 return retval 394 395 ## 396 # Read-only property that accesses the {@link 397 # #PdfFileReader.getDocumentInfo getDocumentInfo} function. 398 # <p> 399 # Stability: Added in v1.7, will exist for all future v1.x releases. 400 documentInfo = property(lambda self: self.getDocumentInfo(), None, None) 401 402 ## 403 # Retrieves XMP (Extensible Metadata Platform) data from the PDF document 404 # root. 405 # <p> 406 # Stability: Added in v1.12, will exist for all future v1.x releases. 407 # @return Returns a {@link #generic.XmpInformation XmlInformation} 408 # instance that can be used to access XMP metadata from the document. 409 # Can also return None if no metadata was found on the document root. 410 def getXmpMetadata(self): 411 try: 412 self._override_encryption = True 413 return self.trailer["/Root"].getXmpMetadata() 414 finally: 415 self._override_encryption = False 416 417 ## 418 # Read-only property that accesses the {@link #PdfFileReader.getXmpData 419 # getXmpData} function. 420 # <p> 421 # Stability: Added in v1.12, will exist for all future v1.x releases. 422 xmpMetadata = property(lambda self: self.getXmpMetadata(), None, None) 423 424 ## 425 # Calculates the number of pages in this PDF file. 426 # <p> 427 # Stability: Added in v1.0, will exist for all v1.x releases. 428 # @return Returns an integer. 429 def getNumPages(self): 430 if self.flattenedPages == None: 431 self._flatten() 432 return len(self.flattenedPages) 433 434 ## 435 # Read-only property that accesses the {@link #PdfFileReader.getNumPages 436 # getNumPages} function. 437 # <p> 438 # Stability: Added in v1.7, will exist for all future v1.x releases. 439 numPages = property(lambda self: self.getNumPages(), None, None) 440 441 ## 442 # Retrieves a page by number from this PDF file. 443 # <p> 444 # Stability: Added in v1.0, will exist for all v1.x releases. 445 # @return Returns a {@link #PageObject PageObject} instance. 446 def getPage(self, pageNumber): 447 ## ensure that we're not trying to access an encrypted PDF 448 #assert not self.trailer.has_key("/Encrypt") 449 if self.flattenedPages == None: 450 self._flatten() 451 return self.flattenedPages[pageNumber] 452 453 ## 454 # Read-only property that accesses the 455 # {@link #PdfFileReader.getNamedDestinations 456 # getNamedDestinations} function. 457 # <p> 458 # Stability: Added in v1.10, will exist for all future v1.x releases. 459 namedDestinations = property(lambda self: 460 self.getNamedDestinations(), None, None) 461 462 ## 463 # Retrieves the named destinations present in the document. 464 # <p> 465 # Stability: Added in v1.10, will exist for all future v1.x releases. 466 # @return Returns a dict which maps names to {@link #Destination 467 # destinations}. 468 def getNamedDestinations(self, tree=None, retval=None): 469 if retval == None: 470 retval = {} 471 catalog = self.trailer["/Root"] 472 473 # get the name tree 474 if catalog.has_key("/Dests"): 475 tree = catalog["/Dests"] 476 elif catalog.has_key("/Names"): 477 names = catalog['/Names'] 478 if names.has_key("/Dests"): 479 tree = names['/Dests'] 480 481 if tree == None: 482 return retval 483 484 if tree.has_key("/Kids"): 485 # recurse down the tree 486 for kid in tree["/Kids"]: 487 self.getNamedDestinations(kid.getObject(), retval) 488 489 if tree.has_key("/Names"): 490 names = tree["/Names"] 491 for i in range(0, len(names), 2): 492 key = names[i].getObject() 493 val = names[i+1].getObject() 494 if isinstance(val, DictionaryObject) and val.has_key('/D'): 495 val = val['/D'] 496 dest = self._buildDestination(key, val) 497 if dest != None: 498 retval[key] = dest 499 500 return retval 501 502 ## 503 # Read-only property that accesses the {@link #PdfFileReader.getOutlines 504 # getOutlines} function. 505 # <p> 506 # Stability: Added in v1.10, will exist for all future v1.x releases. 507 outlines = property(lambda self: self.getOutlines(), None, None) 508 509 ## 510 # Retrieves the document outline present in the document. 511 # <p> 512 # Stability: Added in v1.10, will exist for all future v1.x releases. 513 # @return Returns a nested list of {@link #Destination destinations}. 514 def getOutlines(self, node=None, outlines=None): 515 if outlines == None: 516 outlines = [] 517 catalog = self.trailer["/Root"] 518 519 # get the outline dictionary and named destinations 520 if catalog.has_key("/Outlines"): 521 lines = catalog["/Outlines"] 522 if lines.has_key("/First"): 523 node = lines["/First"] 524 self._namedDests = self.getNamedDestinations() 525 526 if node == None: 527 return outlines 528 529 # see if there are any more outlines 530 while 1: 531 outline = self._buildOutline(node) 532 if outline: 533 outlines.append(outline) 534 535 # check for sub-outlines 536 if node.has_key("/First"): 537 subOutlines = [] 538 self.getOutlines(node["/First"], subOutlines) 539 if subOutlines: 540 outlines.append(subOutlines) 541 542 if not node.has_key("/Next"): 543 break 544 node = node["/Next"] 545 546 return outlines 547 548 def _buildDestination(self, title, array): 549 page, typ = array[0:2] 550 array = array[2:] 551 return Destination(title, page, typ, *array) 552 553 def _buildOutline(self, node): 554 dest, title, outline = None, None, None 555 556 if node.has_key("/A") and node.has_key("/Title"): 557 # Action, section 8.5 (only type GoTo supported) 558 title = node["/Title"] 559 action = node["/A"] 560 if action["/S"] == "/GoTo": 561 dest = action["/D"] 562 elif node.has_key("/Dest") and node.has_key("/Title"): 563 # Destination, section 8.2.1 564 title = node["/Title"] 565 dest = node["/Dest"] 566 567 # if destination found, then create outline 568 if dest: 569 if isinstance(dest, ArrayObject): 570 outline = self._buildDestination(title, dest) 571 elif isinstance(dest, unicode) and self._namedDests.has_key(dest): 572 outline = self._namedDests[dest] 573 outline[NameObject("/Title")] = title 574 else: 575 raise utils.PdfReadError("Unexpected destination %r" % dest) 576 return outline 577 578 ## 579 # Read-only property that emulates a list based upon the {@link 580 # #PdfFileReader.getNumPages getNumPages} and {@link #PdfFileReader.getPage 581 # getPage} functions. 582 # <p> 583 # Stability: Added in v1.7, and will exist for all future v1.x releases. 584 pages = property(lambda self: ConvertFunctionsToVirtualList(self.getNumPages, self.getPage), 585 None, None) 586 587 def _flatten(self, pages=None, inherit=None, indirectRef=None): 588 inheritablePageAttributes = ( 589 NameObject("/Resources"), NameObject("/MediaBox"), 590 NameObject("/CropBox"), NameObject("/Rotate") 591 ) 592 if inherit == None: 593 inherit = dict() 594 if pages == None: 595 self.flattenedPages = [] 596 catalog = self.trailer["/Root"].getObject() 597 pages = catalog["/Pages"].getObject() 598 t = pages["/Type"] 599 if t == "/Pages": 600 for attr in inheritablePageAttributes: 601 if pages.has_key(attr): 602 inherit[attr] = pages[attr] 603 for page in pages["/Kids"]: 604 addt = {} 605 if isinstance(page, IndirectObject): 606 addt["indirectRef"] = page 607 self._flatten(page.getObject(), inherit, **addt) 608 elif t == "/Page": 609 for attr,value in inherit.items(): 610 # if the page has it's own value, it does not inherit the 611 # parent's value: 612 if not pages.has_key(attr): 613 pages[attr] = value 614 pageObj = PageObject(self, indirectRef) 615 pageObj.update(pages) 616 self.flattenedPages.append(pageObj) 617 618 def getObject(self, indirectReference): 619 retval = self.resolvedObjects.get(indirectReference.generation, {}).get(indirectReference.idnum, None) 620 if retval != None: 621 return retval 622 if indirectReference.generation == 0 and \ 623 self.xref_objStm.has_key(indirectReference.idnum): 624 # indirect reference to object in object stream 625 # read the entire object stream into memory 626 stmnum,idx = self.xref_objStm[indirectReference.idnum] 627 objStm = IndirectObject(stmnum, 0, self).getObject() 628 assert objStm['/Type'] == '/ObjStm' 629 assert idx < objStm['/N'] 630 streamData = StringIO(objStm.getData()) 631 for i in range(objStm['/N']): 632 objnum = NumberObject.readFromStream(streamData) 633 readNonWhitespace(streamData) 634 streamData.seek(-1, 1) 635 offset = NumberObject.readFromStream(streamData) 636 readNonWhitespace(streamData) 637 streamData.seek(-1, 1) 638 t = streamData.tell() 639 streamData.seek(objStm['/First']+offset, 0) 640 obj = readObject(streamData, self) 641 self.resolvedObjects[0][objnum] = obj 642 streamData.seek(t, 0) 643 return self.resolvedObjects[0][indirectReference.idnum] 644 start = self.xref[indirectReference.generation][indirectReference.idnum] 645 self.stream.seek(start, 0) 646 idnum, generation = self.readObjectHeader(self.stream) 647 assert idnum == indirectReference.idnum 648 assert generation == indirectReference.generation 649 retval = readObject(self.stream, self) 650 651 # override encryption is used for the /Encrypt dictionary 652 if not self._override_encryption and self.isEncrypted: 653 # if we don't have the encryption key: 654 if not hasattr(self, '_decryption_key'): 655 raise Exception, "file has not been decrypted" 656 # otherwise, decrypt here... 657 import struct 658 pack1 = struct.pack("<i", indirectReference.idnum)[:3] 659 pack2 = struct.pack("<i", indirectReference.generation)[:2] 660 key = self._decryption_key + pack1 + pack2 661 assert len(key) == (len(self._decryption_key) + 5) 662 md5_hash = md5(key).digest() 663 key = md5_hash[:min(16, len(self._decryption_key) + 5)] 664 retval = self._decryptObject(retval, key) 665 666 self.cacheIndirectObject(generation, idnum, retval) 667 return retval 668 669 def _decryptObject(self, obj, key): 670 if isinstance(obj, ByteStringObject) or isinstance(obj, TextStringObject): 671 obj = createStringObject(utils.RC4_encrypt(key, obj.original_bytes)) 672 elif isinstance(obj, StreamObject): 673 obj._data = utils.RC4_encrypt(key, obj._data) 674 elif isinstance(obj, DictionaryObject): 675 for dictkey, value in obj.items(): 676 obj[dictkey] = self._decryptObject(value, key) 677 elif isinstance(obj, ArrayObject): 678 for i in range(len(obj)): 679 obj[i] = self._decryptObject(obj[i], key) 680 return obj 681 682 def readObjectHeader(self, stream): 683 # Should never be necessary to read out whitespace, since the 684 # cross-reference table should put us in the right spot to read the 685 # object header. In reality... some files have stupid cross reference 686 # tables that are off by whitespace bytes. 687 readNonWhitespace(stream); stream.seek(-1, 1) 688 idnum = readUntilWhitespace(stream) 689 generation = readUntilWhitespace(stream) 690 obj = stream.read(3) 691 readNonWhitespace(stream) 692 stream.seek(-1, 1) 693 return int(idnum), int(generation) 694 695 def cacheIndirectObject(self, generation, idnum, obj): 696 if not self.resolvedObjects.has_key(generation): 697 self.resolvedObjects[generation] = {} 698 self.resolvedObjects[generation][idnum] = obj 699 700 def read(self, stream): 701 # start at the end: 702 stream.seek(-1, 2) 703 line = '' 704 while not line: 705 line = self.readNextEndLine(stream) 706 if line[:5] != "%%EOF": 707 raise utils.PdfReadError, "EOF marker not found" 708 709 # find startxref entry - the location of the xref table 710 line = self.readNextEndLine(stream) 711 startxref = int(line) 712 line = self.readNextEndLine(stream) 713 if line[:9] != "startxref": 714 raise utils.PdfReadError, "startxref not found" 715 716 # read all cross reference tables and their trailers 717 self.xref = {} 718 self.xref_objStm = {} 719 self.trailer = DictionaryObject() 720 while 1: 721 # load the xref table 722 stream.seek(startxref, 0) 723 x = stream.read(1) 724 if x == "x": 725 # standard cross-reference table 726 ref = stream.read(4) 727 if ref[:3] != "ref": 728 raise utils.PdfReadError, "xref table read error" 729 readNonWhitespace(stream) 730 stream.seek(-1, 1) 731 while 1: 732 num = readObject(stream, self) 733 readNonWhitespace(stream) 734 stream.seek(-1, 1) 735 size = readObject(stream, self) 736 readNonWhitespace(stream) 737 stream.seek(-1, 1) 738 cnt = 0 739 while cnt < size: 740 line = stream.read(20) 741 # It's very clear in section 3.4.3 of the PDF spec 742 # that all cross-reference table lines are a fixed 743 # 20 bytes. However... some malformed PDF files 744 # use a single character EOL without a preceeding 745 # space. Detect that case, and seek the stream 746 # back one character. (0-9 means we've bled into 747 # the next xref entry, t means we've bled into the 748 # text "trailer"): 749 if line[-1] in "0123456789t": 750 stream.seek(-1, 1) 751 offset, generation = line[:16].split(" ") 752 offset, generation = int(offset), int(generation) 753 if not self.xref.has_key(generation): 754 self.xref[generation] = {} 755 if self.xref[generation].has_key(num): 756 # It really seems like we should allow the last 757 # xref table in the file to override previous 758 # ones. Since we read the file backwards, assume 759 # any existing key is already set correctly. 760 pass 761 else: 762 self.xref[generation][num] = offset 763 cnt += 1 764 num += 1 765 readNonWhitespace(stream) 766 stream.seek(-1, 1) 767 trailertag = stream.read(7) 768 if trailertag != "trailer": 769 # more xrefs! 770 stream.seek(-7, 1) 771 else: 772 break 773 readNonWhitespace(stream) 774 stream.seek(-1, 1) 775 newTrailer = readObject(stream, self) 776 for key, value in newTrailer.items(): 777 if not self.trailer.has_key(key): 778 self.trailer[key] = value 779 if newTrailer.has_key("/Prev"): 780 startxref = newTrailer["/Prev"] 781 else: 782 break 783 elif x.isdigit(): 784 # PDF 1.5+ Cross-Reference Stream 785 stream.seek(-1, 1) 786 idnum, generation = self.readObjectHeader(stream) 787 xrefstream = readObject(stream, self) 788 assert xrefstream["/Type"] == "/XRef" 789 self.cacheIndirectObject(generation, idnum, xrefstream) 790 streamData = StringIO(xrefstream.getData()) 791 idx_pairs = xrefstream.get("/Index", [0, xrefstream.get("/Size")]) 792 entrySizes = xrefstream.get("/W") 793 for num, size in self._pairs(idx_pairs): 794 cnt = 0 795 while cnt < size: 796 for i in range(len(entrySizes)): 797 d = streamData.read(entrySizes[i]) 798 di = convertToInt(d, entrySizes[i]) 799 if i == 0: 800 xref_type = di 801 elif i == 1: 802 if xref_type == 0: 803 next_free_object = di 804 elif xref_type == 1: 805 byte_offset = di 806 elif xref_type == 2: 807 objstr_num = di 808 elif i == 2: 809 if xref_type == 0: 810 next_generation = di 811 elif xref_type == 1: 812 generation = di 813 elif xref_type == 2: 814 obstr_idx = di 815 if xref_type == 0: 816 pass 817 elif xref_type == 1: 818 if not self.xref.has_key(generation): 819 self.xref[generation] = {} 820 if not num in self.xref[generation]: 821 self.xref[generation][num] = byte_offset 822 elif xref_type == 2: 823 if not num in self.xref_objStm: 824 self.xref_objStm[num] = [objstr_num, obstr_idx] 825 cnt += 1 826 num += 1 827 trailerKeys = "/Root", "/Encrypt", "/Info", "/ID" 828 for key in trailerKeys: 829 if xrefstream.has_key(key) and not self.trailer.has_key(key): 830 self.trailer[NameObject(key)] = xrefstream.raw_get(key) 831 if xrefstream.has_key("/Prev"): 832 startxref = xrefstream["/Prev"] 833 else: 834 break 835 else: 836 # bad xref character at startxref. Let's see if we can find 837 # the xref table nearby, as we've observed this error with an 838 # off-by-one before. 839 stream.seek(-11, 1) 840 tmp = stream.read(20) 841 xref_loc = tmp.find("xref") 842 if xref_loc != -1: 843 startxref -= (10 - xref_loc) 844 continue 845 else: 846 # no xref table found at specified location 847 assert False 848 break 849 850 def _pairs(self, array): 851 i = 0 852 while True: 853 yield array[i], array[i+1] 854 i += 2 855 if (i+1) >= len(array): 856 break 857 858 def readNextEndLine(self, stream): 859 line = "" 860 while True: 861 x = stream.read(1) 862 stream.seek(-2, 1) 863 if x == '\n' or x == '\r': 864 while x == '\n' or x == '\r': 865 x = stream.read(1) 866 stream.seek(-2, 1) 867 stream.seek(1, 1) 868 break 869 else: 870 line = x + line 871 return line 872 873 ## 874 # When using an encrypted / secured PDF file with the PDF Standard 875 # encryption handler, this function will allow the file to be decrypted. 876 # It checks the given password against the document's user password and 877 # owner password, and then stores the resulting decryption key if either 878 # password is correct. 879 # <p> 880 # It does not matter which password was matched. Both passwords provide 881 # the correct decryption key that will allow the document to be used with 882 # this library. 883 # <p> 884 # Stability: Added in v1.8, will exist for all future v1.x releases. 885 # 886 # @return 0 if the password failed, 1 if the password matched the user 887 # password, and 2 if the password matched the owner password. 888 # 889 # @exception NotImplementedError Document uses an unsupported encryption 890 # method. 891 def decrypt(self, password): 892 self._override_encryption = True 893 try: 894 return self._decrypt(password) 895 finally: 896 self._override_encryption = False 897 898 def _decrypt(self, password): 899 encrypt = self.trailer['/Encrypt'].getObject() 900 if encrypt['/Filter'] != '/Standard': 901 raise NotImplementedError, "only Standard PDF encryption handler is available" 902 if not (encrypt['/V'] in (1, 2)): 903 raise NotImplementedError, "only algorithm code 1 and 2 are supported" 904 user_password, key = self._authenticateUserPassword(password) 905 if user_password: 906 self._decryption_key = key 907 return 1 908 else: 909 rev = encrypt['/R'].getObject() 910 if rev == 2: 911 keylen = 5 912 else: 913 keylen = encrypt['/Length'].getObject() / 8 914 key = _alg33_1(password, rev, keylen) 915 real_O = encrypt["/O"].getObject() 916 if rev == 2: 917 userpass = utils.RC4_encrypt(key, real_O) 918 else: 919 val = real_O 920 for i in range(19, -1, -1): 921 new_key = '' 922 for l in range(len(key)): 923 new_key += chr(ord(key[l]) ^ i) 924 val = utils.RC4_encrypt(new_key, val) 925 userpass = val 926 owner_password, key = self._authenticateUserPassword(userpass) 927 if owner_password: 928 self._decryption_key = key 929 return 2 930 return 0 931 932 def _authenticateUserPassword(self, password): 933 encrypt = self.trailer['/Encrypt'].getObject() 934 rev = encrypt['/R'].getObject() 935 owner_entry = encrypt['/O'].getObject().original_bytes 936 p_entry = encrypt['/P'].getObject() 937 id_entry = self.trailer['/ID'].getObject() 938 id1_entry = id_entry[0].getObject() 939 if rev == 2: 940 U, key = _alg34(password, owner_entry, p_entry, id1_entry) 941 elif rev >= 3: 942 U, key = _alg35(password, rev, 943 encrypt["/Length"].getObject() / 8, owner_entry, 944 p_entry, id1_entry, 945 encrypt.get("/EncryptMetadata", BooleanObject(False)).getObject()) 946 real_U = encrypt['/U'].getObject().original_bytes 947 return U == real_U, key 948 949 def getIsEncrypted(self): 950 return self.trailer.has_key("/Encrypt") 951 952 ## 953 # Read-only boolean property showing whether this PDF file is encrypted. 954 # Note that this property, if true, will remain true even after the {@link 955 # #PdfFileReader.decrypt decrypt} function is called. 956 isEncrypted = property(lambda self: self.getIsEncrypted(), None, None) 957 958 959def getRectangle(self, name, defaults): 960 retval = self.get(name) 961 if isinstance(retval, RectangleObject): 962 return retval 963 if retval == None: 964 for d in defaults: 965 retval = self.get(d) 966 if retval != None: 967 break 968 if isinstance(retval, IndirectObject): 969 retval = self.pdf.getObject(retval) 970 retval = RectangleObject(retval) 971 setRectangle(self, name, retval) 972 return retval 973 974def setRectangle(self, name, value): 975 if not isinstance(name, NameObject): 976 name = NameObject(name) 977 self[name] = value 978 979def deleteRectangle(self, name): 980 del self[name] 981 982def createRectangleAccessor(name, fallback): 983 return \ 984 property( 985 lambda self: getRectangle(self, name, fallback), 986 lambda self, value: setRectangle(self, name, value), 987 lambda self: deleteRectangle(self, name) 988 ) 989 990## 991# This class represents a single page within a PDF file. Typically this object 992# will be created by accessing the {@link #PdfFileReader.getPage getPage} 993# function of the {@link #PdfFileReader PdfFileReader} class, but it is 994# also possible to create an empty page with the createBlankPage static 995# method. 996# @param pdf PDF file the page belongs to (optional, defaults to None). 997class PageObject(DictionaryObject): 998 def __init__(self, pdf=None, indirectRef=None): 999 DictionaryObject.__init__(self) 1000 self.pdf = pdf 1001 # Stores the original indirect reference to this object in its source PDF 1002 self.indirectRef = indirectRef 1003 1004 ## 1005 # Returns a new blank page. 1006 # If width or height is None, try to get the page size from the 1007 # last page of pdf. If pdf is None or contains no page, a 1008 # PageSizeNotDefinedError is raised. 1009 # @param pdf PDF file the page belongs to 1010 # @param width The width of the new page expressed in default user 1011 # space units. 1012 # @param height The height of the new page expressed in default user 1013 # space units. 1014 def createBlankPage(pdf=None, width=None, height=None): 1015 page = PageObject(pdf) 1016 1017 # Creates a new page (cf PDF Reference 7.7.3.3) 1018 page.__setitem__(NameObject('/Type'), NameObject('/Page')) 1019 page.__setitem__(NameObject('/Parent'), NullObject()) 1020 page.__setitem__(NameObject('/Resources'), DictionaryObject()) 1021 if width is None or height is None: 1022 if pdf is not None and pdf.getNumPages() > 0: 1023 lastpage = pdf.getPage(pdf.getNumPages() - 1) 1024 width = lastpage.mediaBox.getWidth() 1025 height = lastpage.mediaBox.getHeight() 1026 else: 1027 raise utils.PageSizeNotDefinedError() 1028 page.__setitem__(NameObject('/MediaBox'), 1029 RectangleObject([0, 0, width, height])) 1030 1031 return page 1032 createBlankPage = staticmethod(createBlankPage) 1033 1034 ## 1035 # Rotates a page clockwise by increments of 90 degrees. 1036 # <p> 1037 # Stability: Added in v1.1, will exist for all future v1.x releases. 1038 # @param angle Angle to rotate the page. Must be an increment of 90 deg. 1039 def rotateClockwise(self, angle): 1040 assert angle % 90 == 0 1041 self._rotate(angle) 1042 return self 1043 1044 ## 1045 # Rotates a page counter-clockwise by increments of 90 degrees. 1046 # <p> 1047 # Stability: Added in v1.1, will exist for all future v1.x releases. 1048 # @param angle Angle to rotate the page. Must be an increment of 90 deg. 1049 def rotateCounterClockwise(self, angle): 1050 assert angle % 90 == 0 1051 self._rotate(-angle) 1052 return self 1053 1054 def _rotate(self, angle): 1055 currentAngle = self.get("/Rotate", 0) 1056 self[NameObject("/Rotate")] = NumberObject(currentAngle + angle) 1057 1058 def _mergeResources(res1, res2, resource): 1059 newRes = DictionaryObject() 1060 newRes.update(res1.get(resource, DictionaryObject()).getObject()) 1061 page2Res = res2.get(resource, DictionaryObject()).getObject() 1062 renameRes = {} 1063 for key in page2Res.keys(): 1064 if newRes.has_key(key) and newRes[key] != page2Res[key]: 1065 newname = NameObject(key + "renamed") 1066 renameRes[key] = newname 1067 newRes[newname] = page2Res[key] 1068 elif not newRes.has_key(key): 1069 newRes[key] = page2Res.raw_get(key) 1070 return newRes, renameRes 1071 _mergeResources = staticmethod(_mergeResources) 1072 1073 def _contentStreamRename(stream, rename, pdf): 1074 if not rename: 1075 return stream 1076 stream = ContentStream(stream, pdf) 1077 for operands,operator in stream.operations: 1078 for i in range(len(operands)): 1079 op = operands[i] 1080 if isinstance(op, NameObject): 1081 operands[i] = rename.get(op, op) 1082 return stream 1083 _contentStreamRename = staticmethod(_contentStreamRename) 1084 1085 def _pushPopGS(contents, pdf): 1086 # adds a graphics state "push" and "pop" to the beginning and end 1087 # of a content stream. This isolates it from changes such as 1088 # transformation matricies. 1089 stream = ContentStream(contents, pdf) 1090 stream.operations.insert(0, [[], "q"]) 1091 stream.operations.append([[], "Q"]) 1092 return stream 1093 _pushPopGS = staticmethod(_pushPopGS) 1094 1095 def _addTransformationMatrix(contents, pdf, ctm): 1096 # adds transformation matrix at the beginning of the given 1097 # contents stream. 1098 a, b, c, d, e, f = ctm 1099 contents = ContentStream(contents, pdf) 1100 contents.operations.insert(0, [[FloatObject(a), FloatObject(b), 1101 FloatObject(c), FloatObject(d), FloatObject(e), 1102 FloatObject(f)], " cm"]) 1103 return contents 1104 _addTransformationMatrix = staticmethod(_addTransformationMatrix) 1105 1106 ## 1107 # Returns the /Contents object, or None if it doesn't exist. 1108 # /Contents is optionnal, as described in PDF Reference 7.7.3.3 1109 def getContents(self): 1110 if self.has_key("/Contents"): 1111 return self["/Contents"].getObject() 1112 else: 1113 return None 1114 1115 ## 1116 # Merges the content streams of two pages into one. Resource references 1117 # (i.e. fonts) are maintained from both pages. The mediabox/cropbox/etc 1118 # of this page are not altered. The parameter page's content stream will 1119 # be added to the end of this page's content stream, meaning that it will 1120 # be drawn after, or "on top" of this page. 1121 # <p> 1122 # Stability: Added in v1.4, will exist for all future 1.x releases. 1123 # @param page2 An instance of {@link #PageObject PageObject} to be merged 1124 # into this one. 1125 def mergePage(self, page2): 1126 self._mergePage(page2) 1127 1128 ## 1129 # Actually merges the content streams of two pages into one. Resource 1130 # references (i.e. fonts) are maintained from both pages. The 1131 # mediabox/cropbox/etc of this page are not altered. The parameter page's 1132 # content stream will be added to the end of this page's content stream, 1133 # meaning that it will be drawn after, or "on top" of this page. 1134 # 1135 # @param page2 An instance of {@link #PageObject PageObject} to be merged 1136 # into this one. 1137 # @param page2transformation A fuction which applies a transformation to 1138 # the content stream of page2. Takes: page2 1139 # contents stream. Must return: new contents 1140 # stream. If omitted, the content stream will 1141 # not be modified. 1142 def _mergePage(self, page2, page2transformation=None): 1143 # First we work on merging the resource dictionaries. This allows us 1144 # to find out what symbols in the content streams we might need to 1145 # rename. 1146 1147 newResources = DictionaryObject() 1148 rename = {} 1149 originalResources = self["/Resources"].getObject() 1150 page2Resources = page2["/Resources"].getObject() 1151 1152 for res in "/ExtGState", "/Font", "/XObject", "/ColorSpace", "/Pattern", "/Shading", "/Properties": 1153 new, newrename = PageObject._mergeResources(originalResources, page2Resources, res) 1154 if new: 1155 newResources[NameObject(res)] = new 1156 rename.update(newrename) 1157 1158 # Combine /ProcSet sets. 1159 newResources[NameObject("/ProcSet")] = ArrayObject( 1160 frozenset(originalResources.get("/ProcSet", ArrayObject()).getObject()).union( 1161 frozenset(page2Resources.get("/ProcSet", ArrayObject()).getObject()) 1162 ) 1163 ) 1164 1165 newContentArray = ArrayObject() 1166 1167 originalContent = self.getContents() 1168 if originalContent is not None: 1169 newContentArray.append(PageObject._pushPopGS( 1170 originalContent, self.pdf)) 1171 1172 page2Content = page2.getContents() 1173 if page2Content is not None: 1174 if page2transformation is not None: 1175 page2Content = page2transformation(page2Content) 1176 page2Content = PageObject._contentStreamRename( 1177 page2Content, rename, self.pdf) 1178 page2Content = PageObject._pushPopGS(page2Content, self.pdf) 1179 newContentArray.append(page2Content) 1180 1181 self[NameObject('/Contents')] = ContentStream(newContentArray, self.pdf) 1182 self[NameObject('/Resources')] = newResources 1183 1184 ## 1185 # This is similar to mergePage, but a transformation matrix is 1186 # applied to the merged stream. 1187 # 1188 # @param page2 An instance of {@link #PageObject PageObject} to be merged. 1189 # @param ctm A 6 elements tuple containing the operands of the 1190 # transformation matrix 1191 def mergeTransformedPage(self, page2, ctm): 1192 self._mergePage(page2, lambda page2Content: 1193 PageObject._addTransformationMatrix(page2Content, page2.pdf, ctm)) 1194 1195 ## 1196 # This is similar to mergePage, but the stream to be merged is scaled 1197 # by appling a transformation matrix. 1198 # 1199 # @param page2 An instance of {@link #PageObject PageObject} to be merged. 1200 # @param factor The scaling factor 1201 def mergeScaledPage(self, page2, factor): 1202 # CTM to scale : [ sx 0 0 sy 0 0 ] 1203 return self.mergeTransformedPage(page2, [factor, 0, 1204 0, factor, 1205 0, 0]) 1206 1207 ## 1208 # This is similar to mergePage, but the stream to be merged is rotated 1209 # by appling a transformation matrix. 1210 # 1211 # @param page2 An instance of {@link #PageObject PageObject} to be merged. 1212 # @param rotation The angle of the rotation, in degrees 1213 def mergeRotatedPage(self, page2, rotation): 1214 rotation = math.radians(rotation) 1215 return self.mergeTransformedPage(page2, 1216 [math.cos(rotation), math.sin(rotation), 1217 -math.sin(rotation), math.cos(rotation), 1218 0, 0]) 1219 1220 ## 1221 # This is similar to mergePage, but the stream to be merged is translated 1222 # by appling a transformation matrix. 1223 # 1224 # @param page2 An instance of {@link #PageObject PageObject} to be merged. 1225 # @param tx The translation on X axis 1226 # @param tx The translation on Y axis 1227 def mergeTranslatedPage(self, page2, tx, ty): 1228 return self.mergeTransformedPage(page2, [1, 0, 1229 0, 1, 1230 tx, ty]) 1231 1232 ## 1233 # This is similar to mergePage, but the stream to be merged is rotated 1234 # and scaled by appling a transformation matrix. 1235 # 1236 # @param page2 An instance of {@link #PageObject PageObject} to be merged. 1237 # @param rotation The angle of the rotation, in degrees 1238 # @param factor The scaling factor 1239 def mergeRotatedScaledPage(self, page2, rotation, scale): 1240 rotation = math.radians(rotation) 1241 rotating = [[math.cos(rotation), math.sin(rotation),0], 1242 [-math.sin(rotation),math.cos(rotation), 0], 1243 [0, 0, 1]] 1244 scaling = [[scale,0, 0], 1245 [0, scale,0], 1246 [0, 0, 1]] 1247 ctm = utils.matrixMultiply(rotating, scaling) 1248 1249 return self.mergeTransformedPage(page2, 1250 [ctm[0][0], ctm[0][1], 1251 ctm[1][0], ctm[1][1], 1252 ctm[2][0], ctm[2][1]]) 1253 1254 ## 1255 # This is similar to mergePage, but the stream to be merged is translated 1256 # and scaled by appling a transformation matrix. 1257 # 1258 # @param page2 An instance of {@link #PageObject PageObject} to be merged. 1259 # @param scale The scaling factor 1260 # @param tx The translation on X axis 1261 # @param tx The translation on Y axis 1262 def mergeScaledTranslatedPage(self, page2, scale, tx, ty): 1263 translation = [[1, 0, 0], 1264 [0, 1, 0], 1265 [tx,ty,1]] 1266 scaling = [[scale,0, 0], 1267 [0, scale,0], 1268 [0, 0, 1]] 1269 ctm = utils.matrixMultiply(scaling, translation) 1270 1271 return self.mergeTransformedPage(page2, [ctm[0][0], ctm[0][1], 1272 ctm[1][0], ctm[1][1], 1273 ctm[2][0], ctm[2][1]]) 1274 1275 ## 1276 # This is similar to mergePage, but the stream to be merged is translated, 1277 # rotated and scaled by appling a transformation matrix. 1278 # 1279 # @param page2 An instance of {@link #PageObject PageObject} to be merged. 1280 # @param tx The translation on X axis 1281 # @param ty The translation on Y axis 1282 # @param rotation The angle of the rotation, in degrees 1283 # @param scale The scaling factor 1284 def mergeRotatedScaledTranslatedPage(self, page2, rotation, scale, tx, ty): 1285 translation = [[1, 0, 0], 1286 [0, 1, 0], 1287 [tx,ty,1]] 1288 rotation = math.radians(rotation) 1289 rotating = [[math.cos(rotation), math.sin(rotation),0], 1290 [-math.sin(rotation),math.cos(rotation), 0], 1291 [0, 0, 1]] 1292 scaling = [[scale,0, 0], 1293 [0, scale,0], 1294 [0, 0, 1]] 1295 ctm = utils.matrixMultiply(rotating, scaling) 1296 ctm = utils.matrixMultiply(ctm, translation) 1297 1298 return self.mergeTransformedPage(page2, [ctm[0][0], ctm[0][1], 1299 ctm[1][0], ctm[1][1], 1300 ctm[2][0], ctm[2][1]]) 1301 1302 ## 1303 # Applys a transformation matrix the page. 1304 # 1305 # @param ctm A 6 elements tuple containing the operands of the 1306 # transformation matrix 1307 def addTransformation(self, ctm): 1308 originalContent = self.getContents() 1309 if originalContent is not None: 1310 newContent = PageObject._addTransformationMatrix( 1311 originalContent, self.pdf, ctm) 1312 newContent = PageObject._pushPopGS(newContent, self.pdf) 1313 self[NameObject('/Contents')] = newContent 1314 1315 ## 1316 # Scales a page by the given factors by appling a transformation 1317 # matrix to its content and updating the page size. 1318 # 1319 # @param sx The scaling factor on horizontal axis 1320 # @param sy The scaling factor on vertical axis 1321 def scale(self, sx, sy): 1322 self.addTransformation([sx, 0, 1323 0, sy, 1324 0, 0]) 1325 self.mediaBox = RectangleObject([ 1326 float(self.mediaBox.getLowerLeft_x()) * sx, 1327 float(self.mediaBox.getLowerLeft_y()) * sy, 1328 float(self.mediaBox.getUpperRight_x()) * sx, 1329 float(self.mediaBox.getUpperRight_y()) * sy]) 1330 1331 ## 1332 # Scales a page by the given factor by appling a transformation 1333 # matrix to its content and updating the page size. 1334 # 1335 # @param factor The scaling factor 1336 def scaleBy(self, factor): 1337 self.scale(factor, factor) 1338 1339 ## 1340 # Scales a page to the specified dimentions by appling a 1341 # transformation matrix to its content and updating the page size. 1342 # 1343 # @param width The new width 1344 # @param height The new heigth 1345 def scaleTo(self, width, height): 1346 sx = width / (self.mediaBox.getUpperRight_x() - 1347 self.mediaBox.getLowerLeft_x ()) 1348 sy = height / (self.mediaBox.getUpperRight_y() - 1349 self.mediaBox.getLowerLeft_x ()) 1350 self.scale(sx, sy) 1351 1352 ## 1353 # Compresses the size of this page by joining all content streams and 1354 # applying a FlateDecode filter. 1355 # <p> 1356 # Stability: Added in v1.6, will exist for all future v1.x releases. 1357 # However, it is possible that this function will perform no action if 1358 # content stream compression becomes "automatic" for some reason. 1359 def compressContentStreams(self): 1360 content = self.getContents() 1361 if content is not None: 1362 if not isinstance(content, ContentStream): 1363 content = ContentStream(content, self.pdf) 1364 self[NameObject("/Contents")] = content.flateEncode() 1365 1366 ## 1367 # Locate all text drawing commands, in the order they are provided in the 1368 # content stream, and extract the text. This works well for some PDF 1369 # files, but poorly for others, depending on the generator used. This will 1370 # be refined in the future. Do not rely on the order of text coming out of 1371 # this function, as it will change if this function is made more 1372 # sophisticated. 1373 # <p> 1374 # Stability: Added in v1.7, will exist for all future v1.x releases. May 1375 # be overhauled to provide more ordered text in the future. 1376 # @return a unicode string object 1377 def extractText(self): 1378 text = u"" 1379 content = self["/Contents"].getObject() 1380 if not isinstance(content, ContentStream): 1381 content = ContentStream(content, self.pdf) 1382 # Note: we check all strings are TextStringObjects. ByteStringObjects 1383 # are strings where the byte->string encoding was unknown, so adding 1384 # them to the text here would be gibberish. 1385 for operands,operator in content.operations: 1386 if operator == "Tj": 1387 _text = operands[0] 1388 if isinstance(_text, TextStringObject): 1389 text += _text 1390 elif operator == "T*": 1391 text += "\n" 1392 elif operator == "'": 1393 text += "\n" 1394 _text = operands[0] 1395 if isinstance(_text, TextStringObject): 1396 text += operands[0] 1397 elif operator == '"': 1398 _text = operands[2] 1399 if isinstance(_text, TextStringObject): 1400 text += "\n" 1401 text += _text 1402 elif operator == "TJ": 1403 for i in operands[0]: 1404 if isinstance(i, TextStringObject): 1405 text += i 1406 return text 1407 1408 ## 1409 # A rectangle (RectangleObject), expressed in default user space units, 1410 # defining the boundaries of the physical medium on which the page is 1411 # intended to be displayed or printed. 1412 # <p> 1413 # Stability: Added in v1.4, will exist for all future v1.x releases. 1414 mediaBox = createRectangleAccessor("/MediaBox", ()) 1415 1416 ## 1417 # A rectangle (RectangleObject), expressed in default user space units, 1418 # defining the visible region of default user space. When the page is 1419 # displayed or printed, its contents are to be clipped (cropped) to this 1420 # rectangle and then imposed on the output medium in some 1421 # implementation-defined manner. Default value: same as MediaBox. 1422 # <p> 1423 # Stability: Added in v1.4, will exist for all future v1.x releases. 1424 cropBox = createRectangleAccessor("/CropBox", ("/MediaBox",)) 1425 1426 ## 1427 # A rectangle (RectangleObject), expressed in default user space units, 1428 # defining the region to which the contents of the page should be clipped 1429 # when output in a production enviroment. 1430 # <p> 1431 # Stability: Added in v1.4, will exist for all future v1.x releases. 1432 bleedBox = createRectangleAccessor("/BleedBox", ("/CropBox", "/MediaBox")) 1433 1434 ## 1435 # A rectangle (RectangleObject), expressed in default user space units, 1436 # defining the intended dimensions of the finished page after trimming. 1437 # <p> 1438 # Stability: Added in v1.4, will exist for all future v1.x releases. 1439 trimBox = createRectangleAccessor("/TrimBox", ("/CropBox", "/MediaBox")) 1440 1441 ## 1442 # A rectangle (RectangleObject), expressed in default user space units, 1443 # defining the extent of the page's meaningful content as intended by the 1444 # page's creator. 1445 # <p> 1446 # Stability: Added in v1.4, will exist for all future v1.x releases. 1447 artBox = createRectangleAccessor("/ArtBox", ("/CropBox", "/MediaBox")) 1448 1449 1450class ContentStream(DecodedStreamObject): 1451 def __init__(self, stream, pdf): 1452 self.pdf = pdf 1453 self.operations = [] 1454 # stream may be a StreamObject or an ArrayObject containing 1455 # multiple StreamObjects to be cat'd together. 1456 stream = stream.getObject() 1457 if isinstance(stream, ArrayObject): 1458 data = "" 1459 for s in stream: 1460 data += s.getObject().getData() 1461 stream = StringIO(data) 1462 else: 1463 stream = StringIO(stream.getData()) 1464 self.__parseContentStream(stream) 1465 1466 def __parseContentStream(self, stream): 1467 # file("f:\\tmp.txt", "w").write(stream.read()) 1468 stream.seek(0, 0) 1469 operands = [] 1470 while True: 1471 peek = readNonWhitespace(stream) 1472 if peek == '': 1473 break 1474 stream.seek(-1, 1) 1475 if peek.isalpha() or peek == "'" or peek == '"': 1476 operator = "" 1477 while True: 1478 tok = stream.read(1) 1479 if tok.isspace() or tok in NameObject.delimiterCharacters: 1480 stream.seek(-1, 1) 1481 break 1482 elif tok == '': 1483 break 1484 operator += tok 1485 if operator == "BI": 1486 # begin inline image - a completely different parsing 1487 # mechanism is required, of course... thanks buddy... 1488 assert operands == [] 1489 ii = self._readInlineImage(stream) 1490 self.operations.append((ii, "INLINE IMAGE")) 1491 else: 1492 self.operations.append((operands, operator)) 1493 operands = [] 1494 elif peek == '%': 1495 # If we encounter a comment in the content stream, we have to 1496 # handle it here. Typically, readObject will handle 1497 # encountering a comment -- but readObject assumes that 1498 # following the comment must be the object we're trying to 1499 # read. In this case, it could be an operator instead. 1500 while peek not in ('\r', '\n'): 1501 peek = stream.read(1) 1502 else: 1503 operands.append(readObject(stream, None)) 1504 1505 def _readInlineImage(self, stream): 1506 # begin reading just after the "BI" - begin image 1507 # first read the dictionary of settings. 1508 settings = DictionaryObject() 1509 while True: 1510 tok = readNonWhitespace(stream) 1511 stream.seek(-1, 1) 1512 if tok == "I": 1513 # "ID" - begin of image data 1514 break 1515 key = readObject(stream, self.pdf) 1516 tok = readNonWhitespace(stream) 1517 stream.seek(-1, 1) 1518 value = readObject(stream, self.pdf) 1519 settings[key] = value 1520 # left at beginning of ID 1521 tmp = stream.read(3) 1522 assert tmp[:2] == "ID" 1523 data = "" 1524 while True: 1525 tok = stream.read(1) 1526 if tok == "E": 1527 next = stream.read(1) 1528 if next == "I": 1529 break 1530 else: 1531 stream.seek(-1, 1) 1532 data += tok 1533 else: 1534 data += tok 1535 x = readNonWhitespace(stream) 1536 stream.seek(-1, 1) 1537 return {"settings": settings, "data": data} 1538 1539 def _getData(self): 1540 newdata = StringIO() 1541 for operands,operator in self.operations: 1542 if operator == "INLINE IMAGE": 1543 newdata.write("BI") 1544 dicttext = StringIO() 1545 operands["settings"].writeToStream(dicttext, None) 1546 newdata.write(dicttext.getvalue()[2:-2]) 1547 newdata.write("ID ") 1548 newdata.write(operands["data"]) 1549 newdata.write("EI") 1550 else: 1551 for op in operands: 1552 op.writeToStream(newdata, None) 1553 newdata.write(" ") 1554 newdata.write(operator) 1555 newdata.write("\n") 1556 return newdata.getvalue() 1557 1558 def _setData(self, value): 1559 self.__parseContentStream(StringIO(value)) 1560 1561 _data = property(_getData, _setData) 1562 1563 1564## 1565# A class representing the basic document metadata provided in a PDF File. 1566# <p> 1567# As of pyPdf v1.10, all text properties of the document metadata have two 1568# properties, eg. author and author_raw. The non-raw property will always 1569# return a TextStringObject, making it ideal for a case where the metadata is 1570# being displayed. The raw property can sometimes return a ByteStringObject, 1571# if pyPdf was unable to decode the string's text encoding; this requires 1572# additional safety in the caller and therefore is not as commonly accessed. 1573class DocumentInformation(DictionaryObject): 1574 def __init__(self): 1575 DictionaryObject.__init__(self) 1576 1577 def getText(self, key): 1578 retval = self.get(key, None) 1579 if isinstance(retval, TextStringObject): 1580 return retval 1581 return None 1582 1583 ## 1584 # Read-only property accessing the document's title. Added in v1.6, will 1585 # exist for all future v1.x releases. Modified in v1.10 to always return a 1586 # unicode string (TextStringObject). 1587 # @return A unicode string, or None if the title is not provided. 1588 title = property(lambda self: self.getText("/Title")) 1589 title_raw = property(lambda self: self.get("/Title")) 1590 1591 ## 1592 # Read-only property accessing the document's author. Added in v1.6, will 1593 # exist for all future v1.x releases. Modified in v1.10 to always return a 1594 # unicode string (TextStringObject). 1595 # @return A unicode string, or None if the author is not provided. 1596 author = property(lambda self: self.getText("/Author")) 1597 author_raw = property(lambda self: self.get("/Author")) 1598 1599 ## 1600 # Read-only property accessing the subject of the document. Added in v1.6, 1601 # will exist for all future v1.x releases. Modified in v1.10 to always 1602 # return a unicode string (TextStringObject). 1603 # @return A unicode string, or None if the subject is not provided. 1604 subject = property(lambda self: self.getText("/Subject")) 1605 subject_raw = property(lambda self: self.get("/Subject")) 1606 1607 ## 1608 # Read-only property accessing the document's creator. If the document was 1609 # converted to PDF from another format, the name of the application (for 1610 # example, OpenOffice) that created the original document from which it was 1611 # converted. Added in v1.6, will exist for all future v1.x releases. 1612 # Modified in v1.10 to always return a unicode string (TextStringObject). 1613 # @return A unicode string, or None if the creator is not provided. 1614 creator = property(lambda self: self.getText("/Creator")) 1615 creator_raw = property(lambda self: self.get("/Creator")) 1616 1617 ## 1618 # Read-only property accessing the document's producer. If the document 1619 # was converted to PDF from another format, the name of the application 1620 # (for example, OSX Quartz) that converted it to PDF. Added in v1.6, will 1621 # exist for all future v1.x releases. Modified in v1.10 to always return a 1622 # unicode string (TextStringObject). 1623 # @return A unicode string, or None if the producer is not provided. 1624 producer = property(lambda self: self.getText("/Producer")) 1625 producer_raw = property(lambda self: self.get("/Producer")) 1626 1627 1628## 1629# A class representing a destination within a PDF file. 1630# See section 8.2.1 of the PDF 1.6 reference. 1631# Stability: Added in v1.10, will exist for all v1.x releases. 1632class Destination(DictionaryObject): 1633 def __init__(self, title, page, typ, *args): 1634 DictionaryObject.__init__(self) 1635 self[NameObject("/Title")] = title 1636 self[NameObject("/Page")] = page 1637 self[NameObject("/Type")] = typ 1638 1639 # from table 8.2 of the PDF 1.6 reference. 1640 if typ == "/XYZ": 1641 (self[NameObject("/Left")], self[NameObject("/Top")], 1642 self[NameObject("/Zoom")]) = args 1643 elif typ == "/FitR": 1644 (self[NameObject("/Left")], self[NameObject("/Bottom")], 1645 self[NameObject("/Right")], self[NameObject("/Top")]) = args 1646 elif typ in ["/FitH", "FitBH"]: 1647 self[NameObject("/Top")], = args 1648 elif typ in ["/FitV", "FitBV"]: 1649 self[NameObject("/Left")], = args 1650 elif typ in ["/Fit", "FitB"]: 1651 pass 1652 else: 1653 raise utils.PdfReadError("Unknown Destination Type: %r" % typ) 1654 1655 ## 1656 # Read-only property accessing the destination title. 1657 # @return A string. 1658 title = property(lambda self: self.get("/Title")) 1659 1660 ## 1661 # Read-only property accessing the destination page. 1662 # @return An integer. 1663 page = property(lambda self: self.get("/Page")) 1664 1665 ## 1666 # Read-only property accessing the destination type. 1667 # @return A string. 1668 typ = property(lambda self: self.get("/Type")) 1669 1670 ## 1671 # Read-only property accessing the zoom factor. 1672 # @return A number, or None if not available. 1673 zoom = property(lambda self: self.get("/Zoom", None)) 1674 1675 ## 1676 # Read-only property accessing the left horizontal coordinate. 1677 # @return A number, or None if not available. 1678 left = property(lambda self: self.get("/Left", None)) 1679 1680 ## 1681 # Read-only property accessing the right horizontal coordinate. 1682 # @return A number, or None if not available. 1683 right = property(lambda self: self.get("/Right", None)) 1684 1685 ## 1686 # Read-only property accessing the top vertical coordinate. 1687 # @return A number, or None if not available. 1688 top = property(lambda self: self.get("/Top", None)) 1689 1690 ## 1691 # Read-only property accessing the bottom vertical coordinate. 1692 # @return A number, or None if not available. 1693 bottom = property(lambda self: self.get("/Bottom", None)) 1694 1695def convertToInt(d, size): 1696 if size > 8: 1697 raise utils.PdfReadError("invalid size in convertToInt") 1698 d = "\x00\x00\x00\x00\x00\x00\x00\x00" + d 1699 d = d[-8:] 1700 return struct.unpack(">q", d)[0] 1701 1702# ref: pdf1.8 spec section 3.5.2 algorithm 3.2 1703_encryption_padding = '\x28\xbf\x4e\x5e\x4e\x75\x8a\x41\x64\x00\x4e\x56' + \ 1704 '\xff\xfa\x01\x08\x2e\x2e\x00\xb6\xd0\x68\x3e\x80\x2f\x0c' + \ 1705 '\xa9\xfe\x64\x53\x69\x7a' 1706 1707# Implementation of algorithm 3.2 of the PDF standard security handler, 1708# section 3.5.2 of the PDF 1.6 reference. 1709def _alg32(password, rev, keylen, owner_entry, p_entry, id1_entry, metadata_encrypt=True): 1710 # 1. Pad or truncate the password string to exactly 32 bytes. If the 1711 # password string is more than 32 bytes long, use only its first 32 bytes; 1712 # if it is less than 32 bytes long, pad it by appending the required number 1713 # of additional bytes from the beginning of the padding string 1714 # (_encryption_padding). 1715 password = (password + _encryption_padding)[:32] 1716 # 2. Initialize the MD5 hash function and pass the result of step 1 as 1717 # input to this function. 1718 import struct 1719 m = md5(password) 1720 # 3. Pass the value of the encryption dictionary's /O entry to the MD5 hash 1721 # function. 1722 m.update(owner_entry) 1723 # 4. Treat the value of the /P entry as an unsigned 4-byte integer and pass 1724 # these bytes to the MD5 hash function, low-order byte first. 1725 p_entry = struct.pack('<i', p_entry) 1726 m.update(p_entry) 1727 # 5. Pass the first element of the file's file identifier array to the MD5 1728 # hash function. 1729 m.update(id1_entry) 1730 # 6. (Revision 3 or greater) If document metadata is not being encrypted, 1731 # pass 4 bytes with the value 0xFFFFFFFF to the MD5 hash function. 1732 if rev >= 3 and not metadata_encrypt: 1733 m.update("\xff\xff\xff\xff") 1734 # 7. Finish the hash. 1735 md5_hash = m.digest() 1736 # 8. (Revision 3 or greater) Do the following 50 times: Take the output 1737 # from the previous MD5 hash and pass the first n bytes of the output as 1738 # input into a new MD5 hash, where n is the number of bytes of the 1739 # encryption key as defined by the value of the encryption dictionary's 1740 # /Length entry. 1741 if rev >= 3: 1742 for i in range(50): 1743 md5_hash = md5(md5_hash[:keylen]).digest() 1744 # 9. Set the encryption key to the first n bytes of the output from the 1745 # final MD5 hash, where n is always 5 for revision 2 but, for revision 3 or 1746 # greater, depends on the value of the encryption dictionary's /Length 1747 # entry. 1748 return md5_hash[:keylen] 1749 1750# Implementation of algorithm 3.3 of the PDF standard security handler, 1751# section 3.5.2 of the PDF 1.6 reference. 1752def _alg33(owner_pwd, user_pwd, rev, keylen): 1753 # steps 1 - 4 1754 key = _alg33_1(owner_pwd, rev, keylen) 1755 # 5. Pad or truncate the user password string as described in step 1 of 1756 # algorithm 3.2. 1757 user_pwd = (user_pwd + _encryption_padding)[:32] 1758 # 6. Encrypt the result of step 5, using an RC4 encryption function with 1759 # the encryption key obtained in step 4. 1760 val = utils.RC4_encrypt(key, user_pwd) 1761 # 7. (Revision 3 or greater) Do the following 19 times: Take the output 1762 # from the previous invocation of the RC4 function and pass it as input to 1763 # a new invocation of the function; use an encryption key generated by 1764 # taking each byte of the encryption key obtained in step 4 and performing 1765 # an XOR operation between that byte and the single-byte value of the 1766 # iteration counter (from 1 to 19). 1767 if rev >= 3: 1768 for i in range(1, 20): 1769 new_key = '' 1770 for l in range(len(key)): 1771 new_key += chr(ord(key[l]) ^ i) 1772 val = utils.RC4_encrypt(new_key, val) 1773 # 8. Store the output from the final invocation of the RC4 as the value of 1774 # the /O entry in the encryption dictionary. 1775 return val 1776 1777# Steps 1-4 of algorithm 3.3 1778def _alg33_1(password, rev, keylen): 1779 # 1. Pad or truncate the owner password string as described in step 1 of 1780 # algorithm 3.2. If there is no owner password, use the user password 1781 # instead. 1782 password = (password + _encryption_padding)[:32] 1783 # 2. Initialize the MD5 hash function and pass the result of step 1 as 1784 # input to this function. 1785 m = md5(password) 1786 # 3. (Revision 3 or greater) Do the following 50 times: Take the output 1787 # from the previous MD5 hash and pass it as input into a new MD5 hash. 1788 md5_hash = m.digest() 1789 if rev >= 3: 1790 for i in range(50): 1791 md5_hash = md5(md5_hash).digest() 1792 # 4. Create an RC4 encryption key using the first n bytes of the output 1793 # from the final MD5 hash, where n is always 5 for revision 2 but, for 1794 # revision 3 or greater, depends on the value of the encryption 1795 # dictionary's /Length entry. 1796 key = md5_hash[:keylen] 1797 return key 1798 1799# Implementation of algorithm 3.4 of the PDF standard security handler, 1800# section 3.5.2 of the PDF 1.6 reference. 1801def _alg34(password, owner_entry, p_entry, id1_entry): 1802 # 1. Create an encryption key based on the user password string, as 1803 # described in algorithm 3.2. 1804 key = _alg32(password, 2, 5, owner_entry, p_entry, id1_entry) 1805 # 2. Encrypt the 32-byte padding string shown in step 1 of algorithm 3.2, 1806 # using an RC4 encryption function with the encryption key from the 1807 # preceding step. 1808 U = utils.RC4_encrypt(key, _encryption_padding) 1809 # 3. Store the result of step 2 as the value of the /U entry in the 1810 # encryption dictionary. 1811 return U, key 1812 1813# Implementation of algorithm 3.4 of the PDF standard security handler, 1814# section 3.5.2 of the PDF 1.6 reference. 1815def _alg35(password, rev, keylen, owner_entry, p_entry, id1_entry, metadata_encrypt): 1816 # 1. Create an encryption key based on the user password string, as 1817 # described in Algorithm 3.2. 1818 key = _alg32(password, rev, keylen, owner_entry, p_entry, id1_entry) 1819 # 2. Initialize the MD5 hash function and pass the 32-byte padding string 1820 # shown in step 1 of Algorithm 3.2 as input to this function. 1821 m = md5() 1822 m.update(_encryption_padding) 1823 # 3. Pass the first element of the file's file identifier array (the value 1824 # of the ID entry in the document's trailer dictionary; see Table 3.13 on 1825 # page 73) to the hash function and finish the hash. (See implementation 1826 # note 25 in Appendix H.) 1827 m.update(id1_entry) 1828 md5_hash = m.digest() 1829 # 4. Encrypt the 16-byte result of the hash, using an RC4 encryption 1830 # function with the encryption key from step 1. 1831 val = utils.RC4_encrypt(key, md5_hash) 1832 # 5. Do the following 19 times: Take the output from the previous 1833 # invocation of the RC4 function and pass it as input to a new invocation 1834 # of the function; use an encryption key generated by taking each byte of 1835 # the original encryption key (obtained in step 2) and performing an XOR 1836 # operation between that byte and the single-byte value of the iteration 1837 # counter (from 1 to 19). 1838 for i in range(1, 20): 1839 new_key = '' 1840 for l in range(len(key)): 1841 new_key += chr(ord(key[l]) ^ i) 1842 val = utils.RC4_encrypt(new_key, val) 1843 # 6. Append 16 bytes of arbitrary padding to the output from the final 1844 # invocation of the RC4 function and store the 32-byte result as the value 1845 # of the U entry in the encryption dictionary. 1846 # (implementator note: I don't know what "arbitrary padding" is supposed to 1847 # mean, so I have used null bytes. This seems to match a few other 1848 # people's implementations) 1849 return val + ('\x00' * 16), key 1850 1851#if __name__ == "__main__": 1852# output = PdfFileWriter() 1853# 1854# input1 = PdfFileReader(file("test\\5000-s1-05e.pdf", "rb")) 1855# page1 = input1.getPage(0) 1856# 1857# input2 = PdfFileReader(file("test\\PDFReference16.pdf", "rb")) 1858# page2 = input2.getPage(0) 1859# page3 = input2.getPage(1) 1860# page1.mergePage(page2) 1861# page1.mergePage(page3) 1862# 1863# input3 = PdfFileReader(file("test\\cc-cc.pdf", "rb")) 1864# page1.mergePage(input3.getPage(0)) 1865# 1866# page1.compressContentStreams() 1867# 1868# output.addPage(page1) 1869# output.write(file("test\\merge-test.pdf", "wb")) 1870 1871 1872