1# -*- coding: utf-8 -*-
2#
3# vim: sw=4:expandtab:foldmethod=marker
4#
5# Copyright (c) 2006, Mathieu Fenniak
6# Copyright (c) 2007, Ashish Kulkarni <kulkarni.ashish@gmail.com>
7#
8# All rights reserved.
9#
10# Redistribution and use in source and binary forms, with or without
11# modification, are permitted provided that the following conditions are
12# met:
13#
14# * Redistributions of source code must retain the above copyright notice,
15# this list of conditions and the following disclaimer.
16# * Redistributions in binary form must reproduce the above copyright notice,
17# this list of conditions and the following disclaimer in the documentation
18# and/or other materials provided with the distribution.
19# * The name of the author may not be used to endorse or promote products
20# derived from this software without specific prior written permission.
21#
22# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
23# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
24# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
25# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
26# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
27# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
28# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
29# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
30# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
31# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
32# POSSIBILITY OF SUCH DAMAGE.
33
34
35"""
36A pure-Python PDF library with very minimal capabilities.  It was designed to
37be able to split and merge PDF files by page, and that's about all it can do.
38It may be a solid base for future PDF file work in Python.
39"""
40__author__ = "Mathieu Fenniak"
41__author_email__ = "biziqe@mathieu.fenniak.net"
42
43import math
44import struct
45from sys import version_info
46try:
47    from cStringIO import StringIO
48except ImportError:
49    from StringIO import StringIO
50
51import filters
52import utils
53import warnings
54from generic import *
55from utils import readNonWhitespace, readUntilWhitespace, ConvertFunctionsToVirtualList
56
57if version_info < ( 2, 4 ):
58   from sets import ImmutableSet as frozenset
59
60if version_info < ( 2, 5 ):
61    from md5 import md5
62else:
63    from hashlib import md5
64
65##
66# This class supports writing PDF files out, given pages produced by another
67# class (typically {@link #PdfFileReader PdfFileReader}).
class PdfFileWriter(object):
    ##
    # Creates an empty document made of three indirect objects: the root of
    # the page tree, the document information dictionary and the catalog.
    def __init__(self):
        self._header = "%PDF-1.3"
        self._objects = []  # array of indirect objects

        # The root of our page tree node.
        pages = DictionaryObject()
        pages.update({
                NameObject("/Type"): NameObject("/Pages"),
                NameObject("/Count"): NumberObject(0),
                NameObject("/Kids"): ArrayObject(),
                })
        self._pages = self._addObject(pages)

        # info object
        info = DictionaryObject()
        info.update({
                NameObject("/Producer"): createStringObject(u"Python PDF Library - http://pybrary.net/pyPdf/")
                })
        self._info = self._addObject(info)

        # root object
        root = DictionaryObject()
        root.update({
            NameObject("/Type"): NameObject("/Catalog"),
            NameObject("/Pages"): self._pages,
            })
        self._root = self._addObject(root)

    ##
    # Stores an object in this writer.
    #
    # @param obj The object to store.
    # @return An IndirectObject whose id number is the 1-based position of
    #         the object in the writer's object list (generation 0).
    def _addObject(self, obj):
        self._objects.append(obj)
        return IndirectObject(len(self._objects), 0, self)

    ##
    # Resolves an indirect reference that belongs to this writer.
    #
    # @param ido The reference to resolve.  Its pdf attribute must be this
    #            writer, otherwise ValueError is raised.
    # @return The object stored under the reference's id number.
    def getObject(self, ido):
        if ido.pdf != self:
            raise ValueError("pdf must be self")
        return self._objects[ido.idnum - 1]

    ##
    # Common method for inserting or adding a page to this PDF file.
    #
    # @param page The page to add to the document.  This argument should be
    #             an instance of {@link #PageObject PageObject}.
    # @param action The function which will insert the page in the list of
    #               kids of the page tree root.
    #               Takes: page list, page to add.
    def _addPage(self, page, action):
        assert page["/Type"] == "/Page"
        page[NameObject("/Parent")] = self._pages
        pageRef = self._addObject(page)
        pageTree = self.getObject(self._pages)
        action(pageTree["/Kids"], pageRef)
        pageTree[NameObject("/Count")] = NumberObject(pageTree["/Count"] + 1)

    ##
    # Adds a page to this PDF file.  The page is usually acquired from a
    # {@link #PdfFileReader PdfFileReader} instance.
    # <p>
    # Stability: Added in v1.0, will exist for all v1.x releases.
    #
    # @param page The page to add to the document.  This argument should be
    #             an instance of {@link #PageObject PageObject}.
    def addPage(self, page):
        self._addPage(page, list.append)

    ##
    # Insert a page in this PDF file.  The page is usually acquired from a
    # {@link #PdfFileReader PdfFileReader} instance.
    #
    # @param page The page to add to the document.  This argument should be
    #             an instance of {@link #PageObject PageObject}.
    # @param index Position at which the page will be inserted.
    def insertPage(self, page, index=0):
        self._addPage(page, lambda kids, ref: kids.insert(index, ref))

    ##
    # Retrieves a page by number from this PDF file.
    # @return Returns a {@link #PageObject PageObject} instance.
    def getPage(self, pageNumber):
        pageTree = self.getObject(self._pages)
        # XXX: crude hack
        return pageTree["/Kids"][pageNumber].getObject()

    ##
    # Return the number of pages.
    # @return The number of pages.
    def getNumPages(self):
        pageTree = self.getObject(self._pages)
        return int(pageTree[NameObject("/Count")])

    ##
    # Append a blank page to this PDF file and returns it. If no page size
    # is specified, use the size of the last page; throw
    # PageSizeNotDefinedError if it doesn't exist.
    # @param width The width of the new page expressed in default user
    # space units.
    # @param height The height of the new page expressed in default user
    # space units.
    def addBlankPage(self, width=None, height=None):
        page = PageObject.createBlankPage(self, width, height)
        self.addPage(page)
        return page

    ##
    # Insert a blank page to this PDF file and returns it. If no page size
    # is specified, use the size of the page in the given index; throw
    # PageSizeNotDefinedError if it doesn't exist.
    # @param width  The width of the new page expressed in default user
    #               space units.
    # @param height The height of the new page expressed in default user
    #               space units.
    # @param index  Position to add the page.
    def insertBlankPage(self, width=None, height=None, index=0):
        # The parentheses matter: "and" binds tighter than "or", so without
        # them a missing width alone would trigger the page lookup even when
        # no page exists at the index, failing with IndexError instead of
        # letting createBlankPage handle the undefined size.
        sizeMissing = width is None or height is None
        if sizeMissing and (self.getNumPages() - 1) >= index:
            oldpage = self.getPage(index)
            width = oldpage.mediaBox.getWidth()
            height = oldpage.mediaBox.getHeight()
        page = PageObject.createBlankPage(self, width, height)
        self.insertPage(page, index)
        return page

    ##
    # Encrypt this PDF file with the PDF Standard encryption handler.
    # @param user_pwd The "user password", which allows for opening and reading
    # the PDF file with the restrictions provided.
    # @param owner_pwd The "owner password", which allows for opening the PDF
    # files without any restrictions.  By default, the owner password is the
    # same as the user password.
    # @param use_128bit Boolean argument as to whether to use 128bit
    # encryption.  When false, 40bit encryption will be used.  By default, this
    # flag is on.
    def encrypt(self, user_pwd, owner_pwd = None, use_128bit = True):
        import time
        import random
        if owner_pwd is None:
            owner_pwd = user_pwd
        if use_128bit:
            V = 2
            rev = 3
            keylen = 128 // 8
        else:
            V = 1
            rev = 2
            keylen = 40 // 8
        # permit everything:
        P = -1
        O = ByteStringObject(_alg33(owner_pwd, user_pwd, rev, keylen))
        # The two file identifiers are derived from the current time and a
        # random number; the first one also feeds the key computation below.
        ID_1 = md5(repr(time.time())).digest()
        ID_2 = md5(repr(random.random())).digest()
        self._ID = ArrayObject((ByteStringObject(ID_1), ByteStringObject(ID_2)))
        if rev == 2:
            U, key = _alg34(user_pwd, O, P, ID_1)
        else:
            assert rev == 3
            U, key = _alg35(user_pwd, rev, keylen, O, P, ID_1, False)
        encrypt = DictionaryObject()
        encrypt[NameObject("/Filter")] = NameObject("/Standard")
        encrypt[NameObject("/V")] = NumberObject(V)
        if V == 2:
            encrypt[NameObject("/Length")] = NumberObject(keylen * 8)
        encrypt[NameObject("/R")] = NumberObject(rev)
        encrypt[NameObject("/O")] = ByteStringObject(O)
        encrypt[NameObject("/U")] = ByteStringObject(U)
        encrypt[NameObject("/P")] = NumberObject(P)
        self._encrypt = self._addObject(encrypt)
        self._encrypt_key = key

    ##
    # Writes the collection of pages added to this object out as a PDF file.
    # <p>
    # Stability: Added in v1.0, will exist for all v1.x releases.
    # @param stream An object to write the file to.  The object must support
    # the write method, and the tell method, similar to a file object.
    def write(self, stream):
        externalReferenceMap = {}

        # PDF objects sometimes have circular references to their /Page objects
        # inside their object tree (for example, annotations).  Those will be
        # indirect references to objects that we've recreated in this PDF.  To
        # address this problem, PageObject's store their original object
        # reference number, and we add it to the external reference map before
        # we sweep for indirect references.  This forces self-page-referencing
        # trees to reference the correct new object location, rather than
        # copying in a new copy of the page object.
        for objIndex, obj in enumerate(self._objects):
            if isinstance(obj, PageObject) and obj.indirectRef is not None:
                data = obj.indirectRef
                byGeneration = externalReferenceMap.setdefault(data.pdf, {})
                byIdnum = byGeneration.setdefault(data.generation, {})
                byIdnum[data.idnum] = IndirectObject(objIndex + 1, 0, self)

        # self.stack only exists for the duration of the sweep; it is removed
        # even when the sweep fails so a later write() starts clean.
        self.stack = []
        try:
            self._sweepIndirectReferences(externalReferenceMap, self._root)
        finally:
            del self.stack

        # Begin writing:
        encrypted = hasattr(self, "_encrypt")
        object_positions = []
        stream.write(self._header + "\n")
        for i, obj in enumerate(self._objects):
            idnum = i + 1
            object_positions.append(stream.tell())
            stream.write(str(idnum) + " 0 obj\n")
            key = None
            # The encryption dictionary itself is written with no key.
            if encrypted and idnum != self._encrypt.idnum:
                # Per-object key: file key + low 3 bytes of the object number
                # + low 2 bytes of the generation (always 0 here), hashed.
                pack1 = struct.pack("<i", idnum)[:3]
                pack2 = struct.pack("<i", 0)[:2]
                key = self._encrypt_key + pack1 + pack2
                assert len(key) == (len(self._encrypt_key) + 5)
                md5_hash = md5(key).digest()
                key = md5_hash[:min(16, len(self._encrypt_key) + 5)]
            obj.writeToStream(stream, key)
            stream.write("\nendobj\n")

        # xref table
        xref_location = stream.tell()
        stream.write("xref\n")
        stream.write("0 %s\n" % (len(self._objects) + 1))
        stream.write("%010d %05d f \n" % (0, 65535))
        for offset in object_positions:
            stream.write("%010d %05d n \n" % (offset, 0))

        # trailer
        stream.write("trailer\n")
        trailer = DictionaryObject()
        trailer.update({
                NameObject("/Size"): NumberObject(len(self._objects) + 1),
                NameObject("/Root"): self._root,
                NameObject("/Info"): self._info,
                })
        if hasattr(self, "_ID"):
            trailer[NameObject("/ID")] = self._ID
        if encrypted:
            trailer[NameObject("/Encrypt")] = self._encrypt
        trailer.writeToStream(stream, None)

        # eof
        stream.write("\nstartxref\n%s\n%%%%EOF\n" % (xref_location))

    ##
    # Walks an object tree and rewrites it in place so that every indirect
    # reference points into this writer.  Objects referenced from other PDF
    # files are copied into this writer the first time they are met.
    #
    # @param externMap Nested dict (source pdf -> generation -> idnum) giving
    #                  the reference in this writer that replaces an
    #                  external reference; it is extended during the walk.
    # @param data The object to process.
    # @return The object to store in place of data: data itself, or a
    #         reference owned by this writer when data was an external
    #         reference.
    def _sweepIndirectReferences(self, externMap, data):
        if isinstance(data, DictionaryObject):
            for key, value in data.items():
                value = self._sweepIndirectReferences(externMap, value)
                if isinstance(value, StreamObject):
                    # a dictionary value is a stream.  streams must be indirect
                    # objects, so we need to change this value.
                    value = self._addObject(value)
                data[key] = value
            return data
        elif isinstance(data, ArrayObject):
            for i in range(len(data)):
                value = self._sweepIndirectReferences(externMap, data[i])
                if isinstance(value, StreamObject):
                    # an array value is a stream.  streams must be indirect
                    # objects, so we need to change this value
                    value = self._addObject(value)
                data[i] = value
            return data
        elif isinstance(data, IndirectObject):
            # internal indirect references are fine
            if data.pdf == self:
                # self.stack holds the ids currently being swept; meeting one
                # again means a reference cycle, so stop descending.
                if data.idnum not in self.stack:
                    self.stack.append(data.idnum)
                    realdata = self.getObject(data)
                    self._sweepIndirectReferences(externMap, realdata)
                    self.stack.pop()
                return data
            else:
                newobj = externMap.get(data.pdf, {}).get(data.generation, {}).get(data.idnum, None)
                if newobj is None:
                    newobj = data.pdf.getObject(data)
                    self._objects.append(None) # placeholder
                    idnum = len(self._objects)
                    newobj_ido = IndirectObject(idnum, 0, self)
                    # Register the replacement before recursing so that
                    # references back to this object reuse it instead of
                    # copying it again.
                    byGeneration = externMap.setdefault(data.pdf, {})
                    byIdnum = byGeneration.setdefault(data.generation, {})
                    byIdnum[data.idnum] = newobj_ido
                    newobj = self._sweepIndirectReferences(externMap, newobj)
                    self._objects[idnum-1] = newobj
                    return newobj_ido
                return newobj
        else:
            return data
360
361
362##
363# Initializes a PdfFileReader object.  This operation can take some time, as
364# the PDF stream's cross-reference tables are read into memory.
365# <p>
366# Stability: Added in v1.0, will exist for all v1.x releases.
367#
368# @param stream An object that supports the standard read and seek methods
369#               similar to a file object.
370class PdfFileReader(object):
    def __init__(self, stream):
        """Parse the cross-reference data of *stream* and set up the caches.

        stream -- an object that supports the standard read and seek
                  methods, similar to a file object.
        """
        # Flat page list; stays None until _flatten() fills it on first use.
        self.flattenedPages = None
        # Cache of resolved indirect objects: generation -> idnum -> object.
        self.resolvedObjects = {}
        # NOTE(review): read() runs before self.stream is assigned, so code
        # reached from read() cannot rely on self.stream -- confirm this
        # ordering is intentional.
        self.read(stream)
        self.stream = stream
        # While True, getObject() returns objects without decrypting them.
        self._override_encryption = False
377
378    ##
379    # Retrieves the PDF file's document information dictionary, if it exists.
380    # Note that some PDF files use metadata streams instead of docinfo
381    # dictionaries, and these metadata streams will not be accessed by this
382    # function.
383    # <p>
384    # Stability: Added in v1.6, will exist for all future v1.x releases.
385    # @return Returns a {@link #DocumentInformation DocumentInformation}
386    #         instance, or None if none exists.
387    def getDocumentInfo(self):
388        if not self.trailer.has_key("/Info"):
389            return None
390        obj = self.trailer['/Info']
391        retval = DocumentInformation()
392        retval.update(obj)
393        return retval
394
395    ##
396    # Read-only property that accesses the {@link
397    # #PdfFileReader.getDocumentInfo getDocumentInfo} function.
398    # <p>
399    # Stability: Added in v1.7, will exist for all future v1.x releases.
400    documentInfo = property(lambda self: self.getDocumentInfo(), None, None)
401
402    ##
403    # Retrieves XMP (Extensible Metadata Platform) data from the PDF document
404    # root.
405    # <p>
406    # Stability: Added in v1.12, will exist for all future v1.x releases.
    # @return Returns a {@link #generic.XmpInformation XmpInformation}
408    # instance that can be used to access XMP metadata from the document.
409    # Can also return None if no metadata was found on the document root.
410    def getXmpMetadata(self):
411        try:
412            self._override_encryption = True
413            return self.trailer["/Root"].getXmpMetadata()
414        finally:
415            self._override_encryption = False
416
417    ##
    # Read-only property that accesses the {@link
    # #PdfFileReader.getXmpMetadata getXmpMetadata} function.
420    # <p>
421    # Stability: Added in v1.12, will exist for all future v1.x releases.
422    xmpMetadata = property(lambda self: self.getXmpMetadata(), None, None)
423
424    ##
425    # Calculates the number of pages in this PDF file.
426    # <p>
427    # Stability: Added in v1.0, will exist for all v1.x releases.
428    # @return Returns an integer.
429    def getNumPages(self):
430        if self.flattenedPages == None:
431            self._flatten()
432        return len(self.flattenedPages)
433
434    ##
435    # Read-only property that accesses the {@link #PdfFileReader.getNumPages
436    # getNumPages} function.
437    # <p>
438    # Stability: Added in v1.7, will exist for all future v1.x releases.
439    numPages = property(lambda self: self.getNumPages(), None, None)
440
441    ##
442    # Retrieves a page by number from this PDF file.
443    # <p>
444    # Stability: Added in v1.0, will exist for all v1.x releases.
445    # @return Returns a {@link #PageObject PageObject} instance.
446    def getPage(self, pageNumber):
447        ## ensure that we're not trying to access an encrypted PDF
448        #assert not self.trailer.has_key("/Encrypt")
449        if self.flattenedPages == None:
450            self._flatten()
451        return self.flattenedPages[pageNumber]
452
453    ##
454    # Read-only property that accesses the
455    # {@link #PdfFileReader.getNamedDestinations
456    # getNamedDestinations} function.
457    # <p>
458    # Stability: Added in v1.10, will exist for all future v1.x releases.
459    namedDestinations = property(lambda self:
460                                  self.getNamedDestinations(), None, None)
461
462    ##
463    # Retrieves the named destinations present in the document.
464    # <p>
465    # Stability: Added in v1.10, will exist for all future v1.x releases.
466    # @return Returns a dict which maps names to {@link #Destination
467    # destinations}.
468    def getNamedDestinations(self, tree=None, retval=None):
469        if retval == None:
470            retval = {}
471            catalog = self.trailer["/Root"]
472
473            # get the name tree
474            if catalog.has_key("/Dests"):
475                tree = catalog["/Dests"]
476            elif catalog.has_key("/Names"):
477                names = catalog['/Names']
478                if names.has_key("/Dests"):
479                    tree = names['/Dests']
480
481        if tree == None:
482            return retval
483
484        if tree.has_key("/Kids"):
485            # recurse down the tree
486            for kid in tree["/Kids"]:
487                self.getNamedDestinations(kid.getObject(), retval)
488
489        if tree.has_key("/Names"):
490            names = tree["/Names"]
491            for i in range(0, len(names), 2):
492                key = names[i].getObject()
493                val = names[i+1].getObject()
494                if isinstance(val, DictionaryObject) and val.has_key('/D'):
495                    val = val['/D']
496                dest = self._buildDestination(key, val)
497                if dest != None:
498                    retval[key] = dest
499
500        return retval
501
502    ##
503    # Read-only property that accesses the {@link #PdfFileReader.getOutlines
504    # getOutlines} function.
505    # <p>
506    # Stability: Added in v1.10, will exist for all future v1.x releases.
507    outlines = property(lambda self: self.getOutlines(), None, None)
508
509    ##
510    # Retrieves the document outline present in the document.
511    # <p>
512    # Stability: Added in v1.10, will exist for all future v1.x releases.
513    # @return Returns a nested list of {@link #Destination destinations}.
514    def getOutlines(self, node=None, outlines=None):
515        if outlines == None:
516            outlines = []
517            catalog = self.trailer["/Root"]
518
519            # get the outline dictionary and named destinations
520            if catalog.has_key("/Outlines"):
521                lines = catalog["/Outlines"]
522                if lines.has_key("/First"):
523                    node = lines["/First"]
524            self._namedDests = self.getNamedDestinations()
525
526        if node == None:
527          return outlines
528
529        # see if there are any more outlines
530        while 1:
531            outline = self._buildOutline(node)
532            if outline:
533                outlines.append(outline)
534
535            # check for sub-outlines
536            if node.has_key("/First"):
537                subOutlines = []
538                self.getOutlines(node["/First"], subOutlines)
539                if subOutlines:
540                    outlines.append(subOutlines)
541
542            if not node.has_key("/Next"):
543                break
544            node = node["/Next"]
545
546        return outlines
547
548    def _buildDestination(self, title, array):
549        page, typ = array[0:2]
550        array = array[2:]
551        return Destination(title, page, typ, *array)
552
553    def _buildOutline(self, node):
554        dest, title, outline = None, None, None
555
556        if node.has_key("/A") and node.has_key("/Title"):
557            # Action, section 8.5 (only type GoTo supported)
558            title  = node["/Title"]
559            action = node["/A"]
560            if action["/S"] == "/GoTo":
561                dest = action["/D"]
562        elif node.has_key("/Dest") and node.has_key("/Title"):
563            # Destination, section 8.2.1
564            title = node["/Title"]
565            dest  = node["/Dest"]
566
567        # if destination found, then create outline
568        if dest:
569            if isinstance(dest, ArrayObject):
570                outline = self._buildDestination(title, dest)
571            elif isinstance(dest, unicode) and self._namedDests.has_key(dest):
572                outline = self._namedDests[dest]
573                outline[NameObject("/Title")] = title
574            else:
575                raise utils.PdfReadError("Unexpected destination %r" % dest)
576        return outline
577
578    ##
579    # Read-only property that emulates a list based upon the {@link
580    # #PdfFileReader.getNumPages getNumPages} and {@link #PdfFileReader.getPage
581    # getPage} functions.
582    # <p>
583    # Stability: Added in v1.7, and will exist for all future v1.x releases.
584    pages = property(lambda self: ConvertFunctionsToVirtualList(self.getNumPages, self.getPage),
585            None, None)
586
587    def _flatten(self, pages=None, inherit=None, indirectRef=None):
588        inheritablePageAttributes = (
589            NameObject("/Resources"), NameObject("/MediaBox"),
590            NameObject("/CropBox"), NameObject("/Rotate")
591            )
592        if inherit == None:
593            inherit = dict()
594        if pages == None:
595            self.flattenedPages = []
596            catalog = self.trailer["/Root"].getObject()
597            pages = catalog["/Pages"].getObject()
598        t = pages["/Type"]
599        if t == "/Pages":
600            for attr in inheritablePageAttributes:
601                if pages.has_key(attr):
602                    inherit[attr] = pages[attr]
603            for page in pages["/Kids"]:
604                addt = {}
605                if isinstance(page, IndirectObject):
606                    addt["indirectRef"] = page
607                self._flatten(page.getObject(), inherit, **addt)
608        elif t == "/Page":
609            for attr,value in inherit.items():
610                # if the page has it's own value, it does not inherit the
611                # parent's value:
612                if not pages.has_key(attr):
613                    pages[attr] = value
614            pageObj = PageObject(self, indirectRef)
615            pageObj.update(pages)
616            self.flattenedPages.append(pageObj)
617
618    def getObject(self, indirectReference):
619        retval = self.resolvedObjects.get(indirectReference.generation, {}).get(indirectReference.idnum, None)
620        if retval != None:
621            return retval
622        if indirectReference.generation == 0 and \
623           self.xref_objStm.has_key(indirectReference.idnum):
624            # indirect reference to object in object stream
625            # read the entire object stream into memory
626            stmnum,idx = self.xref_objStm[indirectReference.idnum]
627            objStm = IndirectObject(stmnum, 0, self).getObject()
628            assert objStm['/Type'] == '/ObjStm'
629            assert idx < objStm['/N']
630            streamData = StringIO(objStm.getData())
631            for i in range(objStm['/N']):
632                objnum = NumberObject.readFromStream(streamData)
633                readNonWhitespace(streamData)
634                streamData.seek(-1, 1)
635                offset = NumberObject.readFromStream(streamData)
636                readNonWhitespace(streamData)
637                streamData.seek(-1, 1)
638                t = streamData.tell()
639                streamData.seek(objStm['/First']+offset, 0)
640                obj = readObject(streamData, self)
641                self.resolvedObjects[0][objnum] = obj
642                streamData.seek(t, 0)
643            return self.resolvedObjects[0][indirectReference.idnum]
644        start = self.xref[indirectReference.generation][indirectReference.idnum]
645        self.stream.seek(start, 0)
646        idnum, generation = self.readObjectHeader(self.stream)
647        assert idnum == indirectReference.idnum
648        assert generation == indirectReference.generation
649        retval = readObject(self.stream, self)
650
651        # override encryption is used for the /Encrypt dictionary
652        if not self._override_encryption and self.isEncrypted:
653            # if we don't have the encryption key:
654            if not hasattr(self, '_decryption_key'):
655                raise Exception, "file has not been decrypted"
656            # otherwise, decrypt here...
657            import struct
658            pack1 = struct.pack("<i", indirectReference.idnum)[:3]
659            pack2 = struct.pack("<i", indirectReference.generation)[:2]
660            key = self._decryption_key + pack1 + pack2
661            assert len(key) == (len(self._decryption_key) + 5)
662            md5_hash = md5(key).digest()
663            key = md5_hash[:min(16, len(self._decryption_key) + 5)]
664            retval = self._decryptObject(retval, key)
665
666        self.cacheIndirectObject(generation, idnum, retval)
667        return retval
668
669    def _decryptObject(self, obj, key):
670        if isinstance(obj, ByteStringObject) or isinstance(obj, TextStringObject):
671            obj = createStringObject(utils.RC4_encrypt(key, obj.original_bytes))
672        elif isinstance(obj, StreamObject):
673            obj._data = utils.RC4_encrypt(key, obj._data)
674        elif isinstance(obj, DictionaryObject):
675            for dictkey, value in obj.items():
676                obj[dictkey] = self._decryptObject(value, key)
677        elif isinstance(obj, ArrayObject):
678            for i in range(len(obj)):
679                obj[i] = self._decryptObject(obj[i], key)
680        return obj
681
682    def readObjectHeader(self, stream):
683        # Should never be necessary to read out whitespace, since the
684        # cross-reference table should put us in the right spot to read the
685        # object header.  In reality... some files have stupid cross reference
686        # tables that are off by whitespace bytes.
687        readNonWhitespace(stream); stream.seek(-1, 1)
688        idnum = readUntilWhitespace(stream)
689        generation = readUntilWhitespace(stream)
690        obj = stream.read(3)
691        readNonWhitespace(stream)
692        stream.seek(-1, 1)
693        return int(idnum), int(generation)
694
695    def cacheIndirectObject(self, generation, idnum, obj):
696        if not self.resolvedObjects.has_key(generation):
697            self.resolvedObjects[generation] = {}
698        self.resolvedObjects[generation][idnum] = obj
699
700    def read(self, stream):
701        # start at the end:
702        stream.seek(-1, 2)
703        line = ''
704        while not line:
705            line = self.readNextEndLine(stream)
706        if line[:5] != "%%EOF":
707            raise utils.PdfReadError, "EOF marker not found"
708
709        # find startxref entry - the location of the xref table
710        line = self.readNextEndLine(stream)
711        startxref = int(line)
712        line = self.readNextEndLine(stream)
713        if line[:9] != "startxref":
714            raise utils.PdfReadError, "startxref not found"
715
716        # read all cross reference tables and their trailers
717        self.xref = {}
718        self.xref_objStm = {}
719        self.trailer = DictionaryObject()
720        while 1:
721            # load the xref table
722            stream.seek(startxref, 0)
723            x = stream.read(1)
724            if x == "x":
725                # standard cross-reference table
726                ref = stream.read(4)
727                if ref[:3] != "ref":
728                    raise utils.PdfReadError, "xref table read error"
729                readNonWhitespace(stream)
730                stream.seek(-1, 1)
731                while 1:
732                    num = readObject(stream, self)
733                    readNonWhitespace(stream)
734                    stream.seek(-1, 1)
735                    size = readObject(stream, self)
736                    readNonWhitespace(stream)
737                    stream.seek(-1, 1)
738                    cnt = 0
739                    while cnt < size:
740                        line = stream.read(20)
741                        # It's very clear in section 3.4.3 of the PDF spec
742                        # that all cross-reference table lines are a fixed
743                        # 20 bytes.  However... some malformed PDF files
744                        # use a single character EOL without a preceeding
745                        # space.  Detect that case, and seek the stream
746                        # back one character.  (0-9 means we've bled into
747                        # the next xref entry, t means we've bled into the
748                        # text "trailer"):
749                        if line[-1] in "0123456789t":
750                            stream.seek(-1, 1)
751                        offset, generation = line[:16].split(" ")
752                        offset, generation = int(offset), int(generation)
753                        if not self.xref.has_key(generation):
754                            self.xref[generation] = {}
755                        if self.xref[generation].has_key(num):
756                            # It really seems like we should allow the last
757                            # xref table in the file to override previous
758                            # ones. Since we read the file backwards, assume
759                            # any existing key is already set correctly.
760                            pass
761                        else:
762                            self.xref[generation][num] = offset
763                        cnt += 1
764                        num += 1
765                    readNonWhitespace(stream)
766                    stream.seek(-1, 1)
767                    trailertag = stream.read(7)
768                    if trailertag != "trailer":
769                        # more xrefs!
770                        stream.seek(-7, 1)
771                    else:
772                        break
773                readNonWhitespace(stream)
774                stream.seek(-1, 1)
775                newTrailer = readObject(stream, self)
776                for key, value in newTrailer.items():
777                    if not self.trailer.has_key(key):
778                        self.trailer[key] = value
779                if newTrailer.has_key("/Prev"):
780                    startxref = newTrailer["/Prev"]
781                else:
782                    break
783            elif x.isdigit():
784                # PDF 1.5+ Cross-Reference Stream
785                stream.seek(-1, 1)
786                idnum, generation = self.readObjectHeader(stream)
787                xrefstream = readObject(stream, self)
788                assert xrefstream["/Type"] == "/XRef"
789                self.cacheIndirectObject(generation, idnum, xrefstream)
790                streamData = StringIO(xrefstream.getData())
791                idx_pairs = xrefstream.get("/Index", [0, xrefstream.get("/Size")])
792                entrySizes = xrefstream.get("/W")
793                for num, size in self._pairs(idx_pairs):
794                    cnt = 0
795                    while cnt < size:
796                        for i in range(len(entrySizes)):
797                            d = streamData.read(entrySizes[i])
798                            di = convertToInt(d, entrySizes[i])
799                            if i == 0:
800                                xref_type = di
801                            elif i == 1:
802                                if xref_type == 0:
803                                    next_free_object = di
804                                elif xref_type == 1:
805                                    byte_offset = di
806                                elif xref_type == 2:
807                                    objstr_num = di
808                            elif i == 2:
809                                if xref_type == 0:
810                                    next_generation = di
811                                elif xref_type == 1:
812                                    generation = di
813                                elif xref_type == 2:
814                                    obstr_idx = di
815                        if xref_type == 0:
816                            pass
817                        elif xref_type == 1:
818                            if not self.xref.has_key(generation):
819                                self.xref[generation] = {}
820                            if not num in self.xref[generation]:
821                                self.xref[generation][num] = byte_offset
822                        elif xref_type == 2:
823                            if not num in self.xref_objStm:
824                                self.xref_objStm[num] = [objstr_num, obstr_idx]
825                        cnt += 1
826                        num += 1
827                trailerKeys = "/Root", "/Encrypt", "/Info", "/ID"
828                for key in trailerKeys:
829                    if xrefstream.has_key(key) and not self.trailer.has_key(key):
830                        self.trailer[NameObject(key)] = xrefstream.raw_get(key)
831                if xrefstream.has_key("/Prev"):
832                    startxref = xrefstream["/Prev"]
833                else:
834                    break
835            else:
836                # bad xref character at startxref.  Let's see if we can find
837                # the xref table nearby, as we've observed this error with an
838                # off-by-one before.
839                stream.seek(-11, 1)
840                tmp = stream.read(20)
841                xref_loc = tmp.find("xref")
842                if xref_loc != -1:
843                    startxref -= (10 - xref_loc)
844                    continue
845                else:
846                    # no xref table found at specified location
847                    assert False
848                    break
849
850    def _pairs(self, array):
851        i = 0
852        while True:
853            yield array[i], array[i+1]
854            i += 2
855            if (i+1) >= len(array):
856                break
857
858    def readNextEndLine(self, stream):
859        line = ""
860        while True:
861            x = stream.read(1)
862            stream.seek(-2, 1)
863            if x == '\n' or x == '\r':
864                while x == '\n' or x == '\r':
865                    x = stream.read(1)
866                    stream.seek(-2, 1)
867                stream.seek(1, 1)
868                break
869            else:
870                line = x + line
871        return line
872
873    ##
874    # When using an encrypted / secured PDF file with the PDF Standard
875    # encryption handler, this function will allow the file to be decrypted.
876    # It checks the given password against the document's user password and
877    # owner password, and then stores the resulting decryption key if either
878    # password is correct.
879    # <p>
880    # It does not matter which password was matched.  Both passwords provide
881    # the correct decryption key that will allow the document to be used with
882    # this library.
883    # <p>
884    # Stability: Added in v1.8, will exist for all future v1.x releases.
885    #
886    # @return 0 if the password failed, 1 if the password matched the user
887    # password, and 2 if the password matched the owner password.
888    #
889    # @exception NotImplementedError Document uses an unsupported encryption
890    # method.
891    def decrypt(self, password):
892        self._override_encryption = True
893        try:
894            return self._decrypt(password)
895        finally:
896            self._override_encryption = False
897
898    def _decrypt(self, password):
899        encrypt = self.trailer['/Encrypt'].getObject()
900        if encrypt['/Filter'] != '/Standard':
901            raise NotImplementedError, "only Standard PDF encryption handler is available"
902        if not (encrypt['/V'] in (1, 2)):
903            raise NotImplementedError, "only algorithm code 1 and 2 are supported"
904        user_password, key = self._authenticateUserPassword(password)
905        if user_password:
906            self._decryption_key = key
907            return 1
908        else:
909            rev = encrypt['/R'].getObject()
910            if rev == 2:
911                keylen = 5
912            else:
913                keylen = encrypt['/Length'].getObject() / 8
914            key = _alg33_1(password, rev, keylen)
915            real_O = encrypt["/O"].getObject()
916            if rev == 2:
917                userpass = utils.RC4_encrypt(key, real_O)
918            else:
919                val = real_O
920                for i in range(19, -1, -1):
921                    new_key = ''
922                    for l in range(len(key)):
923                        new_key += chr(ord(key[l]) ^ i)
924                    val = utils.RC4_encrypt(new_key, val)
925                userpass = val
926            owner_password, key = self._authenticateUserPassword(userpass)
927            if owner_password:
928                self._decryption_key = key
929                return 2
930        return 0
931
932    def _authenticateUserPassword(self, password):
933        encrypt = self.trailer['/Encrypt'].getObject()
934        rev = encrypt['/R'].getObject()
935        owner_entry = encrypt['/O'].getObject().original_bytes
936        p_entry = encrypt['/P'].getObject()
937        id_entry = self.trailer['/ID'].getObject()
938        id1_entry = id_entry[0].getObject()
939        if rev == 2:
940            U, key = _alg34(password, owner_entry, p_entry, id1_entry)
941        elif rev >= 3:
942            U, key = _alg35(password, rev,
943                    encrypt["/Length"].getObject() / 8, owner_entry,
944                    p_entry, id1_entry,
945                    encrypt.get("/EncryptMetadata", BooleanObject(False)).getObject())
946        real_U = encrypt['/U'].getObject().original_bytes
947        return U == real_U, key
948
949    def getIsEncrypted(self):
950        return self.trailer.has_key("/Encrypt")
951
952    ##
953    # Read-only boolean property showing whether this PDF file is encrypted.
954    # Note that this property, if true, will remain true even after the {@link
955    # #PdfFileReader.decrypt decrypt} function is called.
956    isEncrypted = property(lambda self: self.getIsEncrypted(), None, None)
957
958
##
# Returns the rectangle stored under a key of a dictionary-like object.
# <p>
# If the entry is already a RectangleObject it is returned unchanged.
# Otherwise the entry, or when it is missing the first fallback entry
# that is present, is resolved if it is an indirect reference, converted
# to a RectangleObject, stored back under name and returned.
#
# @param self     Dictionary-like object holding the entries; its pdf
#                 attribute is used to resolve indirect references.
# @param name     Key of the wanted rectangle, e.g. "/CropBox".
# @param defaults Sequence of keys tried in order when name is missing.
# @return a RectangleObject
def getRectangle(self, name, defaults):
    retval = self.get(name)
    if isinstance(retval, RectangleObject):
        return retval
    # Identity tests: None is a singleton and "==" could be redefined by
    # the stored objects.
    if retval is None:
        for d in defaults:
            retval = self.get(d)
            if retval is not None:
                break
    if isinstance(retval, IndirectObject):
        retval = self.pdf.getObject(retval)
    retval = RectangleObject(retval)
    # Store the converted value so later reads take the fast path above.
    setRectangle(self, name, retval)
    return retval
973
##
# Stores a value under a key of a dictionary-like object, wrapping the
# key in a NameObject when it is not one already.
#
# @param self  Dictionary-like object receiving the entry.
# @param name  Key to store under (NameObject or plain string).
# @param value Value to store.
def setRectangle(self, name, value):
    if isinstance(name, NameObject):
        key = name
    else:
        key = NameObject(name)
    self[key] = value
978
##
# Removes the entry stored under the given key.
#
# @param self Dictionary-like object holding the entry.
# @param name Key of the entry to remove, e.g. "/CropBox".
def deleteRectangle(self, name):
    # Plain item deletion: whatever self raises for a missing key is passed
    # on to the caller.
    del self[name]
981
##
# Builds a property giving access to one rectangle entry of a page.
# Reading goes through getRectangle (with the given fallback keys),
# assignment through setRectangle and deletion through deleteRectangle.
#
# @param name     Key of the rectangle entry, e.g. "/CropBox".
# @param fallback Sequence of keys consulted when name is not present.
# @return a property object
def createRectangleAccessor(name, fallback):
    def _read(self):
        return getRectangle(self, name, fallback)

    def _write(self, value):
        return setRectangle(self, name, value)

    def _remove(self):
        return deleteRectangle(self, name)

    return property(_read, _write, _remove)
989
990##
991# This class represents a single page within a PDF file.  Typically this object
992# will be created by accessing the {@link #PdfFileReader.getPage getPage}
993# function of the {@link #PdfFileReader PdfFileReader} class, but it is
994# also possible to create an empty page with the createBlankPage static
995# method.
996# @param pdf PDF file the page belongs to (optional, defaults to None).
class PageObject(DictionaryObject):
    ##
    # Creates an empty page dictionary.
    #
    # @param pdf         PDF file the page belongs to (optional).
    # @param indirectRef Original indirect reference to this page in its
    #                    source PDF (optional).
    def __init__(self, pdf=None, indirectRef=None):
        DictionaryObject.__init__(self)
        self.pdf = pdf
        # Stores the original indirect reference to this object in its source PDF
        self.indirectRef = indirectRef

    ##
    # Returns a new blank page.
    # If width or height is None, try to get the page size from the
    # last page of pdf. If pdf is None or contains no page, a
    # PageSizeNotDefinedError is raised.
    # @param pdf    PDF file the page belongs to
    # @param width  The width of the new page expressed in default user
    #               space units.
    # @param height The height of the new page expressed in default user
    #               space units.
    # @exception utils.PageSizeNotDefinedError No size was given and none
    #               could be taken from pdf.
    def createBlankPage(pdf=None, width=None, height=None):
        page = PageObject(pdf)

        # Creates a new page (cf PDF Reference  7.7.3.3)
        page.__setitem__(NameObject('/Type'), NameObject('/Page'))
        page.__setitem__(NameObject('/Parent'), NullObject())
        page.__setitem__(NameObject('/Resources'), DictionaryObject())
        if width is None or height is None:
            if pdf is not None and pdf.getNumPages() > 0:
                lastpage = pdf.getPage(pdf.getNumPages() - 1)
                width = lastpage.mediaBox.getWidth()
                height = lastpage.mediaBox.getHeight()
            else:
                raise utils.PageSizeNotDefinedError()
        page.__setitem__(NameObject('/MediaBox'),
            RectangleObject([0, 0, width, height]))

        return page
    createBlankPage = staticmethod(createBlankPage)

    ##
    # Rotates a page clockwise by increments of 90 degrees.
    # <p>
    # Stability: Added in v1.1, will exist for all future v1.x releases.
    # @param angle Angle to rotate the page.  Must be an increment of 90 deg.
    # @return this page, so that calls can be chained
    def rotateClockwise(self, angle):
        assert angle % 90 == 0
        self._rotate(angle)
        return self

    ##
    # Rotates a page counter-clockwise by increments of 90 degrees.
    # <p>
    # Stability: Added in v1.1, will exist for all future v1.x releases.
    # @param angle Angle to rotate the page.  Must be an increment of 90 deg.
    # @return this page, so that calls can be chained
    def rotateCounterClockwise(self, angle):
        assert angle % 90 == 0
        self._rotate(-angle)
        return self

    ##
    # Adds angle (in degrees, may be negative) to the current /Rotate
    # value of the page; a missing /Rotate entry counts as 0.
    def _rotate(self, angle):
        currentAngle = self.get("/Rotate", 0)
        self[NameObject("/Rotate")] = NumberObject(currentAngle + angle)

    ##
    # Merges one category of two resource dictionaries.
    #
    # @param res1     Resource dictionary of the first page.
    # @param res2     Resource dictionary of the second page.
    # @param resource Category to merge, e.g. "/Font".
    # @return a tuple (merged, renamed): merged is the combined dictionary
    #         for the category; renamed maps every name of res2 that
    #         clashed with a different object of res1 to the new name it
    #         was stored under.
    def _mergeResources(res1, res2, resource):
        newRes = DictionaryObject()
        newRes.update(res1.get(resource, DictionaryObject()).getObject())
        page2Res = res2.get(resource, DictionaryObject()).getObject()
        renameRes = {}
        for key in page2Res.keys():
            if key in newRes and newRes[key] != page2Res[key]:
                # same name, different object: keep both under two names
                newname = NameObject(key + "renamed")
                renameRes[key] = newname
                newRes[newname] = page2Res[key]
            elif key not in newRes:
                newRes[key] = page2Res.raw_get(key)
        return newRes, renameRes
    _mergeResources = staticmethod(_mergeResources)

    ##
    # Applies a rename map to the name operands of a content stream.
    #
    # @param stream Content to process.
    # @param rename Dictionary mapping old names to new names.
    # @param pdf    PDF file used to build the ContentStream.
    # @return stream itself when rename is empty, otherwise a ContentStream
    #         whose name operands found in rename have been replaced.
    def _contentStreamRename(stream, rename, pdf):
        if not rename:
            return stream
        stream = ContentStream(stream, pdf)
        for operands, operator in stream.operations:
            for i, op in enumerate(operands):
                if isinstance(op, NameObject):
                    operands[i] = rename.get(op, op)
        return stream
    _contentStreamRename = staticmethod(_contentStreamRename)

    def _pushPopGS(contents, pdf):
        # adds a graphics state "push" and "pop" to the beginning and end
        # of a content stream.  This isolates it from changes such as
        # transformation matrices.
        stream = ContentStream(contents, pdf)
        stream.operations.insert(0, [[], "q"])
        stream.operations.append([[], "Q"])
        return stream
    _pushPopGS = staticmethod(_pushPopGS)

    def _addTransformationMatrix(contents, pdf, ctm):
        # adds transformation matrix at the beginning of the given
        # contents stream.
        a, b, c, d, e, f = ctm
        contents = ContentStream(contents, pdf)
        contents.operations.insert(0, [[FloatObject(a), FloatObject(b),
            FloatObject(c), FloatObject(d), FloatObject(e),
            FloatObject(f)], " cm"])
        return contents
    _addTransformationMatrix = staticmethod(_addTransformationMatrix)

    ##
    # Returns the /Contents object, or None if it doesn't exist.
    # /Contents is optional, as described in PDF Reference  7.7.3.3
    def getContents(self):
        if "/Contents" in self:
            return self["/Contents"].getObject()
        else:
            return None

    ##
    # Merges the content streams of two pages into one.  Resource references
    # (i.e. fonts) are maintained from both pages.  The mediabox/cropbox/etc
    # of this page are not altered.  The parameter page's content stream will
    # be added to the end of this page's content stream, meaning that it will
    # be drawn after, or "on top" of this page.
    # <p>
    # Stability: Added in v1.4, will exist for all future 1.x releases.
    # @param page2 An instance of {@link #PageObject PageObject} to be merged
    #              into this one.
    def mergePage(self, page2):
        self._mergePage(page2)

    ##
    # Actually merges the content streams of two pages into one. Resource
    # references (i.e. fonts) are maintained from both pages. The
    # mediabox/cropbox/etc of this page are not altered. The parameter page's
    # content stream will be added to the end of this page's content stream,
    # meaning that it will be drawn after, or "on top" of this page.
    #
    # @param page2 An instance of {@link #PageObject PageObject} to be merged
    #              into this one.
    # @param page2transformation A function which applies a transformation to
    #                            the content stream of page2. Takes: page2
    #                            contents stream. Must return: new contents
    #                            stream. If omitted, the content stream will
    #                            not be modified.
    def _mergePage(self, page2, page2transformation=None):
        # First we work on merging the resource dictionaries.  This allows us
        # to find out what symbols in the content streams we might need to
        # rename.

        newResources = DictionaryObject()
        rename = {}
        originalResources = self["/Resources"].getObject()
        page2Resources = page2["/Resources"].getObject()

        for res in "/ExtGState", "/Font", "/XObject", "/ColorSpace", "/Pattern", "/Shading", "/Properties":
            new, newrename = PageObject._mergeResources(originalResources, page2Resources, res)
            if new:
                newResources[NameObject(res)] = new
                rename.update(newrename)

        # Combine /ProcSet sets.
        newResources[NameObject("/ProcSet")] = ArrayObject(
            frozenset(originalResources.get("/ProcSet", ArrayObject()).getObject()).union(
                frozenset(page2Resources.get("/ProcSet", ArrayObject()).getObject())
            )
        )

        newContentArray = ArrayObject()

        originalContent = self.getContents()
        if originalContent is not None:
            newContentArray.append(PageObject._pushPopGS(
                  originalContent, self.pdf))

        page2Content = page2.getContents()
        if page2Content is not None:
            if page2transformation is not None:
                page2Content = page2transformation(page2Content)
            page2Content = PageObject._contentStreamRename(
                page2Content, rename, self.pdf)
            page2Content = PageObject._pushPopGS(page2Content, self.pdf)
            newContentArray.append(page2Content)

        self[NameObject('/Contents')] = ContentStream(newContentArray, self.pdf)
        self[NameObject('/Resources')] = newResources

    ##
    # This is similar to mergePage, but a transformation matrix is
    # applied to the merged stream.
    #
    # @param page2 An instance of {@link #PageObject PageObject} to be merged.
    # @param ctm   A 6 elements tuple containing the operands of the
    #              transformation matrix
    def mergeTransformedPage(self, page2, ctm):
        self._mergePage(page2, lambda page2Content:
            PageObject._addTransformationMatrix(page2Content, page2.pdf, ctm))

    ##
    # This is similar to mergePage, but the stream to be merged is scaled
    # by applying a transformation matrix.
    #
    # @param page2 An instance of {@link #PageObject PageObject} to be merged.
    # @param factor The scaling factor
    def mergeScaledPage(self, page2, factor):
        # CTM to scale : [ sx 0 0 sy 0 0 ]
        return self.mergeTransformedPage(page2, [factor, 0,
                                                 0,      factor,
                                                 0,      0])

    ##
    # This is similar to mergePage, but the stream to be merged is rotated
    # by applying a transformation matrix.
    #
    # @param page2 An instance of {@link #PageObject PageObject} to be merged.
    # @param rotation The angle of the rotation, in degrees
    def mergeRotatedPage(self, page2, rotation):
        rotation = math.radians(rotation)
        return self.mergeTransformedPage(page2,
            [math.cos(rotation),  math.sin(rotation),
             -math.sin(rotation), math.cos(rotation),
             0,                   0])

    ##
    # This is similar to mergePage, but the stream to be merged is translated
    # by applying a transformation matrix.
    #
    # @param page2 An instance of {@link #PageObject PageObject} to be merged.
    # @param tx    The translation on X axis
    # @param ty    The translation on Y axis
    def mergeTranslatedPage(self, page2, tx, ty):
        return self.mergeTransformedPage(page2, [1,  0,
                                                 0,  1,
                                                 tx, ty])

    ##
    # This is similar to mergePage, but the stream to be merged is rotated
    # and scaled by applying a transformation matrix.
    #
    # @param page2 An instance of {@link #PageObject PageObject} to be merged.
    # @param rotation The angle of the rotation, in degrees
    # @param scale The scaling factor
    def mergeRotatedScaledPage(self, page2, rotation, scale):
        rotation = math.radians(rotation)
        rotating = [[math.cos(rotation), math.sin(rotation),0],
                    [-math.sin(rotation),math.cos(rotation), 0],
                    [0,                  0,                  1]]
        scaling = [[scale,0,    0],
                   [0,    scale,0],
                   [0,    0,    1]]
        ctm = utils.matrixMultiply(rotating, scaling)

        return self.mergeTransformedPage(page2,
                                         [ctm[0][0], ctm[0][1],
                                          ctm[1][0], ctm[1][1],
                                          ctm[2][0], ctm[2][1]])

    ##
    # This is similar to mergePage, but the stream to be merged is translated
    # and scaled by applying a transformation matrix.
    #
    # @param page2 An instance of {@link #PageObject PageObject} to be merged.
    # @param scale The scaling factor
    # @param tx    The translation on X axis
    # @param ty    The translation on Y axis
    def mergeScaledTranslatedPage(self, page2, scale, tx, ty):
        translation = [[1, 0, 0],
                       [0, 1, 0],
                       [tx,ty,1]]
        scaling = [[scale,0,    0],
                   [0,    scale,0],
                   [0,    0,    1]]
        ctm = utils.matrixMultiply(scaling, translation)

        return self.mergeTransformedPage(page2, [ctm[0][0], ctm[0][1],
                                                 ctm[1][0], ctm[1][1],
                                                 ctm[2][0], ctm[2][1]])

    ##
    # This is similar to mergePage, but the stream to be merged is translated,
    # rotated and scaled by applying a transformation matrix.
    #
    # @param page2 An instance of {@link #PageObject PageObject} to be merged.
    # @param tx    The translation on X axis
    # @param ty    The translation on Y axis
    # @param rotation The angle of the rotation, in degrees
    # @param scale The scaling factor
    def mergeRotatedScaledTranslatedPage(self, page2, rotation, scale, tx, ty):
        translation = [[1, 0, 0],
                       [0, 1, 0],
                       [tx,ty,1]]
        rotation = math.radians(rotation)
        rotating = [[math.cos(rotation), math.sin(rotation),0],
                    [-math.sin(rotation),math.cos(rotation), 0],
                    [0,                  0,                  1]]
        scaling = [[scale,0,    0],
                   [0,    scale,0],
                   [0,    0,    1]]
        ctm = utils.matrixMultiply(rotating, scaling)
        ctm = utils.matrixMultiply(ctm, translation)

        return self.mergeTransformedPage(page2, [ctm[0][0], ctm[0][1],
                                                 ctm[1][0], ctm[1][1],
                                                 ctm[2][0], ctm[2][1]])

    ##
    # Applies a transformation matrix to the page.  A page without
    # /Contents is left unchanged.
    #
    # @param ctm   A 6 elements tuple containing the operands of the
    #              transformation matrix
    def addTransformation(self, ctm):
        originalContent = self.getContents()
        if originalContent is not None:
            newContent = PageObject._addTransformationMatrix(
                originalContent, self.pdf, ctm)
            newContent = PageObject._pushPopGS(newContent, self.pdf)
            self[NameObject('/Contents')] = newContent

    ##
    # Scales a page by the given factors by applying a transformation
    # matrix to its content and updating the page size.
    #
    # @param sx The scaling factor on horizontal axis
    # @param sy The scaling factor on vertical axis
    def scale(self, sx, sy):
        self.addTransformation([sx, 0,
                                0,  sy,
                                0,  0])
        self.mediaBox = RectangleObject([
            float(self.mediaBox.getLowerLeft_x()) * sx,
            float(self.mediaBox.getLowerLeft_y()) * sy,
            float(self.mediaBox.getUpperRight_x()) * sx,
            float(self.mediaBox.getUpperRight_y()) * sy])

    ##
    # Scales a page by the given factor by applying a transformation
    # matrix to its content and updating the page size.
    #
    # @param factor The scaling factor
    def scaleBy(self, factor):
        self.scale(factor, factor)

    ##
    # Scales a page to the specified dimensions by applying a
    # transformation matrix to its content and updating the page size.
    #
    # @param width The new width
    # @param height The new height
    def scaleTo(self, width, height):
        box = self.mediaBox
        # The current height is measured between the two y coordinates
        # (the lower-left x coordinate was used here by mistake before).
        # Converting to float avoids truncating integer division and
        # mixing number types.
        sx = float(width) / float(box.getUpperRight_x() -
                                  box.getLowerLeft_x())
        sy = float(height) / float(box.getUpperRight_y() -
                                   box.getLowerLeft_y())
        self.scale(sx, sy)

    ##
    # Compresses the size of this page by joining all content streams and
    # applying a FlateDecode filter.
    # <p>
    # Stability: Added in v1.6, will exist for all future v1.x releases.
    # However, it is possible that this function will perform no action if
    # content stream compression becomes "automatic" for some reason.
    def compressContentStreams(self):
        content = self.getContents()
        if content is not None:
            if not isinstance(content, ContentStream):
                content = ContentStream(content, self.pdf)
            self[NameObject("/Contents")] = content.flateEncode()

    ##
    # Locate all text drawing commands, in the order they are provided in the
    # content stream, and extract the text.  This works well for some PDF
    # files, but poorly for others, depending on the generator used.  This will
    # be refined in the future.  Do not rely on the order of text coming out of
    # this function, as it will change if this function is made more
    # sophisticated.
    # <p>
    # Stability: Added in v1.7, will exist for all future v1.x releases.  May
    # be overhauled to provide more ordered text in the future.
    # @return a unicode string object (empty for a page without /Contents)
    def extractText(self):
        text = u""
        content = self.getContents()
        if content is None:
            # /Contents is optional: such a page simply has no text.
            return text
        if not isinstance(content, ContentStream):
            content = ContentStream(content, self.pdf)
        # Note: we check all strings are TextStringObjects.  ByteStringObjects
        # are strings where the byte->string encoding was unknown, so adding
        # them to the text here would be gibberish.
        for operands, operator in content.operations:
            if operator == "Tj":
                _text = operands[0]
                if isinstance(_text, TextStringObject):
                    text += _text
            elif operator == "T*":
                text += "\n"
            elif operator == "'":
                text += "\n"
                _text = operands[0]
                if isinstance(_text, TextStringObject):
                    text += _text
            elif operator == '"':
                _text = operands[2]
                if isinstance(_text, TextStringObject):
                    text += "\n"
                    text += _text
            elif operator == "TJ":
                for i in operands[0]:
                    if isinstance(i, TextStringObject):
                        text += i
        return text

    ##
    # A rectangle (RectangleObject), expressed in default user space units,
    # defining the boundaries of the physical medium on which the page is
    # intended to be displayed or printed.
    # <p>
    # Stability: Added in v1.4, will exist for all future v1.x releases.
    mediaBox = createRectangleAccessor("/MediaBox", ())

    ##
    # A rectangle (RectangleObject), expressed in default user space units,
    # defining the visible region of default user space.  When the page is
    # displayed or printed, its contents are to be clipped (cropped) to this
    # rectangle and then imposed on the output medium in some
    # implementation-defined manner.  Default value: same as MediaBox.
    # <p>
    # Stability: Added in v1.4, will exist for all future v1.x releases.
    cropBox = createRectangleAccessor("/CropBox", ("/MediaBox",))

    ##
    # A rectangle (RectangleObject), expressed in default user space units,
    # defining the region to which the contents of the page should be clipped
    # when output in a production environment.
    # <p>
    # Stability: Added in v1.4, will exist for all future v1.x releases.
    bleedBox = createRectangleAccessor("/BleedBox", ("/CropBox", "/MediaBox"))

    ##
    # A rectangle (RectangleObject), expressed in default user space units,
    # defining the intended dimensions of the finished page after trimming.
    # <p>
    # Stability: Added in v1.4, will exist for all future v1.x releases.
    trimBox = createRectangleAccessor("/TrimBox", ("/CropBox", "/MediaBox"))

    ##
    # A rectangle (RectangleObject), expressed in default user space units,
    # defining the extent of the page's meaningful content as intended by the
    # page's creator.
    # <p>
    # Stability: Added in v1.4, will exist for all future v1.x releases.
    artBox = createRectangleAccessor("/ArtBox", ("/CropBox", "/MediaBox"))
1448
1449
class ContentStream(DecodedStreamObject):
    # A content stream parsed into self.operations, a list of
    # (operands, operator) tuples in stream order.  An inline image is kept
    # as ({"settings": ..., "data": ...}, "INLINE IMAGE").  Reading the _data
    # property serializes the operations back into content stream text.

    def __init__(self, stream, pdf):
        self.pdf = pdf
        self.operations = []
        # stream may be a StreamObject or an ArrayObject containing
        # multiple StreamObjects to be cat'd together.
        stream = stream.getObject()
        if isinstance(stream, ArrayObject):
            # Separate the pieces with a newline: the split between streams
            # falls on a token boundary, but nothing guarantees whitespace
            # there, and "... Q" followed by "q ..." must not be read as the
            # single token "Qq".
            data = "\n".join([s.getObject().getData() for s in stream])
        else:
            data = stream.getData()
        self.__parseContentStream(StringIO(data))

    def __parseContentStream(self, stream):
        # Tokenize stream from its start.  Operands are collected until an
        # operator is met, then appended to self.operations as one
        # (operands, operator) entry.  Existing entries are kept.
        stream.seek(0, 0)
        operands = []
        while True:
            peek = readNonWhitespace(stream)
            if peek == '':
                break
            stream.seek(-1, 1)
            if peek.isalpha() or peek == "'" or peek == '"':
                operator = ""
                while True:
                    tok = stream.read(1)
                    if tok == '':
                        # end of stream also ends the operator; checked first
                        # so that nothing is pushed back at EOF.
                        break
                    if tok.isspace() or tok in NameObject.delimiterCharacters:
                        stream.seek(-1, 1)
                        break
                    operator += tok
                if operator == "BI":
                    # begin inline image - a completely different parsing
                    # mechanism is required, of course... thanks buddy...
                    assert operands == []
                    ii = self._readInlineImage(stream)
                    self.operations.append((ii, "INLINE IMAGE"))
                else:
                    self.operations.append((operands, operator))
                    operands = []
            elif peek == '%':
                # If we encounter a comment in the content stream, we have to
                # handle it here.  Typically, readObject will handle
                # encountering a comment -- but readObject assumes that
                # following the comment must be the object we're trying to
                # read.  In this case, it could be an operator instead.
                # The '' entry stops the scan when the comment runs up to the
                # end of the stream without a closing newline.
                while peek not in ('\r', '\n', ''):
                    peek = stream.read(1)
            else:
                operands.append(readObject(stream, None))

    def _readInlineImage(self, stream):
        # Read an inline image whose "BI" operator has just been consumed.
        # Returns {"settings": DictionaryObject, "data": raw image data}.
        # Raises utils.PdfReadError when the stream ends before the image
        # is complete.
        #
        # first read the dictionary of settings.
        settings = DictionaryObject()
        while True:
            tok = readNonWhitespace(stream)
            if tok == '':
                raise utils.PdfReadError(
                    "Unexpected end of stream in inline image settings")
            stream.seek(-1, 1)
            if tok == "I":
                # "ID" - begin of image data
                break
            key = readObject(stream, self.pdf)
            if readNonWhitespace(stream) == '':
                raise utils.PdfReadError(
                    "Unexpected end of stream in inline image settings")
            stream.seek(-1, 1)
            value = readObject(stream, self.pdf)
            settings[key] = value
        # left at beginning of ID; the third character read is the single
        # separator that follows it.
        tmp = stream.read(3)
        assert tmp[:2] == "ID"
        # NOTE(review): the data ends at the first "EI" byte pair, so image
        # data that happens to contain "EI" is cut short.
        chunks = []
        while True:
            tok = stream.read(1)
            if tok == '':
                raise utils.PdfReadError(
                    "Unexpected end of stream in inline image data")
            if tok == "E":
                following = stream.read(1)
                if following == "I":
                    break
                if following != '':
                    # not the end marker: re-read this character next round
                    stream.seek(-1, 1)
            chunks.append(tok)
        # Skip whitespace after "EI".  Only step back when a character was
        # actually read, otherwise the "I" of "EI" would be parsed again as
        # an operator when the image is the last thing in the stream.
        if readNonWhitespace(stream) != '':
            stream.seek(-1, 1)
        return {"settings": settings, "data": "".join(chunks)}

    def _getData(self):
        # Serialize self.operations, one operation per line.
        newdata = StringIO()
        for operands, operator in self.operations:
            if operator == "INLINE IMAGE":
                newdata.write("BI")
                dicttext = StringIO()
                operands["settings"].writeToStream(dicttext, None)
                # drop the first and last two characters of the written
                # dictionary (its opening and closing markers)
                newdata.write(dicttext.getvalue()[2:-2])
                newdata.write("ID ")
                newdata.write(operands["data"])
                newdata.write("EI")
            else:
                for op in operands:
                    op.writeToStream(newdata, None)
                    newdata.write(" ")
                newdata.write(operator)
            newdata.write("\n")
        return newdata.getvalue()

    def _setData(self, value):
        # Parse value as content stream text.  The parsed operations are
        # appended to self.operations; existing entries are not cleared.
        self.__parseContentStream(StringIO(value))

    _data = property(_getData, _setData)
1562
1563
1564##
1565# A class representing the basic document metadata provided in a PDF File.
1566# <p>
1567# As of pyPdf v1.10, all text properties of the document metadata have two
1568# properties, eg. author and author_raw.  The non-raw property will always
1569# return a TextStringObject, making it ideal for a case where the metadata is
1570# being displayed.  The raw property can sometimes return a ByteStringObject,
1571# if pyPdf was unable to decode the string's text encoding; this requires
1572# additional safety in the caller and therefore is not as commonly accessed.
1573class DocumentInformation(DictionaryObject):
1574    def __init__(self):
1575        DictionaryObject.__init__(self)
1576
1577    def getText(self, key):
1578        retval = self.get(key, None)
1579        if isinstance(retval, TextStringObject):
1580            return retval
1581        return None
1582
1583    ##
1584    # Read-only property accessing the document's title.  Added in v1.6, will
1585    # exist for all future v1.x releases.  Modified in v1.10 to always return a
1586    # unicode string (TextStringObject).
1587    # @return A unicode string, or None if the title is not provided.
1588    title = property(lambda self: self.getText("/Title"))
1589    title_raw = property(lambda self: self.get("/Title"))
1590
1591    ##
1592    # Read-only property accessing the document's author.  Added in v1.6, will
1593    # exist for all future v1.x releases.  Modified in v1.10 to always return a
1594    # unicode string (TextStringObject).
1595    # @return A unicode string, or None if the author is not provided.
1596    author = property(lambda self: self.getText("/Author"))
1597    author_raw = property(lambda self: self.get("/Author"))
1598
1599    ##
1600    # Read-only property accessing the subject of the document.  Added in v1.6,
1601    # will exist for all future v1.x releases.  Modified in v1.10 to always
1602    # return a unicode string (TextStringObject).
1603    # @return A unicode string, or None if the subject is not provided.
1604    subject = property(lambda self: self.getText("/Subject"))
1605    subject_raw = property(lambda self: self.get("/Subject"))
1606
1607    ##
1608    # Read-only property accessing the document's creator.  If the document was
1609    # converted to PDF from another format, the name of the application (for
1610    # example, OpenOffice) that created the original document from which it was
1611    # converted.  Added in v1.6, will exist for all future v1.x releases.
1612    # Modified in v1.10 to always return a unicode string (TextStringObject).
1613    # @return A unicode string, or None if the creator is not provided.
1614    creator = property(lambda self: self.getText("/Creator"))
1615    creator_raw = property(lambda self: self.get("/Creator"))
1616
1617    ##
1618    # Read-only property accessing the document's producer.  If the document
1619    # was converted to PDF from another format, the name of the application
1620    # (for example, OSX Quartz) that converted it to PDF.  Added in v1.6, will
1621    # exist for all future v1.x releases.  Modified in v1.10 to always return a
1622    # unicode string (TextStringObject).
1623    # @return A unicode string, or None if the producer is not provided.
1624    producer = property(lambda self: self.getText("/Producer"))
1625    producer_raw = property(lambda self: self.get("/Producer"))
1626
1627
1628##
1629# A class representing a destination within a PDF file.
1630# See section 8.2.1 of the PDF 1.6 reference.
1631# Stability: Added in v1.10, will exist for all v1.x releases.
1632class Destination(DictionaryObject):
1633    def __init__(self, title, page, typ, *args):
1634        DictionaryObject.__init__(self)
1635        self[NameObject("/Title")] = title
1636        self[NameObject("/Page")] = page
1637        self[NameObject("/Type")] = typ
1638
1639        # from table 8.2 of the PDF 1.6 reference.
1640        if typ == "/XYZ":
1641            (self[NameObject("/Left")], self[NameObject("/Top")],
1642                self[NameObject("/Zoom")]) = args
1643        elif typ == "/FitR":
1644            (self[NameObject("/Left")], self[NameObject("/Bottom")],
1645                self[NameObject("/Right")], self[NameObject("/Top")]) = args
1646        elif typ in ["/FitH", "FitBH"]:
1647            self[NameObject("/Top")], = args
1648        elif typ in ["/FitV", "FitBV"]:
1649            self[NameObject("/Left")], = args
1650        elif typ in ["/Fit", "FitB"]:
1651            pass
1652        else:
1653            raise utils.PdfReadError("Unknown Destination Type: %r" % typ)
1654
1655    ##
1656    # Read-only property accessing the destination title.
1657    # @return A string.
1658    title = property(lambda self: self.get("/Title"))
1659
1660    ##
1661    # Read-only property accessing the destination page.
1662    # @return An integer.
1663    page = property(lambda self: self.get("/Page"))
1664
1665    ##
1666    # Read-only property accessing the destination type.
1667    # @return A string.
1668    typ = property(lambda self: self.get("/Type"))
1669
1670    ##
1671    # Read-only property accessing the zoom factor.
1672    # @return A number, or None if not available.
1673    zoom = property(lambda self: self.get("/Zoom", None))
1674
1675    ##
1676    # Read-only property accessing the left horizontal coordinate.
1677    # @return A number, or None if not available.
1678    left = property(lambda self: self.get("/Left", None))
1679
1680    ##
1681    # Read-only property accessing the right horizontal coordinate.
1682    # @return A number, or None if not available.
1683    right = property(lambda self: self.get("/Right", None))
1684
1685    ##
1686    # Read-only property accessing the top vertical coordinate.
1687    # @return A number, or None if not available.
1688    top = property(lambda self: self.get("/Top", None))
1689
1690    ##
1691    # Read-only property accessing the bottom vertical coordinate.
1692    # @return A number, or None if not available.
1693    bottom = property(lambda self: self.get("/Bottom", None))
1694
def convertToInt(d, size):
    # Interpret the byte string d as a big-endian integer: d is left-padded
    # with eight NUL bytes and the last eight bytes of the result are
    # unpacked as one signed 64-bit value.  size is only range-checked; a
    # size above 8 raises utils.PdfReadError.
    if size > 8:
        raise utils.PdfReadError("invalid size in convertToInt")
    padded = "\x00" * 8 + d
    (value,) = struct.unpack(">q", padded[-8:])
    return value
1701
# ref: pdf1.8 spec section 3.5.2 algorithm 3.2
# The 32-byte string used to pad (or stand in for) passwords, written here
# as four rows of eight bytes.
_encryption_padding = (
    '\x28\xbf\x4e\x5e\x4e\x75\x8a\x41'
    '\x64\x00\x4e\x56\xff\xfa\x01\x08'
    '\x2e\x2e\x00\xb6\xd0\x68\x3e\x80'
    '\x2f\x0c\xa9\xfe\x64\x53\x69\x7a'
)
1706
# Implementation of algorithm 3.2 of the PDF standard security handler,
# section 3.5.2 of the PDF 1.6 reference.
# Computes the encryption key from the user password and the /O, /P and
# first /ID values of the document.  Returns the first keylen bytes of the
# final MD5 digest.  p_entry may be given either as the usual negative
# (signed) number or as the equivalent unsigned 32-bit value.
def _alg32(password, rev, keylen, owner_entry, p_entry, id1_entry, metadata_encrypt=True):
    import struct
    # 1. Pad or truncate the password string to exactly 32 bytes.  If the
    # password string is more than 32 bytes long, use only its first 32 bytes;
    # if it is less than 32 bytes long, pad it by appending the required number
    # of additional bytes from the beginning of the padding string
    # (_encryption_padding).
    password = (password + _encryption_padding)[:32]
    # 2. Initialize the MD5 hash function and pass the result of step 1 as
    # input to this function.
    m = md5(password)
    # 3. Pass the value of the encryption dictionary's /O entry to the MD5 hash
    # function.
    m.update(owner_entry)
    # 4. Treat the value of the /P entry as an unsigned 4-byte integer and pass
    # these bytes to the MD5 hash function, low-order byte first.
    # Masking to 32 bits yields the same four bytes as a signed pack for the
    # usual negative values, and also accepts a /P that was written as an
    # unsigned number (>= 2**31), which a signed pack rejects.
    p_entry = struct.pack('<I', p_entry & 0xffffffff)
    m.update(p_entry)
    # 5. Pass the first element of the file's file identifier array to the MD5
    # hash function.
    m.update(id1_entry)
    # 6. (Revision 3 or greater) If document metadata is not being encrypted,
    # pass 4 bytes with the value 0xFFFFFFFF to the MD5 hash function.
    if rev >= 3 and not metadata_encrypt:
        m.update("\xff\xff\xff\xff")
    # 7. Finish the hash.
    md5_hash = m.digest()
    # 8. (Revision 3 or greater) Do the following 50 times: Take the output
    # from the previous MD5 hash and pass the first n bytes of the output as
    # input into a new MD5 hash, where n is the number of bytes of the
    # encryption key as defined by the value of the encryption dictionary's
    # /Length entry.
    if rev >= 3:
        for i in range(50):
            md5_hash = md5(md5_hash[:keylen]).digest()
    # 9. Set the encryption key to the first n bytes of the output from the
    # final MD5 hash, where n is always 5 for revision 2 but, for revision 3 or
    # greater, depends on the value of the encryption dictionary's /Length
    # entry.
    return md5_hash[:keylen]
1749
# Implementation of algorithm 3.3 of the PDF standard security handler,
# section 3.5.2 of the PDF 1.6 reference.
# Returns the value to store as the /O entry of the encryption dictionary.
def _alg33(owner_pwd, user_pwd, rev, keylen):
    # Steps 1 - 4: derive the RC4 key from the owner password.
    rc4_key = _alg33_1(owner_pwd, rev, keylen)
    # Step 5: pad or truncate the user password to 32 bytes, as in step 1 of
    # algorithm 3.2.
    padded_user_pwd = (user_pwd + _encryption_padding)[:32]
    # Step 6: RC4-encrypt the padded user password with the key of step 4.
    result = utils.RC4_encrypt(rc4_key, padded_user_pwd)
    # Step 7 (revision 3 or greater): 19 further RC4 passes over the previous
    # output.  The key for pass number n (1 to 19) is the step 4 key with
    # every byte XORed with n.
    if rev >= 3:
        for counter in range(1, 20):
            pass_key = ''.join([chr(ord(c) ^ counter) for c in rc4_key])
            result = utils.RC4_encrypt(pass_key, result)
    # Step 8: the output of the last RC4 pass is the /O value.
    return result
1776
# Steps 1-4 of algorithm 3.3
# Returns the RC4 key derived from password: the first keylen bytes of the
# final MD5 digest.
def _alg33_1(password, rev, keylen):
    # Step 1: pad or truncate the owner password to 32 bytes, as in step 1 of
    # algorithm 3.2.  (The algorithm says to use the user password when there
    # is no owner password; that substitution is not done in this function.)
    padded = (password + _encryption_padding)[:32]
    # Step 2: MD5 of the padded password.
    digest = md5(padded).digest()
    # Step 3 (revision 3 or greater): feed the digest back through MD5 50
    # times.
    if rev >= 3:
        for _ in range(50):
            digest = md5(digest).digest()
    # Step 4: the key is the first n bytes of the final digest, where n is
    # always 5 for revision 2 but, for revision 3 or greater, depends on the
    # encryption dictionary's /Length entry.
    return digest[:keylen]
1798
# Implementation of algorithm 3.4 of the PDF standard security handler,
# section 3.5.2 of the PDF 1.6 reference.
# Returns a (U, key) tuple: the value for the /U entry of the encryption
# dictionary and the encryption key it was computed with.
def _alg34(password, owner_entry, p_entry, id1_entry):
    # Step 1: encryption key from the user password via algorithm 3.2, called
    # here with revision 2 and a 5-byte key length.
    key = _alg32(password, 2, 5, owner_entry, p_entry, id1_entry)
    # Steps 2 and 3: RC4-encrypt the 32-byte padding string with that key;
    # the result is the /U value.
    return utils.RC4_encrypt(key, _encryption_padding), key
1812
# Implementation of algorithm 3.5 of the PDF standard security handler,
# section 3.5.2 of the PDF 1.6 reference.
# Returns a (U, key) tuple: the 32-byte value for the /U entry of the
# encryption dictionary and the encryption key it was computed with.
# NOTE(review): metadata_encrypt is accepted but not used -- _alg32 is called
# without it and therefore runs with its own default.  Callers may rely on
# this; confirm against them before forwarding the argument.
def _alg35(password, rev, keylen, owner_entry, p_entry, id1_entry, metadata_encrypt):
    # Step 1: encryption key from the user password via algorithm 3.2.
    key = _alg32(password, rev, keylen, owner_entry, p_entry, id1_entry)
    # Steps 2 and 3: MD5 over the 32-byte padding string followed by the
    # first element of the file identifier array (the ID entry of the
    # document's trailer dictionary).
    hasher = md5(_encryption_padding)
    hasher.update(id1_entry)
    # Step 4: RC4-encrypt the 16-byte digest with the key from step 1.
    result = utils.RC4_encrypt(key, hasher.digest())
    # Step 5: 19 further RC4 passes over the previous output.  The key for
    # pass number n (1 to 19) is the step 1 key with every byte XORed with n.
    for counter in range(1, 20):
        pass_key = ''.join([chr(ord(c) ^ counter) for c in key])
        result = utils.RC4_encrypt(pass_key, result)
    # Step 6: append 16 bytes of arbitrary padding to get the 32-byte /U
    # value.  (implementator note: I don't know what "arbitrary padding" is
    # supposed to mean, so I have used null bytes.  This seems to match a few
    # other people's implementations)
    return result + ('\x00' * 16), key
1850
1851#if __name__ == "__main__":
1852#    output = PdfFileWriter()
1853#
1854#    input1 = PdfFileReader(file("test\\5000-s1-05e.pdf", "rb"))
1855#    page1 = input1.getPage(0)
1856#
1857#    input2 = PdfFileReader(file("test\\PDFReference16.pdf", "rb"))
1858#    page2 = input2.getPage(0)
1859#    page3 = input2.getPage(1)
1860#    page1.mergePage(page2)
1861#    page1.mergePage(page3)
1862#
1863#    input3 = PdfFileReader(file("test\\cc-cc.pdf", "rb"))
1864#    page1.mergePage(input3.getPage(0))
1865#
1866#    page1.compressContentStreams()
1867#
1868#    output.addPage(page1)
1869#    output.write(file("test\\merge-test.pdf", "wb"))
1870
1871
1872