1# -*- coding: utf-8 -*-
2#
3# vim: sw=4:expandtab:foldmethod=marker
4#
5# Copyright (c) 2006, Mathieu Fenniak
6# Copyright (c) 2007, Ashish Kulkarni <kulkarni.ashish@gmail.com>
7#
8# All rights reserved.
9#
10# Redistribution and use in source and binary forms, with or without
11# modification, are permitted provided that the following conditions are
12# met:
13#
14# * Redistributions of source code must retain the above copyright notice,
15# this list of conditions and the following disclaimer.
16# * Redistributions in binary form must reproduce the above copyright notice,
17# this list of conditions and the following disclaimer in the documentation
18# and/or other materials provided with the distribution.
19# * The name of the author may not be used to endorse or promote products
20# derived from this software without specific prior written permission.
21#
22# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
23# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
24# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
25# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
26# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
27# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
28# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
29# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
30# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
31# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
32# POSSIBILITY OF SUCH DAMAGE.
33
34"""
35A pure-Python PDF library with an increasing number of capabilities.
36See README for links to FAQ, documentation, homepage, etc.
37"""
38
# Original author of the package.
__author__ = "Mathieu Fenniak"
__author_email__ = "biziqe@mathieu.fenniak.net"

# Current maintainer of the package.
__maintainer__ = "Phaseit, Inc."
# NOTE(review): unlike the three names above, this one has no trailing double
# underscore -- presumably a typo for ``__maintainer_email__``; confirm nothing
# reads the current name before renaming it.
__maintainer_email = "PyPDF2@phaseit.net"
44
45import string
46import math
47import struct
48import sys
49import uuid
50from sys import version_info
51if version_info < ( 3, 0 ):
52    from cStringIO import StringIO
53else:
54    from io import StringIO
55
56if version_info < ( 3, 0 ):
57    BytesIO = StringIO
58else:
59    from io import BytesIO
60
61from . import filters
62from . import utils
63import warnings
64import codecs
65from .generic import *
66from .utils import readNonWhitespace, readUntilWhitespace, ConvertFunctionsToVirtualList
67from .utils import isString, b_, u_, ord_, chr_, str_, formatWarning
68
69if version_info < ( 2, 4 ):
70   from sets import ImmutableSet as frozenset
71
72if version_info < ( 2, 5 ):
73    from md5 import md5
74else:
75    from hashlib import md5
76import uuid
77
78
79class PdfFileWriter(object):
80    """
81    This class supports writing PDF files out, given pages produced by another
82    class (typically :class:`PdfFileReader<PdfFileReader>`).
83    """
84    def __init__(self):
85        self._header = b_("%PDF-1.3")
86        self._objects = []  # array of indirect objects
87
88        # The root of our page tree node.
89        pages = DictionaryObject()
90        pages.update({
91                NameObject("/Type"): NameObject("/Pages"),
92                NameObject("/Count"): NumberObject(0),
93                NameObject("/Kids"): ArrayObject(),
94                })
95        self._pages = self._addObject(pages)
96
97        # info object
98        info = DictionaryObject()
99        info.update({
100                NameObject("/Producer"): createStringObject(codecs.BOM_UTF16_BE + u_("PyPDF2").encode('utf-16be'))
101                })
102        self._info = self._addObject(info)
103
104        # root object
105        root = DictionaryObject()
106        root.update({
107            NameObject("/Type"): NameObject("/Catalog"),
108            NameObject("/Pages"): self._pages,
109            })
110        self._root = None
111        self._root_object = root
112
113    def _addObject(self, obj):
114        self._objects.append(obj)
115        return IndirectObject(len(self._objects), 0, self)
116
117    def getObject(self, ido):
118        if ido.pdf != self:
119            raise ValueError("pdf must be self")
120        return self._objects[ido.idnum - 1]
121
122    def _addPage(self, page, action):
123        assert page["/Type"] == "/Page"
124        page[NameObject("/Parent")] = self._pages
125        page = self._addObject(page)
126        pages = self.getObject(self._pages)
127        action(pages["/Kids"], page)
128        pages[NameObject("/Count")] = NumberObject(pages["/Count"] + 1)
129
130    def addPage(self, page):
131        """
132        Adds a page to this PDF file.  The page is usually acquired from a
133        :class:`PdfFileReader<PdfFileReader>` instance.
134
135        :param PageObject page: The page to add to the document. Should be
136            an instance of :class:`PageObject<PyPDF2.pdf.PageObject>`
137        """
138        self._addPage(page, list.append)
139
140    def insertPage(self, page, index=0):
141        """
142        Insert a page in this PDF file. The page is usually acquired from a
143        :class:`PdfFileReader<PdfFileReader>` instance.
144
145        :param PageObject page: The page to add to the document.  This
146            argument should be an instance of :class:`PageObject<pdf.PageObject>`.
147        :param int index: Position at which the page will be inserted.
148        """
149        self._addPage(page, lambda l, p: l.insert(index, p))
150
151    def getPage(self, pageNumber):
152        """
153        Retrieves a page by number from this PDF file.
154
155        :param int pageNumber: The page number to retrieve
156            (pages begin at zero)
157        :return: the page at the index given by *pageNumber*
158        :rtype: :class:`PageObject<pdf.PageObject>`
159        """
160        pages = self.getObject(self._pages)
161        # XXX: crude hack
162        return pages["/Kids"][pageNumber].getObject()
163
164    def getNumPages(self):
165        """
166        :return: the number of pages.
167        :rtype: int
168        """
169        pages = self.getObject(self._pages)
170        return int(pages[NameObject("/Count")])
171
172    def addBlankPage(self, width=None, height=None):
173        """
174        Appends a blank page to this PDF file and returns it. If no page size
175        is specified, use the size of the last page.
176
177        :param float width: The width of the new page expressed in default user
178            space units.
179        :param float height: The height of the new page expressed in default
180            user space units.
181        :return: the newly appended page
182        :rtype: :class:`PageObject<PyPDF2.pdf.PageObject>`
183        :raises PageSizeNotDefinedError: if width and height are not defined
184            and previous page does not exist.
185        """
186        page = PageObject.createBlankPage(self, width, height)
187        self.addPage(page)
188        return page
189
190    def insertBlankPage(self, width=None, height=None, index=0):
191        """
192        Inserts a blank page to this PDF file and returns it. If no page size
193        is specified, use the size of the last page.
194
195        :param float width: The width of the new page expressed in default user
196            space units.
197        :param float height: The height of the new page expressed in default
198            user space units.
199        :param int index: Position to add the page.
200        :return: the newly appended page
201        :rtype: :class:`PageObject<PyPDF2.pdf.PageObject>`
202        :raises PageSizeNotDefinedError: if width and height are not defined
203            and previous page does not exist.
204        """
205        if width is None or height is None and \
206                (self.getNumPages() - 1) >= index:
207            oldpage = self.getPage(index)
208            width = oldpage.mediaBox.getWidth()
209            height = oldpage.mediaBox.getHeight()
210        page = PageObject.createBlankPage(self, width, height)
211        self.insertPage(page, index)
212        return page
213
214    def addJS(self, javascript):
215        """
216        Add Javascript which will launch upon opening this PDF.
217
218        :param str javascript: Your Javascript.
219
220        >>> output.addJS("this.print({bUI:true,bSilent:false,bShrinkToFit:true});")
221        # Example: This will launch the print window when the PDF is opened.
222        """
223        js = DictionaryObject()
224        js.update({
225                NameObject("/Type"): NameObject("/Action"),
226                NameObject("/S"): NameObject("/JavaScript"),
227                NameObject("/JS"): NameObject("(%s)" % javascript)
228                })
229        js_indirect_object = self._addObject(js)
230
231        # We need a name for parameterized javascript in the pdf file, but it can be anything.
232        js_string_name = str(uuid.uuid4())
233
234        js_name_tree = DictionaryObject()
235        js_name_tree.update({
236                NameObject("/JavaScript"): DictionaryObject({
237                  NameObject("/Names"): ArrayObject([createStringObject(js_string_name), js_indirect_object])
238                })
239              })
240        self._addObject(js_name_tree)
241
242        self._root_object.update({
243                NameObject("/OpenAction"): js_indirect_object,
244                NameObject("/Names"): js_name_tree
245                })
246
247    def addAttachment(self, fname, fdata):
248        """
249        Embed a file inside the PDF.
250
251        :param str fname: The filename to display.
252        :param str fdata: The data in the file.
253
254        Reference:
255        https://www.adobe.com/content/dam/Adobe/en/devnet/acrobat/pdfs/PDF32000_2008.pdf
256        Section 7.11.3
257        """
258
259        # We need 3 entries:
260        # * The file's data
261        # * The /Filespec entry
262        # * The file's name, which goes in the Catalog
263
264
265        # The entry for the file
266        """ Sample:
267        8 0 obj
268        <<
269         /Length 12
270         /Type /EmbeddedFile
271        >>
272        stream
273        Hello world!
274        endstream
275        endobj
276        """
277        file_entry = DecodedStreamObject()
278        file_entry.setData(fdata)
279        file_entry.update({
280                NameObject("/Type"): NameObject("/EmbeddedFile")
281                })
282
283        # The Filespec entry
284        """ Sample:
285        7 0 obj
286        <<
287         /Type /Filespec
288         /F (hello.txt)
289         /EF << /F 8 0 R >>
290        >>
291        """
292        efEntry = DictionaryObject()
293        efEntry.update({ NameObject("/F"):file_entry })
294
295        filespec = DictionaryObject()
296        filespec.update({
297                NameObject("/Type"): NameObject("/Filespec"),
298                NameObject("/F"): createStringObject(fname),  # Perhaps also try TextStringObject
299                NameObject("/EF"): efEntry
300                })
301
302        # Then create the entry for the root, as it needs a reference to the Filespec
303        """ Sample:
304        1 0 obj
305        <<
306         /Type /Catalog
307         /Outlines 2 0 R
308         /Pages 3 0 R
309         /Names << /EmbeddedFiles << /Names [(hello.txt) 7 0 R] >> >>
310        >>
311        endobj
312
313        """
314        embeddedFilesNamesDictionary = DictionaryObject()
315        embeddedFilesNamesDictionary.update({
316                NameObject("/Names"): ArrayObject([createStringObject(fname), filespec])
317                })
318
319        embeddedFilesDictionary = DictionaryObject()
320        embeddedFilesDictionary.update({
321                NameObject("/EmbeddedFiles"): embeddedFilesNamesDictionary
322                })
323        # Update the root
324        self._root_object.update({
325                NameObject("/Names"): embeddedFilesDictionary
326                })
327
328    def appendPagesFromReader(self, reader, after_page_append=None):
329        """
330        Copy pages from reader to writer. Includes an optional callback parameter
331        which is invoked after pages are appended to the writer.
332
333        :param reader: a PdfFileReader object from which to copy page
334            annotations to this writer object.  The writer's annots
335        will then be updated
336        :callback after_page_append (function): Callback function that is invoked after
337            each page is appended to the writer. Callback signature:
338
339            :param writer_pageref (PDF page reference): Reference to the page
340                appended to the writer.
341        """
342        # Get page count from writer and reader
343        reader_num_pages = reader.getNumPages()
344        writer_num_pages = self.getNumPages()
345
346        # Copy pages from reader to writer
347        for rpagenum in range(0, reader_num_pages):
348            reader_page = reader.getPage(rpagenum)
349            self.addPage(reader_page)
350            writer_page = self.getPage(writer_num_pages+rpagenum)
351            # Trigger callback, pass writer page as parameter
352            if callable(after_page_append): after_page_append(writer_page)
353
354    def updatePageFormFieldValues(self, page, fields):
355        '''
356        Update the form field values for a given page from a fields dictionary.
357        Copy field texts and values from fields to page.
358
359        :param page: Page reference from PDF writer where the annotations
360            and field data will be updated.
361        :param fields: a Python dictionary of field names (/T) and text
362            values (/V)
363        '''
364        # Iterate through pages, update field values
365        for j in range(0, len(page['/Annots'])):
366            writer_annot = page['/Annots'][j].getObject()
367            for field in fields:
368                if writer_annot.get('/T') == field:
369                    writer_annot.update({
370                        NameObject("/V"): TextStringObject(fields[field])
371                    })
372
373    def cloneReaderDocumentRoot(self, reader):
374        '''
375        Copy the reader document root to the writer.
376
377        :param reader:  PdfFileReader from the document root should be copied.
378        :callback after_page_append
379        '''
380        self._root_object = reader.trailer['/Root']
381
382    def cloneDocumentFromReader(self, reader, after_page_append=None):
383        '''
384        Create a copy (clone) of a document from a PDF file reader
385
386        :param reader: PDF file reader instance from which the clone
387            should be created.
388        :callback after_page_append (function): Callback function that is invoked after
389            each page is appended to the writer. Signature includes a reference to the
390            appended page (delegates to appendPagesFromReader). Callback signature:
391
392            :param writer_pageref (PDF page reference): Reference to the page just
393                appended to the document.
394        '''
395        self.cloneReaderDocumentRoot(reader)
396        self.appendPagesFromReader(reader, after_page_append)
397
398    def encrypt(self, user_pwd, owner_pwd = None, use_128bit = True):
399        """
400        Encrypt this PDF file with the PDF Standard encryption handler.
401
402        :param str user_pwd: The "user password", which allows for opening
403            and reading the PDF file with the restrictions provided.
404        :param str owner_pwd: The "owner password", which allows for
405            opening the PDF files without any restrictions.  By default,
406            the owner password is the same as the user password.
407        :param bool use_128bit: flag as to whether to use 128bit
408            encryption.  When false, 40bit encryption will be used.  By default,
409            this flag is on.
410        """
411        import time, random
412        if owner_pwd == None:
413            owner_pwd = user_pwd
414        if use_128bit:
415            V = 2
416            rev = 3
417            keylen = int(128 / 8)
418        else:
419            V = 1
420            rev = 2
421            keylen = int(40 / 8)
422        # permit everything:
423        P = -1
424        O = ByteStringObject(_alg33(owner_pwd, user_pwd, rev, keylen))
425        ID_1 = ByteStringObject(md5(b_(repr(time.time()))).digest())
426        ID_2 = ByteStringObject(md5(b_(repr(random.random()))).digest())
427        self._ID = ArrayObject((ID_1, ID_2))
428        if rev == 2:
429            U, key = _alg34(user_pwd, O, P, ID_1)
430        else:
431            assert rev == 3
432            U, key = _alg35(user_pwd, rev, keylen, O, P, ID_1, False)
433        encrypt = DictionaryObject()
434        encrypt[NameObject("/Filter")] = NameObject("/Standard")
435        encrypt[NameObject("/V")] = NumberObject(V)
436        if V == 2:
437            encrypt[NameObject("/Length")] = NumberObject(keylen * 8)
438        encrypt[NameObject("/R")] = NumberObject(rev)
439        encrypt[NameObject("/O")] = ByteStringObject(O)
440        encrypt[NameObject("/U")] = ByteStringObject(U)
441        encrypt[NameObject("/P")] = NumberObject(P)
442        self._encrypt = self._addObject(encrypt)
443        self._encrypt_key = key
444
    def write(self, stream):
        """
        Writes the collection of pages added to this object out as a PDF file.

        The catalog is registered as an indirect object if that has not
        happened yet, references reachable from the catalog are swept so that
        objects owned by other PDFs are copied into this writer, and then the
        header, every object, the cross-reference table, the trailer and the
        end-of-file marker are written, in that order.

        :param stream: An object to write the file to.  The object must support
            the write method and the tell method, similar to a file object.
        """
        # Only streams exposing ``mode`` can be checked.  A non-binary stream
        # merely triggers a warning; writing continues regardless.
        if hasattr(stream, 'mode') and 'b' not in stream.mode:
            warnings.warn("File <%s> to write to is not in binary mode. It may not be written to correctly." % stream.name)
        debug = False
        import struct

        # The catalog is added to the object list only once, on first use.
        if not self._root:
            self._root = self._addObject(self._root_object)

        # source pdf -> generation -> object number -> IndirectObject that
        # points at the corresponding object inside this writer.
        externalReferenceMap = {}

        # PDF objects sometimes have circular references to their /Page objects
        # inside their object tree (for example, annotations).  Those will be
        # indirect references to objects that we've recreated in this PDF.  To
        # address this problem, PageObject's store their original object
        # reference number, and we add it to the external reference map before
        # we sweep for indirect references.  This forces self-page-referencing
        # trees to reference the correct new object location, rather than
        # copying in a new copy of the page object.
        for objIndex in range(len(self._objects)):
            obj = self._objects[objIndex]
            if isinstance(obj, PageObject) and obj.indirectRef != None:
                data = obj.indirectRef
                if data.pdf not in externalReferenceMap:
                    externalReferenceMap[data.pdf] = {}
                if data.generation not in externalReferenceMap[data.pdf]:
                    externalReferenceMap[data.pdf][data.generation] = {}
                externalReferenceMap[data.pdf][data.generation][data.idnum] = IndirectObject(objIndex + 1, 0, self)

        # ``self.stack`` holds the object numbers already visited by the
        # sweep; it only exists for the duration of the sweep.
        self.stack = []
        if debug: print(("ERM:", externalReferenceMap, "root:", self._root))
        self._sweepIndirectReferences(externalReferenceMap, self._root)
        del self.stack

        # Begin writing:
        # object_positions[i] is the stream offset of object number i + 1.
        object_positions = []
        stream.write(self._header + b_("\n"))
        for i in range(len(self._objects)):
            idnum = (i + 1)
            obj = self._objects[i]
            object_positions.append(stream.tell())
            stream.write(b_(str(idnum) + " 0 obj\n"))
            key = None
            # The encryption dictionary itself is written without a key.
            if hasattr(self, "_encrypt") and idnum != self._encrypt.idnum:
                # Per-object key: the document key followed by the low three
                # bytes of the object number and two bytes of the generation
                # number (always 0 here), hashed with MD5 and cut to at most
                # 16 bytes.
                pack1 = struct.pack("<i", i + 1)[:3]
                pack2 = struct.pack("<i", 0)[:2]
                key = self._encrypt_key + pack1 + pack2
                assert len(key) == (len(self._encrypt_key) + 5)
                md5_hash = md5(key).digest()
                key = md5_hash[:min(16, len(self._encrypt_key) + 5)]
            obj.writeToStream(stream, key)
            stream.write(b_("\nendobj\n"))

        # xref table
        # One section starting at object 0: the free entry for object 0,
        # then one in-use entry (generation 0) per written object.
        xref_location = stream.tell()
        stream.write(b_("xref\n"))
        stream.write(b_("0 %s\n" % (len(self._objects) + 1)))
        stream.write(b_("%010d %05d f \n" % (0, 65535)))
        for offset in object_positions:
            stream.write(b_("%010d %05d n \n" % (offset, 0)))

        # trailer
        stream.write(b_("trailer\n"))
        trailer = DictionaryObject()
        trailer.update({
                NameObject("/Size"): NumberObject(len(self._objects) + 1),
                NameObject("/Root"): self._root,
                NameObject("/Info"): self._info,
                })
        # /ID and /Encrypt are only set once encrypt() has been called.
        if hasattr(self, "_ID"):
            trailer[NameObject("/ID")] = self._ID
        if hasattr(self, "_encrypt"):
            trailer[NameObject("/Encrypt")] = self._encrypt
        trailer.writeToStream(stream, None)

        # eof
        # startxref carries the offset at which the xref table begins.
        stream.write(b_("\nstartxref\n%s\n%%%%EOF\n" % (xref_location)))
528
529    def addMetadata(self, infos):
530        """
531        Add custom metadata to the output.
532
533        :param dict infos: a Python dictionary where each key is a field
534            and each value is your new metadata.
535        """
536        args = {}
537        for key, value in list(infos.items()):
538            args[NameObject(key)] = createStringObject(value)
539        self.getObject(self._info).update(args)
540
    def _sweepIndirectReferences(self, externMap, data):
        """
        Recursively walk *data* and make every reference in it point at an
        object of this writer.

        Dictionaries and arrays are updated in place.  Stream objects found
        as dictionary values or array elements are registered as indirect
        objects and replaced by their reference.  References to objects of
        another PDF are replaced by references to a copy registered in this
        writer; copies are recorded in *externMap* so each foreign object is
        copied once.

        :param externMap: nested mapping
            ``pdf -> generation -> object number -> IndirectObject`` of
            foreign objects that already have a counterpart in this writer.
        :param data: the object to sweep.
        :return: *data* itself, a reference that replaces it, or a
            ``NullObject`` when a foreign reference cannot be resolved.
        """
        debug = False
        if debug: print((data, "TYPE", data.__class__.__name__))
        if isinstance(data, DictionaryObject):
            # Iterate over a snapshot because values are reassigned below.
            for key, value in list(data.items()):
                origvalue = value
                value = self._sweepIndirectReferences(externMap, value)
                if isinstance(value, StreamObject):
                    # a dictionary value is a stream.  streams must be indirect
                    # objects, so we need to change this value.
                    value = self._addObject(value)
                data[key] = value
            return data
        elif isinstance(data, ArrayObject):
            for i in range(len(data)):
                value = self._sweepIndirectReferences(externMap, data[i])
                if isinstance(value, StreamObject):
                    # an array value is a stream.  streams must be indirect
                    # objects, so we need to change this value
                    value = self._addObject(value)
                data[i] = value
            return data
        elif isinstance(data, IndirectObject):
            # internal indirect references are fine
            if data.pdf == self:
                # ``self.stack`` lists the object numbers visited so far; an
                # object that is already listed is not walked again, which
                # also stops reference cycles.
                if data.idnum in self.stack:
                    return data
                else:
                    self.stack.append(data.idnum)
                    realdata = self.getObject(data)
                    self._sweepIndirectReferences(externMap, realdata)
                    return data
            else:
                # Reference into another PDF: reuse the counterpart if one is
                # already known, otherwise copy the object into this writer.
                newobj = externMap.get(data.pdf, {}).get(data.generation, {}).get(data.idnum, None)
                if newobj == None:
                    try:
                        newobj = data.pdf.getObject(data)
                        # Reserve the object number and record the mapping
                        # before sweeping the copy, so that references back to
                        # this object met during the sweep resolve to the
                        # reserved slot.
                        self._objects.append(None) # placeholder
                        idnum = len(self._objects)
                        newobj_ido = IndirectObject(idnum, 0, self)
                        if data.pdf not in externMap:
                            externMap[data.pdf] = {}
                        if data.generation not in externMap[data.pdf]:
                            externMap[data.pdf][data.generation] = {}
                        externMap[data.pdf][data.generation][data.idnum] = newobj_ido
                        newobj = self._sweepIndirectReferences(externMap, newobj)
                        self._objects[idnum-1] = newobj
                        return newobj_ido
                    except ValueError:
                        # Unable to resolve the Object, returning NullObject instead.
                        return NullObject()
                return newobj
        else:
            # Any other object holds no references and is returned as is.
            return data
595
596    def getReference(self, obj):
597        idnum = self._objects.index(obj) + 1
598        ref = IndirectObject(idnum, 0, self)
599        assert ref.getObject() == obj
600        return ref
601
602    def getOutlineRoot(self):
603        if '/Outlines' in self._root_object:
604            outline = self._root_object['/Outlines']
605            idnum = self._objects.index(outline) + 1
606            outlineRef = IndirectObject(idnum, 0, self)
607            assert outlineRef.getObject() == outline
608        else:
609            outline = TreeObject()
610            outline.update({ })
611            outlineRef = self._addObject(outline)
612            self._root_object[NameObject('/Outlines')] = outlineRef
613
614        return outline
615
616    def getNamedDestRoot(self):
617        if '/Names' in self._root_object and isinstance(self._root_object['/Names'], DictionaryObject):
618            names = self._root_object['/Names']
619            idnum = self._objects.index(names) + 1
620            namesRef = IndirectObject(idnum, 0, self)
621            assert namesRef.getObject() == names
622            if '/Dests' in names and isinstance(names['/Dests'], DictionaryObject):
623                dests = names['/Dests']
624                idnum = self._objects.index(dests) + 1
625                destsRef = IndirectObject(idnum, 0, self)
626                assert destsRef.getObject() == dests
627                if '/Names' in dests:
628                    nd = dests['/Names']
629                else:
630                    nd = ArrayObject()
631                    dests[NameObject('/Names')] = nd
632            else:
633                dests = DictionaryObject()
634                destsRef = self._addObject(dests)
635                names[NameObject('/Dests')] = destsRef
636                nd = ArrayObject()
637                dests[NameObject('/Names')] = nd
638
639        else:
640            names = DictionaryObject()
641            namesRef = self._addObject(names)
642            self._root_object[NameObject('/Names')] = namesRef
643            dests = DictionaryObject()
644            destsRef = self._addObject(dests)
645            names[NameObject('/Dests')] = destsRef
646            nd = ArrayObject()
647            dests[NameObject('/Names')] = nd
648
649        return nd
650
651    def addBookmarkDestination(self, dest, parent=None):
652        destRef = self._addObject(dest)
653
654        outlineRef = self.getOutlineRoot()
655
656        if parent == None:
657            parent = outlineRef
658
659        parent = parent.getObject()
660        #print parent.__class__.__name__
661        parent.addChild(destRef, self)
662
663        return destRef
664
665    def addBookmarkDict(self, bookmark, parent=None):
666        bookmarkObj = TreeObject()
667        for k, v in list(bookmark.items()):
668            bookmarkObj[NameObject(str(k))] = v
669        bookmarkObj.update(bookmark)
670
671        if '/A' in bookmark:
672            action = DictionaryObject()
673            for k, v in list(bookmark['/A'].items()):
674                action[NameObject(str(k))] = v
675            actionRef = self._addObject(action)
676            bookmarkObj[NameObject('/A')] = actionRef
677
678        bookmarkRef = self._addObject(bookmarkObj)
679
680        outlineRef = self.getOutlineRoot()
681
682        if parent == None:
683            parent = outlineRef
684
685        parent = parent.getObject()
686        parent.addChild(bookmarkRef, self)
687
688        return bookmarkRef
689
690    def addBookmark(self, title, pagenum, parent=None, color=None, bold=False, italic=False, fit='/Fit', *args):
691        """
692        Add a bookmark to this PDF file.
693
694        :param str title: Title to use for this bookmark.
695        :param int pagenum: Page number this bookmark will point to.
696        :param parent: A reference to a parent bookmark to create nested
697            bookmarks.
698        :param tuple color: Color of the bookmark as a red, green, blue tuple
699            from 0.0 to 1.0
700        :param bool bold: Bookmark is bold
701        :param bool italic: Bookmark is italic
702        :param str fit: The fit of the destination page. See
703            :meth:`addLink()<addLink>` for details.
704        """
705        pageRef = self.getObject(self._pages)['/Kids'][pagenum]
706        action = DictionaryObject()
707        zoomArgs = []
708        for a in args:
709            if a is not None:
710                zoomArgs.append(NumberObject(a))
711            else:
712                zoomArgs.append(NullObject())
713        dest = Destination(NameObject("/"+title + " bookmark"), pageRef, NameObject(fit), *zoomArgs)
714        destArray = dest.getDestArray()
715        action.update({
716            NameObject('/D') : destArray,
717            NameObject('/S') : NameObject('/GoTo')
718        })
719        actionRef = self._addObject(action)
720
721        outlineRef = self.getOutlineRoot()
722
723        if parent == None:
724            parent = outlineRef
725
726        bookmark = TreeObject()
727
728        bookmark.update({
729            NameObject('/A'): actionRef,
730            NameObject('/Title'): createStringObject(title),
731        })
732
733        if color is not None:
734            bookmark.update({NameObject('/C'): ArrayObject([FloatObject(c) for c in color])})
735
736        format = 0
737        if italic:
738            format += 1
739        if bold:
740            format += 2
741        if format:
742            bookmark.update({NameObject('/F'): NumberObject(format)})
743
744        bookmarkRef = self._addObject(bookmark)
745
746        parent = parent.getObject()
747        parent.addChild(bookmarkRef, self)
748
749        return bookmarkRef
750
751    def addNamedDestinationObject(self, dest):
752        destRef = self._addObject(dest)
753
754        nd = self.getNamedDestRoot()
755        nd.extend([dest['/Title'], destRef])
756
757        return destRef
758
759    def addNamedDestination(self, title, pagenum):
760        pageRef = self.getObject(self._pages)['/Kids'][pagenum]
761        dest = DictionaryObject()
762        dest.update({
763            NameObject('/D') : ArrayObject([pageRef, NameObject('/FitH'), NumberObject(826)]),
764            NameObject('/S') : NameObject('/GoTo')
765        })
766
767        destRef = self._addObject(dest)
768        nd = self.getNamedDestRoot()
769
770        nd.extend([title, destRef])
771
772        return destRef
773
774    def removeLinks(self):
775        """
776        Removes links and annotations from this output.
777        """
778        pages = self.getObject(self._pages)['/Kids']
779        for page in pages:
780            pageRef = self.getObject(page)
781            if "/Annots" in pageRef:
782                del pageRef['/Annots']
783
784    def removeImages(self, ignoreByteStringObject=False):
785        """
786        Removes images from this output.
787
788        :param bool ignoreByteStringObject: optional parameter
789            to ignore ByteString Objects.
790        """
791        pages = self.getObject(self._pages)['/Kids']
792        for j in range(len(pages)):
793            page = pages[j]
794            pageRef = self.getObject(page)
795            content = pageRef['/Contents'].getObject()
796            if not isinstance(content, ContentStream):
797                content = ContentStream(content, pageRef)
798
799            _operations = []
800            seq_graphics = False
801            for operands, operator in content.operations:
802                if operator == b_('Tj'):
803                    text = operands[0]
804                    if ignoreByteStringObject:
805                        if not isinstance(text, TextStringObject):
806                            operands[0] = TextStringObject()
807                elif operator == b_("'"):
808                    text = operands[0]
809                    if ignoreByteStringObject:
810                        if not isinstance(text, TextStringObject):
811                            operands[0] = TextStringObject()
812                elif operator == b_('"'):
813                    text = operands[2]
814                    if ignoreByteStringObject:
815                        if not isinstance(text, TextStringObject):
816                            operands[2] = TextStringObject()
817                elif operator == b_("TJ"):
818                    for i in range(len(operands[0])):
819                        if ignoreByteStringObject:
820                            if not isinstance(operands[0][i], TextStringObject):
821                                operands[0][i] = TextStringObject()
822
823                if operator == b_('q'):
824                    seq_graphics = True
825                if operator == b_('Q'):
826                    seq_graphics = False
827                if seq_graphics:
828                    if operator in [b_('cm'), b_('w'), b_('J'), b_('j'), b_('M'), b_('d'), b_('ri'), b_('i'),
829                            b_('gs'), b_('W'), b_('b'), b_('s'), b_('S'), b_('f'), b_('F'), b_('n'), b_('m'), b_('l'),
830                            b_('c'), b_('v'), b_('y'), b_('h'), b_('B'), b_('Do'), b_('sh')]:
831                        continue
832                if operator == b_('re'):
833                    continue
834                _operations.append((operands, operator))
835
836            content.operations = _operations
837            pageRef.__setitem__(NameObject('/Contents'), content)
838
839    def removeText(self, ignoreByteStringObject=False):
840        """
841        Removes images from this output.
842
843        :param bool ignoreByteStringObject: optional parameter
844            to ignore ByteString Objects.
845        """
846        pages = self.getObject(self._pages)['/Kids']
847        for j in range(len(pages)):
848            page = pages[j]
849            pageRef = self.getObject(page)
850            content = pageRef['/Contents'].getObject()
851            if not isinstance(content, ContentStream):
852                content = ContentStream(content, pageRef)
853            for operands,operator in content.operations:
854                if operator == b_('Tj'):
855                    text = operands[0]
856                    if not ignoreByteStringObject:
857                        if isinstance(text, TextStringObject):
858                            operands[0] = TextStringObject()
859                    else:
860                        if isinstance(text, TextStringObject) or \
861                                isinstance(text, ByteStringObject):
862                            operands[0] = TextStringObject()
863                elif operator == b_("'"):
864                    text = operands[0]
865                    if not ignoreByteStringObject:
866                        if isinstance(text, TextStringObject):
867                            operands[0] = TextStringObject()
868                    else:
869                        if isinstance(text, TextStringObject) or \
870                                isinstance(text, ByteStringObject):
871                            operands[0] = TextStringObject()
872                elif operator == b_('"'):
873                    text = operands[2]
874                    if not ignoreByteStringObject:
875                        if isinstance(text, TextStringObject):
876                            operands[2] = TextStringObject()
877                    else:
878                        if isinstance(text, TextStringObject) or \
879                                isinstance(text, ByteStringObject):
880                            operands[2] = TextStringObject()
881                elif operator == b_("TJ"):
882                    for i in range(len(operands[0])):
883                        if not ignoreByteStringObject:
884                            if isinstance(operands[0][i], TextStringObject):
885                                operands[0][i] = TextStringObject()
886                        else:
887                            if isinstance(operands[0][i], TextStringObject) or \
888                                    isinstance(operands[0][i], ByteStringObject):
889                                operands[0][i] = TextStringObject()
890
891            pageRef.__setitem__(NameObject('/Contents'), content)
892
893    def addLink(self, pagenum, pagedest, rect, border=None, fit='/Fit', *args):
894        """
895        Add an internal link from a rectangular area to the specified page.
896
897        :param int pagenum: index of the page on which to place the link.
898        :param int pagedest: index of the page to which the link should go.
899        :param rect: :class:`RectangleObject<PyPDF2.generic.RectangleObject>` or array of four
900            integers specifying the clickable rectangular area
901            ``[xLL, yLL, xUR, yUR]``, or string in the form ``"[ xLL yLL xUR yUR ]"``.
902        :param border: if provided, an array describing border-drawing
903            properties. See the PDF spec for details. No border will be
904            drawn if this argument is omitted.
905        :param str fit: Page fit or 'zoom' option (see below). Additional arguments may need
906            to be supplied. Passing ``None`` will be read as a null value for that coordinate.
907
908        Valid zoom arguments (see Table 8.2 of the PDF 1.7 reference for details):
909             /Fit       No additional arguments
910             /XYZ       [left] [top] [zoomFactor]
911             /FitH      [top]
912             /FitV      [left]
913             /FitR      [left] [bottom] [right] [top]
914             /FitB      No additional arguments
915             /FitBH     [top]
916             /FitBV     [left]
917        """
918
919        pageLink = self.getObject(self._pages)['/Kids'][pagenum]
920        pageDest = self.getObject(self._pages)['/Kids'][pagedest] #TODO: switch for external link
921        pageRef = self.getObject(pageLink)
922
923        if border is not None:
924            borderArr = [NameObject(n) for n in border[:3]]
925            if len(border) == 4:
926                dashPattern = ArrayObject([NameObject(n) for n in border[3]])
927                borderArr.append(dashPattern)
928        else:
929            borderArr = [NumberObject(0)] * 3
930
931        if isString(rect):
932            rect = NameObject(rect)
933        elif isinstance(rect, RectangleObject):
934            pass
935        else:
936            rect = RectangleObject(rect)
937
938        zoomArgs = []
939        for a in args:
940            if a is not None:
941                zoomArgs.append(NumberObject(a))
942            else:
943                zoomArgs.append(NullObject())
944        dest = Destination(NameObject("/LinkName"), pageDest, NameObject(fit), *zoomArgs) #TODO: create a better name for the link
945        destArray = dest.getDestArray()
946
947        lnk = DictionaryObject()
948        lnk.update({
949            NameObject('/Type'): NameObject('/Annot'),
950            NameObject('/Subtype'): NameObject('/Link'),
951            NameObject('/P'): pageLink,
952            NameObject('/Rect'): rect,
953            NameObject('/Border'): ArrayObject(borderArr),
954            NameObject('/Dest'): destArray
955        })
956        lnkRef = self._addObject(lnk)
957
958        if "/Annots" in pageRef:
959            pageRef['/Annots'].append(lnkRef)
960        else:
961            pageRef[NameObject('/Annots')] = ArrayObject([lnkRef])
962
    # Layout names that setPageLayout() accepts without emitting a warning.
    _valid_layouts = ['/NoLayout', '/SinglePage', '/OneColumn', '/TwoColumnLeft', '/TwoColumnRight', '/TwoPageLeft', '/TwoPageRight']
964
965    def getPageLayout(self):
966        """
967        Get the page layout.
968        See :meth:`setPageLayout()<PdfFileWriter.setPageLayout>` for a description of valid layouts.
969
970        :return: Page layout currently being used.
971        :rtype: str, None if not specified
972        """
973        try:
974            return self._root_object['/PageLayout']
975        except KeyError:
976            return None
977
978    def setPageLayout(self, layout):
979        """
980        Set the page layout
981
982        :param str layout: The page layout to be used
983
984        Valid layouts are:
985             /NoLayout        Layout explicitly not specified
986             /SinglePage      Show one page at a time
987             /OneColumn       Show one column at a time
988             /TwoColumnLeft   Show pages in two columns, odd-numbered pages on the left
989             /TwoColumnRight  Show pages in two columns, odd-numbered pages on the right
990             /TwoPageLeft     Show two pages at a time, odd-numbered pages on the left
991             /TwoPageRight    Show two pages at a time, odd-numbered pages on the right
992        """
993        if not isinstance(layout, NameObject):
994            if layout not in self._valid_layouts:
995                warnings.warn("Layout should be one of: {}".format(', '.join(self._valid_layouts)))
996            layout = NameObject(layout)
997        self._root_object.update({NameObject('/PageLayout'): layout})
998
    # Bound directly to the two functions above (not through a lambda), so a
    # subclass that overrides getPageLayout()/setPageLayout() does not change
    # what this property calls.
    pageLayout = property(getPageLayout, setPageLayout)
    """Read and write property accessing the :meth:`getPageLayout()<PdfFileWriter.getPageLayout>`
    and :meth:`setPageLayout()<PdfFileWriter.setPageLayout>` methods."""
1002
    # Mode names that setPageMode() accepts without emitting a warning.
    _valid_modes = ['/UseNone', '/UseOutlines', '/UseThumbs', '/FullScreen', '/UseOC', '/UseAttachments']
1004
1005    def getPageMode(self):
1006        """
1007        Get the page mode.
1008        See :meth:`setPageMode()<PdfFileWriter.setPageMode>` for a description
1009        of valid modes.
1010
1011        :return: Page mode currently being used.
1012        :rtype: str, None if not specified
1013        """
1014        try:
1015            return self._root_object['/PageMode']
1016        except KeyError:
1017            return None
1018
1019    def setPageMode(self, mode):
1020        """
1021        Set the page mode.
1022
1023        :param str mode: The page mode to use.
1024
1025        Valid modes are:
1026            /UseNone         Do not show outlines or thumbnails panels
1027            /UseOutlines     Show outlines (aka bookmarks) panel
1028            /UseThumbs       Show page thumbnails panel
1029            /FullScreen      Fullscreen view
1030            /UseOC           Show Optional Content Group (OCG) panel
1031            /UseAttachments  Show attachments panel
1032        """
1033        if not isinstance(mode, NameObject):
1034            if mode not in self._valid_modes:
1035                warnings.warn("Mode should be one of: {}".format(', '.join(self._valid_modes)))
1036            mode = NameObject(mode)
1037        self._root_object.update({NameObject('/PageMode'): mode})
1038
    # Bound directly to the two functions above (not through a lambda), so a
    # subclass that overrides getPageMode()/setPageMode() does not change
    # what this property calls.
    pageMode = property(getPageMode, setPageMode)
    """Read and write property accessing the :meth:`getPageMode()<PdfFileWriter.getPageMode>`
    and :meth:`setPageMode()<PdfFileWriter.setPageMode>` methods."""
1042
1043
1044class PdfFileReader(object):
1045    """
1046    Initializes a PdfFileReader object.  This operation can take some time, as
1047    the PDF stream's cross-reference tables are read into memory.
1048
1049    :param stream: A File object or an object that supports the standard read
1050        and seek methods similar to a File object. Could also be a
1051        string representing a path to a PDF file.
1052    :param bool strict: Determines whether user should be warned of all
1053        problems and also causes some correctable problems to be fatal.
1054        Defaults to ``True``.
1055    :param warndest: Destination for logging warnings (defaults to
1056        ``sys.stderr``).
1057    :param bool overwriteWarnings: Determines whether to override Python's
1058        ``warnings.py`` module with a custom implementation (defaults to
1059        ``True``).
1060    """
1061    def __init__(self, stream, strict=True, warndest = None, overwriteWarnings = True):
1062        if overwriteWarnings:
1063            # have to dynamically override the default showwarning since there are no
1064            # public methods that specify the 'file' parameter
1065            def _showwarning(message, category, filename, lineno, file=warndest, line=None):
1066                if file is None:
1067                    file = sys.stderr
1068                try:
1069                    file.write(formatWarning(message, category, filename, lineno, line))
1070                except IOError:
1071                    pass
1072            warnings.showwarning = _showwarning
1073        self.strict = strict
1074        self.flattenedPages = None
1075        self.resolvedObjects = {}
1076        self.xrefIndex = 0
1077        self._pageId2Num = None # map page IndirectRef number to Page Number
1078        if hasattr(stream, 'mode') and 'b' not in stream.mode:
1079            warnings.warn("PdfFileReader stream/file object is not in binary mode. It may not be read correctly.", utils.PdfReadWarning)
1080        if isString(stream):
1081            fileobj = open(stream, 'rb')
1082            stream = BytesIO(b_(fileobj.read()))
1083            fileobj.close()
1084        self.read(stream)
1085        self.stream = stream
1086
1087        self._override_encryption = False
1088
1089    def getDocumentInfo(self):
1090        """
1091        Retrieves the PDF file's document information dictionary, if it exists.
1092        Note that some PDF files use metadata streams instead of docinfo
1093        dictionaries, and these metadata streams will not be accessed by this
1094        function.
1095
1096        :return: the document information of this PDF file
1097        :rtype: :class:`DocumentInformation<pdf.DocumentInformation>` or ``None`` if none exists.
1098        """
1099        if "/Info" not in self.trailer:
1100            return None
1101        obj = self.trailer['/Info']
1102        retval = DocumentInformation()
1103        retval.update(obj)
1104        return retval
1105
    # The lambda looks getDocumentInfo() up on the instance at call time;
    # no setter or deleter is installed.
    documentInfo = property(lambda self: self.getDocumentInfo(), None, None)
    """Read-only property that accesses the :meth:`getDocumentInfo()<PdfFileReader.getDocumentInfo>` function."""
1108
1109    def getXmpMetadata(self):
1110        """
1111        Retrieves XMP (Extensible Metadata Platform) data from the PDF document
1112        root.
1113
1114        :return: a :class:`XmpInformation<xmp.XmpInformation>`
1115            instance that can be used to access XMP metadata from the document.
1116        :rtype: :class:`XmpInformation<xmp.XmpInformation>` or
1117            ``None`` if no metadata was found on the document root.
1118        """
1119        try:
1120            self._override_encryption = True
1121            return self.trailer["/Root"].getXmpMetadata()
1122        finally:
1123            self._override_encryption = False
1124
    # The lambda looks getXmpMetadata() up on the instance at call time;
    # no setter or deleter is installed.
    xmpMetadata = property(lambda self: self.getXmpMetadata(), None, None)
    """
    Read-only property that accesses the
    :meth:`getXmpMetadata()<PdfFileReader.getXmpMetadata>` function.
    """
1130
1131    def getNumPages(self):
1132        """
1133        Calculates the number of pages in this PDF file.
1134
1135        :return: number of pages
1136        :rtype: int
1137        :raises PdfReadError: if file is encrypted and restrictions prevent
1138            this action.
1139        """
1140
1141        # Flattened pages will not work on an Encrypted PDF;
1142        # the PDF file's page count is used in this case. Otherwise,
1143        # the original method (flattened page count) is used.
1144        if self.isEncrypted:
1145            try:
1146                self._override_encryption = True
1147                self.decrypt('')
1148                return self.trailer["/Root"]["/Pages"]["/Count"]
1149            except:
1150                raise utils.PdfReadError("File has not been decrypted")
1151            finally:
1152                self._override_encryption = False
1153        else:
1154            if self.flattenedPages == None:
1155                self._flatten()
1156            return len(self.flattenedPages)
1157
    # The lambda looks getNumPages() up on the instance at call time;
    # no setter or deleter is installed.
    numPages = property(lambda self: self.getNumPages(), None, None)
    """
    Read-only property that accesses the
    :meth:`getNumPages()<PdfFileReader.getNumPages>` function.
    """
1163
1164    def getPage(self, pageNumber):
1165        """
1166        Retrieves a page by number from this PDF file.
1167
1168        :param int pageNumber: The page number to retrieve
1169            (pages begin at zero)
1170        :return: a :class:`PageObject<pdf.PageObject>` instance.
1171        :rtype: :class:`PageObject<pdf.PageObject>`
1172        """
1173        ## ensure that we're not trying to access an encrypted PDF
1174        #assert not self.trailer.has_key("/Encrypt")
1175        if self.flattenedPages == None:
1176            self._flatten()
1177        return self.flattenedPages[pageNumber]
1178
    # The lambda looks getNamedDestinations() up on the instance at call time;
    # no setter or deleter is installed.
    namedDestinations = property(lambda self:
                                  self.getNamedDestinations(), None, None)
    """
    Read-only property that accesses the
    :meth:`getNamedDestinations()<PdfFileReader.getNamedDestinations>` function.
    """
1185
    # getFields() reports only a select group of relevant field attributes
    # (its ``fieldAttributes`` mapping). For the complete list, see section
    # 8.6.2 of the PDF 1.7 reference.
1188
1189    def getFields(self, tree = None, retval = None, fileobj = None):
1190        """
1191        Extracts field data if this PDF contains interactive form fields.
1192        The *tree* and *retval* parameters are for recursive use.
1193
1194        :param fileobj: A file object (usually a text file) to write
1195            a report to on all interactive form fields found.
1196        :return: A dictionary where each key is a field name, and each
1197            value is a :class:`Field<PyPDF2.generic.Field>` object. By
1198            default, the mapping name is used for keys.
1199        :rtype: dict, or ``None`` if form data could not be located.
1200        """
1201        fieldAttributes = {"/FT" : "Field Type", "/Parent" : "Parent",
1202                       "/T" : "Field Name", "/TU" : "Alternate Field Name",
1203                       "/TM" : "Mapping Name", "/Ff" : "Field Flags",
1204                       "/V" : "Value", "/DV" : "Default Value"}
1205        if retval == None:
1206            retval = {}
1207            catalog = self.trailer["/Root"]
1208            # get the AcroForm tree
1209            if "/AcroForm" in catalog:
1210                tree = catalog["/AcroForm"]
1211            else:
1212                return None
1213        if tree == None:
1214            return retval
1215
1216        self._checkKids(tree, retval, fileobj)
1217        for attr in fieldAttributes:
1218            if attr in tree:
1219                # Tree is a field
1220                self._buildField(tree, retval, fileobj, fieldAttributes)
1221                break
1222
1223        if "/Fields" in tree:
1224            fields = tree["/Fields"]
1225            for f in fields:
1226                field = f.getObject()
1227                self._buildField(field, retval, fileobj, fieldAttributes)
1228
1229        return retval
1230
1231    def _buildField(self, field, retval, fileobj, fieldAttributes):
1232        self._checkKids(field, retval, fileobj)
1233        try:
1234            key = field["/TM"]
1235        except KeyError:
1236            try:
1237                key = field["/T"]
1238            except KeyError:
1239                # Ignore no-name field for now
1240                return
1241        if fileobj:
1242            self._writeField(fileobj, field, fieldAttributes)
1243            fileobj.write("\n")
1244        retval[key] = Field(field)
1245
1246    def _checkKids(self, tree, retval, fileobj):
1247        if "/Kids" in tree:
1248            # recurse down the tree
1249            for kid in tree["/Kids"]:
1250                self.getFields(kid.getObject(), retval, fileobj)
1251
1252    def _writeField(self, fileobj, field, fieldAttributes):
1253        order = ["/TM", "/T", "/FT", "/Parent", "/TU", "/Ff", "/V", "/DV"]
1254        for attr in order:
1255            attrName = fieldAttributes[attr]
1256            try:
1257                if attr == "/FT":
1258                    # Make the field type value more clear
1259                    types = {"/Btn":"Button", "/Tx":"Text", "/Ch": "Choice",
1260                             "/Sig":"Signature"}
1261                    if field[attr] in types:
1262                        fileobj.write(attrName + ": " + types[field[attr]] + "\n")
1263                elif attr == "/Parent":
1264                    # Let's just write the name of the parent
1265                    try:
1266                        name = field["/Parent"]["/TM"]
1267                    except KeyError:
1268                        name = field["/Parent"]["/T"]
1269                    fileobj.write(attrName + ": " + name + "\n")
1270                else:
1271                    fileobj.write(attrName + ": " + str(field[attr]) + "\n")
1272            except KeyError:
1273                # Field attribute is N/A or unknown, so don't write anything
1274                pass
1275
1276    def getFormTextFields(self):
1277        ''' Retrieves form fields from the document with textual data (inputs, dropdowns)
1278        '''
1279        # Retrieve document form fields
1280        formfields = self.getFields()
1281        return dict(
1282            (formfields[field]['/T'], formfields[field].get('/V')) for field in formfields \
1283                if formfields[field].get('/FT') == '/Tx'
1284        )
1285
1286    def getNamedDestinations(self, tree=None, retval=None):
1287        """
1288        Retrieves the named destinations present in the document.
1289
1290        :return: a dictionary which maps names to
1291            :class:`Destinations<PyPDF2.generic.Destination>`.
1292        :rtype: dict
1293        """
1294        if retval == None:
1295            retval = {}
1296            catalog = self.trailer["/Root"]
1297
1298            # get the name tree
1299            if "/Dests" in catalog:
1300                tree = catalog["/Dests"]
1301            elif "/Names" in catalog:
1302                names = catalog['/Names']
1303                if "/Dests" in names:
1304                    tree = names['/Dests']
1305
1306        if tree == None:
1307            return retval
1308
1309        if "/Kids" in tree:
1310            # recurse down the tree
1311            for kid in tree["/Kids"]:
1312                self.getNamedDestinations(kid.getObject(), retval)
1313
1314        if "/Names" in tree:
1315            names = tree["/Names"]
1316            for i in range(0, len(names), 2):
1317                key = names[i].getObject()
1318                val = names[i+1].getObject()
1319                if isinstance(val, DictionaryObject) and '/D' in val:
1320                    val = val['/D']
1321                dest = self._buildDestination(key, val)
1322                if dest != None:
1323                    retval[key] = dest
1324
1325        return retval
1326
    # The lambda looks getOutlines() up on the instance at call time;
    # no setter or deleter is installed.
    outlines = property(lambda self: self.getOutlines(), None, None)
    """
    Read-only property that accesses the
        :meth:`getOutlines()<PdfFileReader.getOutlines>` function.
    """
1332
1333    def getOutlines(self, node=None, outlines=None):
1334        """
1335        Retrieves the document outline present in the document.
1336
1337        :return: a nested list of :class:`Destinations<PyPDF2.generic.Destination>`.
1338        """
1339        if outlines == None:
1340            outlines = []
1341            catalog = self.trailer["/Root"]
1342
1343            # get the outline dictionary and named destinations
1344            if "/Outlines" in catalog:
1345                try:
1346                    lines = catalog["/Outlines"]
1347                except utils.PdfReadError:
1348                    # this occurs if the /Outlines object reference is incorrect
1349                    # for an example of such a file, see https://unglueit-files.s3.amazonaws.com/ebf/7552c42e9280b4476e59e77acc0bc812.pdf
1350                    # so continue to load the file without the Bookmarks
1351                    return outlines
1352
1353                if "/First" in lines:
1354                    node = lines["/First"]
1355            self._namedDests = self.getNamedDestinations()
1356
1357        if node == None:
1358          return outlines
1359
1360        # see if there are any more outlines
1361        while True:
1362            outline = self._buildOutline(node)
1363            if outline:
1364                outlines.append(outline)
1365
1366            # check for sub-outlines
1367            if "/First" in node:
1368                subOutlines = []
1369                self.getOutlines(node["/First"], subOutlines)
1370                if subOutlines:
1371                    outlines.append(subOutlines)
1372
1373            if "/Next" not in node:
1374                break
1375            node = node["/Next"]
1376
1377        return outlines
1378
1379    def _getPageNumberByIndirect(self, indirectRef):
1380        """Generate _pageId2Num"""
1381        if self._pageId2Num is None:
1382            id2num = {}
1383            for i, x in enumerate(self.pages):
1384                id2num[x.indirectRef.idnum] = i
1385            self._pageId2Num = id2num
1386
1387        if isinstance(indirectRef, int):
1388            idnum = indirectRef
1389        else:
1390            idnum = indirectRef.idnum
1391
1392        ret = self._pageId2Num.get(idnum, -1)
1393        return ret
1394
1395    def getPageNumber(self, page):
1396        """
1397        Retrieve page number of a given PageObject
1398
1399        :param PageObject page: The page to get page number. Should be
1400            an instance of :class:`PageObject<PyPDF2.pdf.PageObject>`
1401        :return: the page number or -1 if page not found
1402        :rtype: int
1403        """
1404        indirectRef = page.indirectRef
1405        ret = self._getPageNumberByIndirect(indirectRef)
1406        return ret
1407
1408    def getDestinationPageNumber(self, destination):
1409        """
1410        Retrieve page number of a given Destination object
1411
1412        :param Destination destination: The destination to get page number.
1413             Should be an instance of
1414             :class:`Destination<PyPDF2.pdf.Destination>`
1415        :return: the page number or -1 if page not found
1416        :rtype: int
1417        """
1418        indirectRef = destination.page
1419        ret = self._getPageNumberByIndirect(indirectRef)
1420        return ret
1421
1422    def _buildDestination(self, title, array):
1423        page, typ = array[0:2]
1424        array = array[2:]
1425        return Destination(title, page, typ, *array)
1426
1427    def _buildOutline(self, node):
1428        dest, title, outline = None, None, None
1429
1430        if "/A" in node and "/Title" in node:
1431            # Action, section 8.5 (only type GoTo supported)
1432            title  = node["/Title"]
1433            action = node["/A"]
1434            if action["/S"] == "/GoTo":
1435                dest = action["/D"]
1436        elif "/Dest" in node and "/Title" in node:
1437            # Destination, section 8.2.1
1438            title = node["/Title"]
1439            dest  = node["/Dest"]
1440
1441        # if destination found, then create outline
1442        if dest:
1443            if isinstance(dest, ArrayObject):
1444                outline = self._buildDestination(title, dest)
1445            elif isString(dest) and dest in self._namedDests:
1446                outline = self._namedDests[dest]
1447                outline[NameObject("/Title")] = title
1448            else:
1449                raise utils.PdfReadError("Unexpected destination %r" % dest)
1450        return outline
1451
    # A new virtual list is created on every access; it is built from the
    # bound getNumPages and getPage methods of the instance.
    pages = property(lambda self: ConvertFunctionsToVirtualList(self.getNumPages, self.getPage),
        None, None)
    """
    Read-only property that emulates a list based upon the
    :meth:`getNumPages()<PdfFileReader.getNumPages>` and
    :meth:`getPage()<PdfFileReader.getPage>` methods.
    """
1459
1460    def getPageLayout(self):
1461        """
1462        Get the page layout.
1463        See :meth:`setPageLayout()<PdfFileWriter.setPageLayout>`
1464        for a description of valid layouts.
1465
1466        :return: Page layout currently being used.
1467        :rtype: ``str``, ``None`` if not specified
1468        """
1469        try:
1470            return self.trailer['/Root']['/PageLayout']
1471        except KeyError:
1472            return None
1473
    # Getter only: no setter is passed, so assigning to the property raises
    # AttributeError.
    pageLayout = property(getPageLayout)
    """Read-only property accessing the
    :meth:`getPageLayout()<PdfFileReader.getPageLayout>` method."""
1477
1478    def getPageMode(self):
1479        """
1480        Get the page mode.
1481        See :meth:`setPageMode()<PdfFileWriter.setPageMode>`
1482        for a description of valid modes.
1483
1484        :return: Page mode currently being used.
1485        :rtype: ``str``, ``None`` if not specified
1486        """
1487        try:
1488            return self.trailer['/Root']['/PageMode']
1489        except KeyError:
1490            return None
1491
    # Getter only: no setter is passed, so assigning to the property raises
    # AttributeError.
    pageMode = property(getPageMode)
    """Read-only property accessing the
    :meth:`getPageMode()<PdfFileReader.getPageMode>` method."""
1495
1496    def _flatten(self, pages=None, inherit=None, indirectRef=None):
1497        inheritablePageAttributes = (
1498            NameObject("/Resources"), NameObject("/MediaBox"),
1499            NameObject("/CropBox"), NameObject("/Rotate")
1500            )
1501        if inherit == None:
1502            inherit = dict()
1503        if pages == None:
1504            self.flattenedPages = []
1505            catalog = self.trailer["/Root"].getObject()
1506            pages = catalog["/Pages"].getObject()
1507
1508        t = "/Pages"
1509        if "/Type" in pages:
1510            t = pages["/Type"]
1511
1512        if t == "/Pages":
1513            for attr in inheritablePageAttributes:
1514                if attr in pages:
1515                    inherit[attr] = pages[attr]
1516            for page in pages["/Kids"]:
1517                addt = {}
1518                if isinstance(page, IndirectObject):
1519                    addt["indirectRef"] = page
1520                self._flatten(page.getObject(), inherit, **addt)
1521        elif t == "/Page":
1522            for attr, value in list(inherit.items()):
1523                # if the page has it's own value, it does not inherit the
1524                # parent's value:
1525                if attr not in pages:
1526                    pages[attr] = value
1527            pageObj = PageObject(self, indirectRef)
1528            pageObj.update(pages)
1529            self.flattenedPages.append(pageObj)
1530
1531    def _getObjectFromStream(self, indirectReference):
1532        # indirect reference to object in object stream
1533        # read the entire object stream into memory
1534        debug = False
1535        stmnum, idx = self.xref_objStm[indirectReference.idnum]
1536        if debug: print(("Here1: %s %s"%(stmnum, idx)))
1537        objStm = IndirectObject(stmnum, 0, self).getObject()
1538        if debug: print(("Here2: objStm=%s.. stmnum=%s data=%s"%(objStm, stmnum, objStm.getData())))
1539        # This is an xref to a stream, so its type better be a stream
1540        assert objStm['/Type'] == '/ObjStm'
1541        # /N is the number of indirect objects in the stream
1542        assert idx < objStm['/N']
1543        streamData = BytesIO(b_(objStm.getData()))
1544        for i in range(objStm['/N']):
1545            readNonWhitespace(streamData)
1546            streamData.seek(-1, 1)
1547            objnum = NumberObject.readFromStream(streamData)
1548            readNonWhitespace(streamData)
1549            streamData.seek(-1, 1)
1550            offset = NumberObject.readFromStream(streamData)
1551            readNonWhitespace(streamData)
1552            streamData.seek(-1, 1)
1553            if objnum != indirectReference.idnum:
1554                # We're only interested in one object
1555                continue
1556            if self.strict and idx != i:
1557                raise utils.PdfReadError("Object is in wrong index.")
1558            streamData.seek(objStm['/First']+offset, 0)
1559            if debug:
1560                pos = streamData.tell()
1561                streamData.seek(0, 0)
1562                lines = streamData.readlines()
1563                for i in range(0, len(lines)):
1564                    print((lines[i]))
1565                streamData.seek(pos, 0)
1566            try:
1567                obj = readObject(streamData, self)
1568            except utils.PdfStreamError as e:
1569                # Stream object cannot be read. Normally, a critical error, but
1570                # Adobe Reader doesn't complain, so continue (in strict mode?)
1571                e = sys.exc_info()[1]
1572                warnings.warn("Invalid stream (index %d) within object %d %d: %s" % \
1573                      (i, indirectReference.idnum, indirectReference.generation, e), utils.PdfReadWarning)
1574
1575                if self.strict:
1576                    raise utils.PdfReadError("Can't read object stream: %s"%e)
1577                # Replace with null. Hopefully it's nothing important.
1578                obj = NullObject()
1579            return obj
1580
1581        if self.strict: raise utils.PdfReadError("This is a fatal error in strict mode.")
1582        return NullObject()
1583
1584    def getObject(self, indirectReference):
1585        debug = False
1586        if debug: print(("looking at:", indirectReference.idnum, indirectReference.generation))
1587        retval = self.cacheGetIndirectObject(indirectReference.generation,
1588                                                indirectReference.idnum)
1589        if retval != None:
1590            return retval
1591        if indirectReference.generation == 0 and \
1592                        indirectReference.idnum in self.xref_objStm:
1593            retval = self._getObjectFromStream(indirectReference)
1594        elif indirectReference.generation in self.xref and \
1595                indirectReference.idnum in self.xref[indirectReference.generation]:
1596            start = self.xref[indirectReference.generation][indirectReference.idnum]
1597            if debug: print(("  Uncompressed Object", indirectReference.idnum, indirectReference.generation, ":", start))
1598            self.stream.seek(start, 0)
1599            idnum, generation = self.readObjectHeader(self.stream)
1600            if idnum != indirectReference.idnum and self.xrefIndex:
1601                # Xref table probably had bad indexes due to not being zero-indexed
1602                if self.strict:
1603                    raise utils.PdfReadError("Expected object ID (%d %d) does not match actual (%d %d); xref table not zero-indexed." \
1604                                     % (indirectReference.idnum, indirectReference.generation, idnum, generation))
1605                else: pass # xref table is corrected in non-strict mode
1606            elif idnum != indirectReference.idnum:
1607                # some other problem
1608                raise utils.PdfReadError("Expected object ID (%d %d) does not match actual (%d %d)." \
1609                                         % (indirectReference.idnum, indirectReference.generation, idnum, generation))
1610            assert generation == indirectReference.generation
1611            retval = readObject(self.stream, self)
1612
1613            # override encryption is used for the /Encrypt dictionary
1614            if not self._override_encryption and self.isEncrypted:
1615                # if we don't have the encryption key:
1616                if not hasattr(self, '_decryption_key'):
1617                    raise utils.PdfReadError("file has not been decrypted")
1618                # otherwise, decrypt here...
1619                import struct
1620                pack1 = struct.pack("<i", indirectReference.idnum)[:3]
1621                pack2 = struct.pack("<i", indirectReference.generation)[:2]
1622                key = self._decryption_key + pack1 + pack2
1623                assert len(key) == (len(self._decryption_key) + 5)
1624                md5_hash = md5(key).digest()
1625                key = md5_hash[:min(16, len(self._decryption_key) + 5)]
1626                retval = self._decryptObject(retval, key)
1627        else:
1628            warnings.warn("Object %d %d not defined."%(indirectReference.idnum,
1629                        indirectReference.generation), utils.PdfReadWarning)
1630            #if self.strict:
1631            raise utils.PdfReadError("Could not find object.")
1632        self.cacheIndirectObject(indirectReference.generation,
1633                    indirectReference.idnum, retval)
1634        return retval
1635
1636    def _decryptObject(self, obj, key):
1637        if isinstance(obj, ByteStringObject) or isinstance(obj, TextStringObject):
1638            obj = createStringObject(utils.RC4_encrypt(key, obj.original_bytes))
1639        elif isinstance(obj, StreamObject):
1640            obj._data = utils.RC4_encrypt(key, obj._data)
1641        elif isinstance(obj, DictionaryObject):
1642            for dictkey, value in list(obj.items()):
1643                obj[dictkey] = self._decryptObject(value, key)
1644        elif isinstance(obj, ArrayObject):
1645            for i in range(len(obj)):
1646                obj[i] = self._decryptObject(obj[i], key)
1647        return obj
1648
1649    def readObjectHeader(self, stream):
1650        # Should never be necessary to read out whitespace, since the
1651        # cross-reference table should put us in the right spot to read the
1652        # object header.  In reality... some files have stupid cross reference
1653        # tables that are off by whitespace bytes.
1654        extra = False
1655        utils.skipOverComment(stream)
1656        extra |= utils.skipOverWhitespace(stream); stream.seek(-1, 1)
1657        idnum = readUntilWhitespace(stream)
1658        extra |= utils.skipOverWhitespace(stream); stream.seek(-1, 1)
1659        generation = readUntilWhitespace(stream)
1660        obj = stream.read(3)
1661        readNonWhitespace(stream)
1662        stream.seek(-1, 1)
1663        if (extra and self.strict):
1664            #not a fatal error
1665            warnings.warn("Superfluous whitespace found in object header %s %s" % \
1666                          (idnum, generation), utils.PdfReadWarning)
1667        return int(idnum), int(generation)
1668
1669    def cacheGetIndirectObject(self, generation, idnum):
1670        debug = False
1671        out = self.resolvedObjects.get((generation, idnum))
1672        if debug and out: print(("cache hit: %d %d"%(idnum, generation)))
1673        elif debug: print(("cache miss: %d %d"%(idnum, generation)))
1674        return out
1675
1676    def cacheIndirectObject(self, generation, idnum, obj):
1677        # return None # Sometimes we want to turn off cache for debugging.
1678        if (generation, idnum) in self.resolvedObjects:
1679            msg = "Overwriting cache for %s %s"%(generation, idnum)
1680            if self.strict: raise utils.PdfReadError(msg)
1681            else:           warnings.warn(msg)
1682        self.resolvedObjects[(generation, idnum)] = obj
1683        return obj
1684
    def read(self, stream):
        """
        Parse the cross-reference data and trailer(s) of a PDF file.

        The stream is scanned backwards from its end for the ``%%EOF`` marker
        and the ``startxref`` offset.  Starting at that offset, every
        cross-reference section is read (classic ``xref`` tables as well as
        cross-reference streams) by following the ``/Prev`` links.  The
        results are stored in ``self.xref`` (generation -> object number ->
        byte offset), ``self.xref_objStm`` (object number -> (object stream
        number, index)) and ``self.trailer``.

        :param stream: Seekable binary stream holding the PDF file.
        :raises utils.PdfReadError: if the end-of-file marker, the
            ``startxref`` entry or the cross-reference data cannot be found
            or parsed.
        """
        debug = False
        if debug: print(">>read", stream)
        # start at the end:
        stream.seek(-1, 2)
        if not stream.tell():
            raise utils.PdfReadError('Cannot read an empty file')
        last1K = stream.tell() - 1024 + 1 # offset of last 1024 bytes of stream
        line = b_('')
        # Read lines backwards until one starts with %%EOF; give up as soon
        # as the position drops below the last-1024-bytes offset.
        while line[:5] != b_("%%EOF"):
            if stream.tell() < last1K:
                raise utils.PdfReadError("EOF marker not found")
            line = self.readNextEndLine(stream)
            if debug: print("  line:",line)

        # find startxref entry - the location of the xref table
        line = self.readNextEndLine(stream)
        try:
            startxref = int(line)
        except ValueError:
            # 'startxref' may be on the same line as the location
            if not line.startswith(b_("startxref")):
                raise utils.PdfReadError("startxref not found")
            startxref = int(line[9:].strip())
            warnings.warn("startxref on same line as offset")
        else:
            # the offset was on its own line, so the line before it has to
            # be the 'startxref' keyword
            line = self.readNextEndLine(stream)
            if line[:9] != b_("startxref"):
                raise utils.PdfReadError("startxref not found")

        # read all cross reference tables and their trailers
        self.xref = {}
        self.xref_objStm = {}
        self.trailer = DictionaryObject()
        while True:
            # load the xref table
            stream.seek(startxref, 0)
            # The first byte decides the format: 'x' starts a classic table,
            # a digit starts the object header of a cross-reference stream.
            x = stream.read(1)
            if x == b_("x"):
                # standard cross-reference table
                ref = stream.read(4)
                if ref[:3] != b_("ref"):
                    raise utils.PdfReadError("xref table read error")
                readNonWhitespace(stream)
                stream.seek(-1, 1)
                firsttime = True; # check if the first time looking at the xref table
                # one iteration per subsection: "<first object number> <count>"
                # followed by <count> entries
                while True:
                    num = readObject(stream, self)
                    if firsttime and num != 0:
                         self.xrefIndex = num
                         if self.strict:
                            warnings.warn("Xref table not zero-indexed. ID numbers for objects will be corrected.", utils.PdfReadWarning)
                            #if table not zero indexed, could be due to error from when PDF was created
                            #which will lead to mismatched indices later on, only warned and corrected if self.strict=True
                    firsttime = False
                    readNonWhitespace(stream)
                    stream.seek(-1, 1)
                    size = readObject(stream, self)
                    readNonWhitespace(stream)
                    stream.seek(-1, 1)
                    cnt = 0
                    while cnt < size:
                        line = stream.read(20)

                        # It's very clear in section 3.4.3 of the PDF spec
                        # that all cross-reference table lines are a fixed
                        # 20 bytes (as of PDF 1.7). However, some files have
                        # 21-byte entries (or more) due to the use of \r\n
                        # (CRLF) EOL's. Detect that case, and adjust the line
                        # until it does not begin with a \r (CR) or \n (LF).
                        while line[0] in b_("\x0D\x0A"):
                            stream.seek(-20 + 1, 1)
                            line = stream.read(20)

                        # On the other hand, some malformed PDF files
                        # use a single character EOL without a preceding
                        # space.  Detect that case, and seek the stream
                        # back one character.  (0-9 means we've bled into
                        # the next xref entry, t means we've bled into the
                        # text "trailer"):
                        if line[-1] in b_("0123456789t"):
                            stream.seek(-1, 1)

                        # the first 16 bytes hold "<offset> <generation>"
                        offset, generation = line[:16].split(b_(" "))
                        offset, generation = int(offset), int(generation)
                        if generation not in self.xref:
                            self.xref[generation] = {}
                        if num in self.xref[generation]:
                            # It really seems like we should allow the last
                            # xref table in the file to override previous
                            # ones. Since we read the file backwards, assume
                            # any existing key is already set correctly.
                            pass
                        else:
                            self.xref[generation][num] = offset
                        cnt += 1
                        num += 1
                    readNonWhitespace(stream)
                    stream.seek(-1, 1)
                    trailertag = stream.read(7)
                    if trailertag != b_("trailer"):
                        # more xrefs!
                        stream.seek(-7, 1)
                    else:
                        break
                readNonWhitespace(stream)
                stream.seek(-1, 1)
                newTrailer = readObject(stream, self)
                # keys already present (taken from sections read earlier in
                # this loop) are not overwritten
                for key, value in list(newTrailer.items()):
                    if key not in self.trailer:
                        self.trailer[key] = value
                if "/Prev" in newTrailer:
                    startxref = newTrailer["/Prev"]
                else:
                    break
            elif x.isdigit():
                # PDF 1.5+ Cross-Reference Stream
                stream.seek(-1, 1)
                idnum, generation = self.readObjectHeader(stream)
                xrefstream = readObject(stream, self)
                assert xrefstream["/Type"] == "/XRef"
                self.cacheIndirectObject(generation, idnum, xrefstream)
                streamData = BytesIO(b_(xrefstream.getData()))
                # Index pairs specify the subsections in the dictionary. If
                # none create one subsection that spans everything.
                idx_pairs = xrefstream.get("/Index", [0, xrefstream.get("/Size")])
                if debug: print(("read idx_pairs=%s"%list(self._pairs(idx_pairs))))
                entrySizes = xrefstream.get("/W")
                assert len(entrySizes) >= 3
                if self.strict and len(entrySizes) > 3:
                    raise utils.PdfReadError("Too many entry sizes: %s" %entrySizes)

                def getEntry(i):
                    # Reads the correct number of bytes for each entry. See the
                    # discussion of the W parameter in PDF spec table 17.
                    if entrySizes[i] > 0:
                        d = streamData.read(entrySizes[i])
                        return convertToInt(d, entrySizes[i])

                    # PDF Spec Table 17: A value of zero for an element in the
                    # W array indicates...the default value shall be used
                    if i == 0:  return 1 # First value defaults to 1
                    else:       return 0

                def used_before(num, generation):
                    # We move backwards through the xrefs, don't replace any.
                    return num in self.xref.get(generation, []) or \
                            num in self.xref_objStm

                # Iterate through each subsection
                last_end = 0
                for start, size in self._pairs(idx_pairs):
                    # The subsections must increase
                    assert start >= last_end
                    last_end = start + size
                    for num in range(start, start+size):
                        # The first entry is the type
                        xref_type = getEntry(0)
                        # The rest of the elements depend on the xref_type
                        if xref_type == 0:
                            # linked list of free objects
                            # (both fields are read to advance the stream;
                            # the values are not used)
                            next_free_object = getEntry(1)
                            next_generation = getEntry(2)
                        elif xref_type == 1:
                            # objects that are in use but are not compressed
                            byte_offset = getEntry(1)
                            generation = getEntry(2)
                            if generation not in self.xref:
                                self.xref[generation] = {}
                            if not used_before(num, generation):
                                self.xref[generation][num] = byte_offset
                                if debug: print(("XREF Uncompressed: %s %s"%(
                                                num, generation)))
                        elif xref_type == 2:
                            # compressed objects
                            objstr_num = getEntry(1)
                            obstr_idx = getEntry(2)
                            generation = 0 # PDF spec table 18, generation is 0
                            if not used_before(num, generation):
                                if debug: print(("XREF Compressed: %s %s %s"%(
                                        num, objstr_num, obstr_idx)))
                                self.xref_objStm[num] = (objstr_num, obstr_idx)
                        elif self.strict:
                            raise utils.PdfReadError("Unknown xref type: %s"%
                                                        xref_type)

                # The stream dictionary doubles as the trailer: copy the
                # listed keys unless an earlier section already supplied them.
                trailerKeys = "/Root", "/Encrypt", "/Info", "/ID"
                for key in trailerKeys:
                    if key in xrefstream and key not in self.trailer:
                        self.trailer[NameObject(key)] = xrefstream.raw_get(key)
                if "/Prev" in xrefstream:
                    startxref = xrefstream["/Prev"]
                else:
                    break
            else:
                # bad xref character at startxref.  Let's see if we can find
                # the xref table nearby, as we've observed this error with an
                # off-by-one before.
                stream.seek(-11, 1)
                tmp = stream.read(20)
                xref_loc = tmp.find(b_("xref"))
                if xref_loc != -1:
                    # adjust startxref to where "xref" was found and retry
                    startxref -= (10 - xref_loc)
                    continue
                # No explicit xref table, try finding a cross-reference stream.
                stream.seek(startxref, 0)
                found = False
                # look for a digit within the next 5 bytes
                for look in range(5):
                    if stream.read(1).isdigit():
                        # This is not a standard PDF, consider adding a warning
                        startxref += look
                        found = True
                        break
                if found:
                    continue
                # no xref table found at specified location
                raise utils.PdfReadError("Could not find xref table at specified location")
        #if not zero-indexed, verify that the table is correct; change it if necessary
        if self.xrefIndex and not self.strict:
            loc = stream.tell()
            for gen in self.xref:
                if gen == 65535: continue
                for id in self.xref[gen]:
                    stream.seek(self.xref[gen][id], 0)
                    try:
                        pid, pgen = self.readObjectHeader(stream)
                    except ValueError:
                        break
                    # the header found at the recorded offset carries the
                    # table's number minus xrefIndex: rebase this generation
                    if pid == id - self.xrefIndex:
                        self._zeroXref(gen)
                        break
                    #if not, then either it's just plain wrong, or the non-zero-index is actually correct
            stream.seek(loc, 0) #return to where it was
1918
1919    def _zeroXref(self, generation):
1920        self.xref[generation] = dict( (k-self.xrefIndex, v) for (k, v) in list(self.xref[generation].items()) )
1921
1922    def _pairs(self, array):
1923        i = 0
1924        while True:
1925            yield array[i], array[i+1]
1926            i += 2
1927            if (i+1) >= len(array):
1928                break
1929
    def readNextEndLine(self, stream):
        """
        Read one line backwards, starting at the current stream position.

        Bytes are read one at a time while moving towards the start of the
        stream and are prepended to the result until a CR or LF is met.  The
        run of CR/LF bytes in front of the line is then skipped.

        :param stream: Seekable binary stream.
        :return: The bytes of the line, without end-of-line characters.
        :raises utils.PdfReadError: if the start of the stream is reached
            before an end-of-line marker is found.
        """
        debug = False
        if debug: print(">>readNextEndLine")
        line = b_("")
        while True:
            # Prevent infinite loops in malformed PDFs
            if stream.tell() == 0:
                raise utils.PdfReadError("Could not read malformed PDF file")
            x = stream.read(1)
            if debug: print(("  x:", x, "%x"%ord(x)))
            if stream.tell() < 2:
                raise utils.PdfReadError("EOL marker not found")
            # read(1) moved forward by one byte; going back two leaves the
            # position one byte before the byte just read
            stream.seek(-2, 1)
            if x == b_('\n') or x == b_('\r'): ## \n = LF; \r = CR
                crlf = False
                # keep stepping backwards over consecutive EOL characters
                while x == b_('\n') or x == b_('\r'):
                    if debug:
                        if ord(x) == 0x0D: print("  x is CR 0D")
                        elif ord(x) == 0x0A: print("  x is LF 0A")
                    x = stream.read(1)
                    if x == b_('\n') or x == b_('\r'): # account for CR+LF
                        stream.seek(-1, 1)
                        crlf = True
                    if stream.tell() < 2:
                        raise utils.PdfReadError("EOL marker not found")
                    stream.seek(-2, 1)
                # NOTE: this is a relative seek *forward* (whence=1, positive
                # offset): 2 bytes if a second EOL character was seen, else 1
                stream.seek(2 if crlf else 1, 1)
                break
            else:
                if debug: print("  x is neither")
                # reading backwards, so each new byte goes in front
                line = x + line
                if debug: print(("  RNEL line:", line))
        if debug: print("leaving RNEL")
        return line
1964
1965    def decrypt(self, password):
1966        """
1967        When using an encrypted / secured PDF file with the PDF Standard
1968        encryption handler, this function will allow the file to be decrypted.
1969        It checks the given password against the document's user password and
1970        owner password, and then stores the resulting decryption key if either
1971        password is correct.
1972
1973        It does not matter which password was matched.  Both passwords provide
1974        the correct decryption key that will allow the document to be used with
1975        this library.
1976
1977        :param str password: The password to match.
1978        :return: ``0`` if the password failed, ``1`` if the password matched the user
1979            password, and ``2`` if the password matched the owner password.
1980        :rtype: int
1981        :raises NotImplementedError: if document uses an unsupported encryption
1982            method.
1983        """
1984
1985        self._override_encryption = True
1986        try:
1987            return self._decrypt(password)
1988        finally:
1989            self._override_encryption = False
1990
1991    def _decrypt(self, password):
1992        encrypt = self.trailer['/Encrypt'].getObject()
1993        if encrypt['/Filter'] != '/Standard':
1994            raise NotImplementedError("only Standard PDF encryption handler is available")
1995        if not (encrypt['/V'] in (1, 2)):
1996            raise NotImplementedError("only algorithm code 1 and 2 are supported")
1997        user_password, key = self._authenticateUserPassword(password)
1998        if user_password:
1999            self._decryption_key = key
2000            return 1
2001        else:
2002            rev = encrypt['/R'].getObject()
2003            if rev == 2:
2004                keylen = 5
2005            else:
2006                keylen = encrypt['/Length'].getObject() // 8
2007            key = _alg33_1(password, rev, keylen)
2008            real_O = encrypt["/O"].getObject()
2009            if rev == 2:
2010                userpass = utils.RC4_encrypt(key, real_O)
2011            else:
2012                val = real_O
2013                for i in range(19, -1, -1):
2014                    new_key = b_('')
2015                    for l in range(len(key)):
2016                        new_key += b_(chr(utils.ord_(key[l]) ^ i))
2017                    val = utils.RC4_encrypt(new_key, val)
2018                userpass = val
2019            owner_password, key = self._authenticateUserPassword(userpass)
2020            if owner_password:
2021                self._decryption_key = key
2022                return 2
2023        return 0
2024
2025    def _authenticateUserPassword(self, password):
2026        encrypt = self.trailer['/Encrypt'].getObject()
2027        rev = encrypt['/R'].getObject()
2028        owner_entry = encrypt['/O'].getObject()
2029        p_entry = encrypt['/P'].getObject()
2030        id_entry = self.trailer['/ID'].getObject()
2031        id1_entry = id_entry[0].getObject()
2032        real_U = encrypt['/U'].getObject().original_bytes
2033        if rev == 2:
2034            U, key = _alg34(password, owner_entry, p_entry, id1_entry)
2035        elif rev >= 3:
2036            U, key = _alg35(password, rev,
2037                    encrypt["/Length"].getObject() // 8, owner_entry,
2038                    p_entry, id1_entry,
2039                    encrypt.get("/EncryptMetadata", BooleanObject(False)).getObject())
2040            U, real_U = U[:16], real_U[:16]
2041        return U == real_U, key
2042
2043    def getIsEncrypted(self):
2044        return "/Encrypt" in self.trailer
2045
    # Getter-only property.  The lambda looks ``getIsEncrypted`` up on the
    # instance each time the attribute is read.
    isEncrypted = property(lambda self: self.getIsEncrypted(), None, None)
    """
    Read-only boolean property showing whether this PDF file is encrypted.
    Note that this property, if true, will remain true even after the
    :meth:`decrypt()<PdfFileReader.decrypt>` method is called.
    """
2052
2053
def getRectangle(self, name, defaults):
    """
    Fetch a box entry of a page-like dictionary as a ``RectangleObject``.

    When ``name`` is missing, the keys in ``defaults`` are tried in order and
    the first one present is used.  An indirect reference is resolved through
    ``self.pdf``.  The resulting rectangle is stored back under ``name`` so
    later calls return it directly.

    :param self: Dictionary-like object exposing ``get`` and a ``pdf``
        attribute.
    :param name: Key of the rectangle to fetch.
    :param defaults: Iterable of fallback keys.
    :return: The rectangle stored under ``name``.
    """
    retval = self.get(name)
    if isinstance(retval, RectangleObject):
        return retval
    if retval is None:
        for d in defaults:
            retval = self.get(d)
            if retval is not None:
                break
    if isinstance(retval, IndirectObject):
        retval = self.pdf.getObject(retval)
    retval = RectangleObject(retval)
    # Cache the converted value under the requested name.
    setRectangle(self, name, retval)
    return retval
2068
2069
def setRectangle(self, name, value):
    """
    Store ``value`` under ``name``, converting ``name`` to a ``NameObject``
    first unless it already is one.

    :param self: Dictionary-like object to update.
    :param name: Key to store the value under.
    :param value: Value to store.
    """
    key = name if isinstance(name, NameObject) else NameObject(name)
    self[key] = value
2074
2075
def deleteRectangle(self, name):
    """
    Remove the entry stored under ``name``.

    :param self: Dictionary-like object to update.
    :param name: Key to remove.
    """
    del self[name]
2078
2079
def createRectangleAccessor(name, fallback):
    """
    Build a property that reads, writes and deletes the rectangle ``name``.

    :param name: Key of the rectangle the property manages.
    :param fallback: Fallback keys handed to :func:`getRectangle` on read.
    :return: A ``property`` delegating to :func:`getRectangle`,
        :func:`setRectangle` and :func:`deleteRectangle`.
    """
    def _read(self):
        return getRectangle(self, name, fallback)

    def _write(self, value):
        return setRectangle(self, name, value)

    def _remove(self):
        return deleteRectangle(self, name)

    return property(_read, _write, _remove)
2087
2088
2089class PageObject(DictionaryObject):
2090    """
2091    This class represents a single page within a PDF file.  Typically this
2092    object will be created by accessing the
2093    :meth:`getPage()<PyPDF2.PdfFileReader.getPage>` method of the
2094    :class:`PdfFileReader<PyPDF2.PdfFileReader>` class, but it is
2095    also possible to create an empty page with the
2096    :meth:`createBlankPage()<PageObject.createBlankPage>` static method.
2097
2098    :param pdf: PDF file the page belongs to.
2099    :param indirectRef: Stores the original indirect reference to
2100        this object in its source PDF
2101    """
2102    def __init__(self, pdf=None, indirectRef=None):
2103        DictionaryObject.__init__(self)
2104        self.pdf = pdf
2105        self.indirectRef = indirectRef
2106
2107    def createBlankPage(pdf=None, width=None, height=None):
2108        """
2109        Returns a new blank page.
2110        If ``width`` or ``height`` is ``None``, try to get the page size
2111        from the last page of *pdf*.
2112
2113        :param pdf: PDF file the page belongs to
2114        :param float width: The width of the new page expressed in default user
2115            space units.
2116        :param float height: The height of the new page expressed in default user
2117            space units.
2118        :return: the new blank page:
2119        :rtype: :class:`PageObject<PageObject>`
2120        :raises PageSizeNotDefinedError: if ``pdf`` is ``None`` or contains
2121            no page
2122        """
2123        page = PageObject(pdf)
2124
2125        # Creates a new page (cf PDF Reference  7.7.3.3)
2126        page.__setitem__(NameObject('/Type'), NameObject('/Page'))
2127        page.__setitem__(NameObject('/Parent'), NullObject())
2128        page.__setitem__(NameObject('/Resources'), DictionaryObject())
2129        if width is None or height is None:
2130            if pdf is not None and pdf.getNumPages() > 0:
2131                lastpage = pdf.getPage(pdf.getNumPages() - 1)
2132                width = lastpage.mediaBox.getWidth()
2133                height = lastpage.mediaBox.getHeight()
2134            else:
2135                raise utils.PageSizeNotDefinedError()
2136        page.__setitem__(NameObject('/MediaBox'),
2137            RectangleObject([0, 0, width, height]))
2138
2139        return page
2140    createBlankPage = staticmethod(createBlankPage)
2141
2142    def rotateClockwise(self, angle):
2143        """
2144        Rotates a page clockwise by increments of 90 degrees.
2145
2146        :param int angle: Angle to rotate the page.  Must be an increment
2147            of 90 deg.
2148        """
2149        assert angle % 90 == 0
2150        self._rotate(angle)
2151        return self
2152
2153    def rotateCounterClockwise(self, angle):
2154        """
2155        Rotates a page counter-clockwise by increments of 90 degrees.
2156
2157        :param int angle: Angle to rotate the page.  Must be an increment
2158            of 90 deg.
2159        """
2160        assert angle % 90 == 0
2161        self._rotate(-angle)
2162        return self
2163
2164    def _rotate(self, angle):
2165        currentAngle = self.get("/Rotate", 0)
2166        self[NameObject("/Rotate")] = NumberObject(currentAngle + angle)
2167
2168    def _mergeResources(res1, res2, resource):
2169        newRes = DictionaryObject()
2170        newRes.update(res1.get(resource, DictionaryObject()).getObject())
2171        page2Res = res2.get(resource, DictionaryObject()).getObject()
2172        renameRes = {}
2173        for key in list(page2Res.keys()):
2174            if key in newRes and newRes.raw_get(key) != page2Res.raw_get(key):
2175                newname = NameObject(key + str(uuid.uuid4()))
2176                renameRes[key] = newname
2177                newRes[newname] = page2Res[key]
2178            elif key not in newRes:
2179                newRes[key] = page2Res.raw_get(key)
2180        return newRes, renameRes
2181    _mergeResources = staticmethod(_mergeResources)
2182
2183    def _contentStreamRename(stream, rename, pdf):
2184        if not rename:
2185            return stream
2186        stream = ContentStream(stream, pdf)
2187        for operands, operator in stream.operations:
2188            for i in range(len(operands)):
2189                op = operands[i]
2190                if isinstance(op, NameObject):
2191                    operands[i] = rename.get(op,op)
2192        return stream
2193    _contentStreamRename = staticmethod(_contentStreamRename)
2194
2195    def _pushPopGS(contents, pdf):
2196        # adds a graphics state "push" and "pop" to the beginning and end
2197        # of a content stream.  This isolates it from changes such as
2198        # transformation matricies.
2199        stream = ContentStream(contents, pdf)
2200        stream.operations.insert(0, [[], "q"])
2201        stream.operations.append([[], "Q"])
2202        return stream
2203    _pushPopGS = staticmethod(_pushPopGS)
2204
2205    def _addTransformationMatrix(contents, pdf, ctm):
2206        # adds transformation matrix at the beginning of the given
2207        # contents stream.
2208        a, b, c, d, e, f = ctm
2209        contents = ContentStream(contents, pdf)
2210        contents.operations.insert(0, [[FloatObject(a), FloatObject(b),
2211            FloatObject(c), FloatObject(d), FloatObject(e),
2212            FloatObject(f)], " cm"])
2213        return contents
2214    _addTransformationMatrix = staticmethod(_addTransformationMatrix)
2215
2216    def getContents(self):
2217        """
2218        Accesses the page contents.
2219
2220        :return: the ``/Contents`` object, or ``None`` if it doesn't exist.
2221            ``/Contents`` is optional, as described in PDF Reference  7.7.3.3
2222        """
2223        if "/Contents" in self:
2224            return self["/Contents"].getObject()
2225        else:
2226            return None
2227
    def mergePage(self, page2):
        """
        Merges the content streams of two pages into one.  Resource references
        (i.e. fonts) are maintained from both pages.  The mediabox/cropbox/etc
        of this page are not altered.  The parameter page's content stream will
        be added to the end of this page's content stream, meaning that it will
        be drawn after, or "on top" of this page.

        This is the plain variant of the merge: it delegates to
        ``_mergePage`` with no content transformation, no matrix and no
        page expansion.

        :param PageObject page2: The page to be merged into this one. Should be
            an instance of :class:`PageObject<PageObject>`.
        """
        self._mergePage(page2)
2240
    def _mergePage(self, page2, page2transformation=None, ctm=None, expand=False):
        """
        Shared implementation behind ``mergePage`` and the transformed merge
        variants: merges resources, annotations and content of ``page2`` into
        this page, which is modified in place.

        :param PageObject page2: the page to be merged into this one.
        :param page2transformation: optional callable; when given it is
            called with the content object of ``page2`` and its return value
            is merged in its place.
        :param ctm: optional 6-element transformation matrix.  It is only
            read when ``expand`` is true, to transform the corners of the
            media box of ``page2``.
        :param bool expand: when true, this page's media box is grown so that
            it also covers the (possibly transformed) media box of ``page2``.
        """
        # First we work on merging the resource dictionaries.  This allows us
        # to find out what symbols in the content streams we might need to
        # rename.

        newResources = DictionaryObject()
        rename = {}
        originalResources = self["/Resources"].getObject()
        page2Resources = page2["/Resources"].getObject()
        newAnnots = ArrayObject()

        # Collect the annotation entries of both pages, this page's first.
        # An /Annots value that is not an array is ignored.
        for page in (self, page2):
            if "/Annots" in page:
                annots = page["/Annots"]
                if isinstance(annots, ArrayObject):
                    for ref in annots:
                        newAnnots.append(ref)

        # Merge each resource category; names of page2 that clash with this
        # page's names end up in ``rename``.
        for res in "/ExtGState", "/Font", "/XObject", "/ColorSpace", "/Pattern", "/Shading", "/Properties":
            new, newrename = PageObject._mergeResources(originalResources, page2Resources, res)
            if new:
                newResources[NameObject(res)] = new
                rename.update(newrename)

        # Combine /ProcSet sets.
        newResources[NameObject("/ProcSet")] = ArrayObject(
            frozenset(originalResources.get("/ProcSet", ArrayObject()).getObject()).union(
                frozenset(page2Resources.get("/ProcSet", ArrayObject()).getObject())
            )
        )

        newContentArray = ArrayObject()

        # This page's content goes first, wrapped in q/Q.
        originalContent = self.getContents()
        if originalContent is not None:
            newContentArray.append(PageObject._pushPopGS(
                  originalContent, self.pdf))

        # page2's content follows: transformed (if requested), renamed to
        # match the merged resources, then wrapped in q/Q.
        page2Content = page2.getContents()
        if page2Content is not None:
            if page2transformation is not None:
                page2Content = page2transformation(page2Content)
            page2Content = PageObject._contentStreamRename(
                page2Content, rename, self.pdf)
            page2Content = PageObject._pushPopGS(page2Content, self.pdf)
            newContentArray.append(page2Content)

        # if expanding the page to fit a new page, calculate the new media box size
        if expand:
            # corners1: lower-left and upper-right corner of this page.
            corners1 = [self.mediaBox.getLowerLeft_x().as_numeric(), self.mediaBox.getLowerLeft_y().as_numeric(),
                        self.mediaBox.getUpperRight_x().as_numeric(), self.mediaBox.getUpperRight_y().as_numeric()]
            # corners2: all four corners of page2 as a flat x, y, x, y, ... list.
            corners2 = [page2.mediaBox.getLowerLeft_x().as_numeric(), page2.mediaBox.getLowerLeft_y().as_numeric(),
                        page2.mediaBox.getUpperLeft_x().as_numeric(), page2.mediaBox.getUpperLeft_y().as_numeric(),
                        page2.mediaBox.getUpperRight_x().as_numeric(), page2.mediaBox.getUpperRight_y().as_numeric(),
                        page2.mediaBox.getLowerRight_x().as_numeric(), page2.mediaBox.getLowerRight_y().as_numeric()]
            if ctm is not None:
                # Apply the matrix to each corner:
                # x' = a*x + c*y + e, y' = b*x + d*y + f.
                ctm = [float(x) for x in ctm]
                new_x = [ctm[0]*corners2[i] + ctm[2]*corners2[i+1] + ctm[4] for i in range(0, 8, 2)]
                new_y = [ctm[1]*corners2[i] + ctm[3]*corners2[i+1] + ctm[5] for i in range(0, 8, 2)]
            else:
                new_x = corners2[0:8:2]
                new_y = corners2[1:8:2]
            # Bounding box of page2's corners, then the union with this page's box.
            lowerleft = [min(new_x), min(new_y)]
            upperright = [max(new_x), max(new_y)]
            lowerleft = [min(corners1[0], lowerleft[0]), min(corners1[1], lowerleft[1])]
            upperright = [max(corners1[2], upperright[0]), max(corners1[3], upperright[1])]

            self.mediaBox.setLowerLeft(lowerleft)
            self.mediaBox.setUpperRight(upperright)

        # Install the merged content, resources and annotations on this page.
        self[NameObject('/Contents')] = ContentStream(newContentArray, self.pdf)
        self[NameObject('/Resources')] = newResources
        self[NameObject('/Annots')] = newAnnots
2314
2315    def mergeTransformedPage(self, page2, ctm, expand=False):
2316        """
2317        This is similar to mergePage, but a transformation matrix is
2318        applied to the merged stream.
2319
2320        :param PageObject page2: The page to be merged into this one. Should be
2321            an instance of :class:`PageObject<PageObject>`.
2322        :param tuple ctm: a 6-element tuple containing the operands of the
2323            transformation matrix
2324        :param bool expand: Whether the page should be expanded to fit the dimensions
2325            of the page to be merged.
2326        """
2327        self._mergePage(page2, lambda page2Content:
2328            PageObject._addTransformationMatrix(page2Content, page2.pdf, ctm), ctm, expand)
2329
2330    def mergeScaledPage(self, page2, scale, expand=False):
2331        """
2332        This is similar to mergePage, but the stream to be merged is scaled
2333        by appling a transformation matrix.
2334
2335        :param PageObject page2: The page to be merged into this one. Should be
2336            an instance of :class:`PageObject<PageObject>`.
2337        :param float scale: The scaling factor
2338        :param bool expand: Whether the page should be expanded to fit the
2339            dimensions of the page to be merged.
2340        """
2341        # CTM to scale : [ sx 0 0 sy 0 0 ]
2342        return self.mergeTransformedPage(page2, [scale, 0,
2343                                                 0,      scale,
2344                                                 0,      0], expand)
2345
2346    def mergeRotatedPage(self, page2, rotation, expand=False):
2347        """
2348        This is similar to mergePage, but the stream to be merged is rotated
2349        by appling a transformation matrix.
2350
2351        :param PageObject page2: the page to be merged into this one. Should be
2352            an instance of :class:`PageObject<PageObject>`.
2353        :param float rotation: The angle of the rotation, in degrees
2354        :param bool expand: Whether the page should be expanded to fit the
2355            dimensions of the page to be merged.
2356        """
2357        rotation = math.radians(rotation)
2358        return self.mergeTransformedPage(page2,
2359            [math.cos(rotation),  math.sin(rotation),
2360             -math.sin(rotation), math.cos(rotation),
2361             0,                   0], expand)
2362
2363    def mergeTranslatedPage(self, page2, tx, ty, expand=False):
2364        """
2365        This is similar to mergePage, but the stream to be merged is translated
2366        by appling a transformation matrix.
2367
2368        :param PageObject page2: the page to be merged into this one. Should be
2369            an instance of :class:`PageObject<PageObject>`.
2370        :param float tx: The translation on X axis
2371        :param float ty: The translation on Y axis
2372        :param bool expand: Whether the page should be expanded to fit the
2373            dimensions of the page to be merged.
2374        """
2375        return self.mergeTransformedPage(page2, [1,  0,
2376                                                 0,  1,
2377                                                 tx, ty], expand)
2378
2379    def mergeRotatedTranslatedPage(self, page2, rotation, tx, ty, expand=False):
2380        """
2381        This is similar to mergePage, but the stream to be merged is rotated
2382        and translated by appling a transformation matrix.
2383
2384        :param PageObject page2: the page to be merged into this one. Should be
2385            an instance of :class:`PageObject<PageObject>`.
2386        :param float tx: The translation on X axis
2387        :param float ty: The translation on Y axis
2388        :param float rotation: The angle of the rotation, in degrees
2389        :param bool expand: Whether the page should be expanded to fit the
2390            dimensions of the page to be merged.
2391        """
2392
2393        translation = [[1, 0, 0],
2394                       [0, 1, 0],
2395                       [-tx, -ty, 1]]
2396        rotation = math.radians(rotation)
2397        rotating = [[math.cos(rotation), math.sin(rotation), 0],
2398                    [-math.sin(rotation), math.cos(rotation), 0],
2399                    [0,                  0,                  1]]
2400        rtranslation = [[1, 0, 0],
2401                       [0, 1, 0],
2402                       [tx, ty, 1]]
2403        ctm = utils.matrixMultiply(translation, rotating)
2404        ctm = utils.matrixMultiply(ctm, rtranslation)
2405
2406        return self.mergeTransformedPage(page2, [ctm[0][0], ctm[0][1],
2407                                                 ctm[1][0], ctm[1][1],
2408                                                 ctm[2][0], ctm[2][1]], expand)
2409
2410    def mergeRotatedScaledPage(self, page2, rotation, scale, expand=False):
2411        """
2412        This is similar to mergePage, but the stream to be merged is rotated
2413        and scaled by appling a transformation matrix.
2414
2415        :param PageObject page2: the page to be merged into this one. Should be
2416            an instance of :class:`PageObject<PageObject>`.
2417        :param float rotation: The angle of the rotation, in degrees
2418        :param float scale: The scaling factor
2419        :param bool expand: Whether the page should be expanded to fit the
2420            dimensions of the page to be merged.
2421        """
2422        rotation = math.radians(rotation)
2423        rotating = [[math.cos(rotation), math.sin(rotation), 0],
2424                    [-math.sin(rotation), math.cos(rotation), 0],
2425                    [0,                  0,                  1]]
2426        scaling = [[scale, 0,    0],
2427                   [0,    scale, 0],
2428                   [0,    0,    1]]
2429        ctm = utils.matrixMultiply(rotating, scaling)
2430
2431        return self.mergeTransformedPage(page2,
2432                                         [ctm[0][0], ctm[0][1],
2433                                          ctm[1][0], ctm[1][1],
2434                                          ctm[2][0], ctm[2][1]], expand)
2435
2436    def mergeScaledTranslatedPage(self, page2, scale, tx, ty, expand=False):
2437        """
2438        This is similar to mergePage, but the stream to be merged is translated
2439        and scaled by appling a transformation matrix.
2440
2441        :param PageObject page2: the page to be merged into this one. Should be
2442            an instance of :class:`PageObject<PageObject>`.
2443        :param float scale: The scaling factor
2444        :param float tx: The translation on X axis
2445        :param float ty: The translation on Y axis
2446        :param bool expand: Whether the page should be expanded to fit the
2447            dimensions of the page to be merged.
2448        """
2449
2450        translation = [[1, 0, 0],
2451                       [0, 1, 0],
2452                       [tx, ty, 1]]
2453        scaling = [[scale, 0,    0],
2454                   [0,    scale, 0],
2455                   [0,    0,    1]]
2456        ctm = utils.matrixMultiply(scaling, translation)
2457
2458        return self.mergeTransformedPage(page2, [ctm[0][0], ctm[0][1],
2459                                                 ctm[1][0], ctm[1][1],
2460                                                 ctm[2][0], ctm[2][1]], expand)
2461
2462    def mergeRotatedScaledTranslatedPage(self, page2, rotation, scale, tx, ty, expand=False):
2463        """
2464        This is similar to mergePage, but the stream to be merged is translated,
2465        rotated and scaled by appling a transformation matrix.
2466
2467        :param PageObject page2: the page to be merged into this one. Should be
2468            an instance of :class:`PageObject<PageObject>`.
2469        :param float tx: The translation on X axis
2470        :param float ty: The translation on Y axis
2471        :param float rotation: The angle of the rotation, in degrees
2472        :param float scale: The scaling factor
2473        :param bool expand: Whether the page should be expanded to fit the
2474            dimensions of the page to be merged.
2475        """
2476        translation = [[1, 0, 0],
2477                       [0, 1, 0],
2478                       [tx, ty, 1]]
2479        rotation = math.radians(rotation)
2480        rotating = [[math.cos(rotation), math.sin(rotation), 0],
2481                    [-math.sin(rotation), math.cos(rotation), 0],
2482                    [0,                  0,                  1]]
2483        scaling = [[scale, 0,    0],
2484                   [0,    scale, 0],
2485                   [0,    0,    1]]
2486        ctm = utils.matrixMultiply(rotating, scaling)
2487        ctm = utils.matrixMultiply(ctm, translation)
2488
2489        return self.mergeTransformedPage(page2, [ctm[0][0], ctm[0][1],
2490                                                 ctm[1][0], ctm[1][1],
2491                                                 ctm[2][0], ctm[2][1]], expand)
2492
2493    ##
    # Applies a transformation matrix to the page.
2495    #
2496    # @param ctm   A 6 elements tuple containing the operands of the
2497    #              transformation matrix
2498    def addTransformation(self, ctm):
2499        """
2500        Applies a transformation matrix to the page.
2501
2502        :param tuple ctm: A 6-element tuple containing the operands of the
2503            transformation matrix.
2504        """
2505        originalContent = self.getContents()
2506        if originalContent is not None:
2507            newContent = PageObject._addTransformationMatrix(
2508                originalContent, self.pdf, ctm)
2509            newContent = PageObject._pushPopGS(newContent, self.pdf)
2510            self[NameObject('/Contents')] = newContent
2511
2512    def scale(self, sx, sy):
2513        """
2514        Scales a page by the given factors by appling a transformation
2515        matrix to its content and updating the page size.
2516
2517        :param float sx: The scaling factor on horizontal axis.
2518        :param float sy: The scaling factor on vertical axis.
2519        """
2520        self.addTransformation([sx, 0,
2521                                0,  sy,
2522                                0,  0])
2523        self.mediaBox = RectangleObject([
2524            float(self.mediaBox.getLowerLeft_x()) * sx,
2525            float(self.mediaBox.getLowerLeft_y()) * sy,
2526            float(self.mediaBox.getUpperRight_x()) * sx,
2527            float(self.mediaBox.getUpperRight_y()) * sy])
2528        if "/VP" in self:
2529            viewport = self["/VP"]
2530            if isinstance(viewport, ArrayObject):
2531                bbox = viewport[0]["/BBox"]
2532            else:
2533                bbox = viewport["/BBox"]
2534            scaled_bbox = RectangleObject([
2535                float(bbox[0]) * sx,
2536                float(bbox[1]) * sy,
2537                float(bbox[2]) * sx,
2538                float(bbox[3]) * sy])
2539            if isinstance(viewport, ArrayObject):
2540                self[NameObject("/VP")][NumberObject(0)][NameObject("/BBox")] = scaled_bbox
2541            else:
2542                self[NameObject("/VP")][NameObject("/BBox")] = scaled_bbox
2543
    def scaleBy(self, factor):
        """
        Scales a page by the given factor by applying a transformation
        matrix to its content and updating the page size.

        Equivalent to calling :meth:`scale` with the same factor for both
        axes.

        :param float factor: The scaling factor (for both X and Y axis).
        """
        self.scale(factor, factor)
2552
2553    def scaleTo(self, width, height):
2554        """
2555        Scales a page to the specified dimentions by appling a
2556        transformation matrix to its content and updating the page size.
2557
2558        :param float width: The new width.
2559        :param float height: The new heigth.
2560        """
2561        sx = width / float(self.mediaBox.getUpperRight_x() -
2562                      self.mediaBox.getLowerLeft_x ())
2563        sy = height / float(self.mediaBox.getUpperRight_y() -
2564                       self.mediaBox.getLowerLeft_y ())
2565        self.scale(sx, sy)
2566
2567    def compressContentStreams(self):
2568        """
2569        Compresses the size of this page by joining all content streams and
2570        applying a FlateDecode filter.
2571
2572        However, it is possible that this function will perform no action if
2573        content stream compression becomes "automatic" for some reason.
2574        """
2575        content = self.getContents()
2576        if content is not None:
2577            if not isinstance(content, ContentStream):
2578                content = ContentStream(content, self.pdf)
2579            self[NameObject("/Contents")] = content.flateEncode()
2580
2581    def extractText(self):
2582        """
2583        Locate all text drawing commands, in the order they are provided in the
2584        content stream, and extract the text.  This works well for some PDF
2585        files, but poorly for others, depending on the generator used.  This will
2586        be refined in the future.  Do not rely on the order of text coming out of
2587        this function, as it will change if this function is made more
2588        sophisticated.
2589
2590        :return: a unicode string object.
2591        """
2592        text = u_("")
2593        content = self["/Contents"].getObject()
2594        if not isinstance(content, ContentStream):
2595            content = ContentStream(content, self.pdf)
2596        # Note: we check all strings are TextStringObjects.  ByteStringObjects
2597        # are strings where the byte->string encoding was unknown, so adding
2598        # them to the text here would be gibberish.
2599        for operands, operator in content.operations:
2600            if operator == b_("Tj"):
2601                _text = operands[0]
2602                if isinstance(_text, TextStringObject):
2603                    text += _text
2604            elif operator == b_("T*"):
2605                text += "\n"
2606            elif operator == b_("'"):
2607                text += "\n"
2608                _text = operands[0]
2609                if isinstance(_text, TextStringObject):
2610                    text += operands[0]
2611            elif operator == b_('"'):
2612                _text = operands[2]
2613                if isinstance(_text, TextStringObject):
2614                    text += "\n"
2615                    text += _text
2616            elif operator == b_("TJ"):
2617                for i in operands[0]:
2618                    if isinstance(i, TextStringObject):
2619                        text += i
2620                text += "\n"
2621        return text
2622
    # Rectangle accessors for the page boundary entries.
    # NOTE(review): the second argument presumably lists the keys to fall
    # back on when the first key is absent -- confirm against
    # createRectangleAccessor.
    mediaBox = createRectangleAccessor("/MediaBox", ())
    """
    A :class:`RectangleObject<PyPDF2.generic.RectangleObject>`, expressed in default user space units,
    defining the boundaries of the physical medium on which the page is
    intended to be displayed or printed.
    """

    cropBox = createRectangleAccessor("/CropBox", ("/MediaBox",))
    """
    A :class:`RectangleObject<PyPDF2.generic.RectangleObject>`, expressed in default user space units,
    defining the visible region of default user space.  When the page is
    displayed or printed, its contents are to be clipped (cropped) to this
    rectangle and then imposed on the output medium in some
    implementation-defined manner.  Default value: same as :attr:`mediaBox<mediaBox>`.
    """

    bleedBox = createRectangleAccessor("/BleedBox", ("/CropBox", "/MediaBox"))
2640    """
2641    A :class:`RectangleObject<PyPDF2.generic.RectangleObject>`, expressed in default user space units,
2642    defining the region to which the contents of the page should be clipped
    when output in a production environment.
2644    """
2645
    # NOTE(review): the tuple presumably lists the keys to fall back on when
    # "/TrimBox" is absent -- confirm against createRectangleAccessor.
    trimBox = createRectangleAccessor("/TrimBox", ("/CropBox", "/MediaBox"))
    """
    A :class:`RectangleObject<PyPDF2.generic.RectangleObject>`, expressed in default user space units,
    defining the intended dimensions of the finished page after trimming.
    """

    # NOTE(review): the tuple presumably lists the keys to fall back on when
    # "/ArtBox" is absent -- confirm against createRectangleAccessor.
    artBox = createRectangleAccessor("/ArtBox", ("/CropBox", "/MediaBox"))
    """
    A :class:`RectangleObject<PyPDF2.generic.RectangleObject>`, expressed in default user space units,
    defining the extent of the page's meaningful content as intended by the
    page's creator.
    """
2658
2659
class ContentStream(DecodedStreamObject):
    """
    A content stream parsed into a list of operations.

    ``operations`` holds one ``(operands, operator)`` entry per operation, in
    stream order.  An inline image is stored as a single entry whose operator
    is ``b_("INLINE IMAGE")`` and whose operands slot holds a dict with the
    keys ``"settings"`` and ``"data"``.  The ``_data`` property serializes
    ``operations`` back to bytes on every read.
    """

    def __init__(self, stream, pdf):
        """
        :param stream: a stream object, or an array of stream objects that
            are concatenated before parsing.
        :param pdf: stored as ``self.pdf`` and handed to ``readObject`` when
            inline-image settings are parsed.
        """
        self.pdf = pdf
        self.operations = []
        # stream may be a StreamObject or an ArrayObject containing
        # multiple StreamObjects to be cat'd together.
        stream = stream.getObject()
        if isinstance(stream, ArrayObject):
            data = b_("")
            for s in stream:
                data += b_(s.getObject().getData())
                # Streams in an array are split at token boundaries, but a
                # boundary need not carry whitespace.  Keep the last token of
                # one stream from fusing with the first token of the next.
                if data[-1:] != b_("\n"):
                    data += b_("\n")
            stream = BytesIO(b_(data))
        else:
            stream = BytesIO(b_(stream.getData()))
        self.__parseContentStream(stream)

    def __parseContentStream(self, stream):
        """
        Parse ``stream`` from its start and append every operation found to
        ``self.operations``.
        """
        stream.seek(0, 0)
        operands = []
        while True:
            peek = readNonWhitespace(stream)
            if peek == b_('') or ord_(peek) == 0:
                break
            stream.seek(-1, 1)
            if peek.isalpha() or peek == b_("'") or peek == b_('"'):
                operator = utils.readUntilRegex(stream,
                        NameObject.delimiterPattern, True)
                if operator == b_("BI"):
                    # begin inline image - a completely different parsing
                    # mechanism is required, of course... thanks buddy...
                    assert operands == []
                    ii = self._readInlineImage(stream)
                    self.operations.append((ii, b_("INLINE IMAGE")))
                else:
                    self.operations.append((operands, operator))
                    operands = []
            elif peek == b_('%'):
                # If we encounter a comment in the content stream, we have to
                # handle it here.  Typically, readObject will handle
                # encountering a comment -- but readObject assumes that
                # following the comment must be the object we're trying to
                # read.  In this case, it could be an operator instead.
                #
                # The empty read result ends the loop as well: a comment on
                # the last line, without a line ending, would otherwise spin
                # forever at end of stream.
                while peek not in (b_('\r'), b_('\n'), b_('')):
                    peek = stream.read(1)
            else:
                operands.append(readObject(stream, None))

    def _readInlineImage(self, stream):
        """
        Read an inline image; ``stream`` is positioned just after ``BI``.

        :return: dict with the image ``"settings"`` (a dictionary object) and
            the raw image ``"data"`` (bytes).
        :raises utils.PdfReadError: if the stream ends inside the image
            settings or inside the image data.
        """
        # begin reading just after the "BI" - begin image
        # first read the dictionary of settings.
        settings = DictionaryObject()
        while True:
            tok = readNonWhitespace(stream)
            if tok == b_(""):
                # Without this check the seek below steps back onto the
                # previous byte and the loop re-reads it forever.
                raise utils.PdfReadError(
                    "Unexpected end of stream in inline image settings")
            stream.seek(-1, 1)
            if tok == b_("I"):
                # "ID" - begin of image data
                break
            key = readObject(stream, self.pdf)
            tok = readNonWhitespace(stream)
            stream.seek(-1, 1)
            value = readObject(stream, self.pdf)
            settings[key] = value
        # left at beginning of ID
        tmp = stream.read(3)
        assert tmp[:2] == b_("ID")
        data = b_("")
        while True:
            # Read the inline image, while checking for EI (End Image) operator.
            tok = stream.read(1)
            if tok == b_(""):
                raise utils.PdfReadError(
                    "Unexpected end of stream in inline image data")
            if tok == b_("E"):
                # Check for End Image
                tok2 = stream.read(1)
                if tok2 == b_("I"):
                    # Data can contain EI, so check for the Q operator.
                    tok3 = stream.read(1)
                    info = tok + tok2
                    # We need to find whitespace between EI and Q.
                    has_q_whitespace = False
                    while tok3 and tok3 in utils.WHITESPACES:
                        has_q_whitespace = True
                        info += tok3
                        tok3 = stream.read(1)
                    if tok3 == b_(""):
                        # "EI" is the last token of the stream: treat it as
                        # the end of the image.  Nothing was read, so there
                        # is nothing to push back.
                        break
                    if tok3 == b_("Q") and has_q_whitespace:
                        stream.seek(-1, 1)
                        break
                    else:
                        stream.seek(-1, 1)
                        data += info
                elif tok2 == b_(""):
                    # Seeking back here would land on the "E" again and
                    # re-read it without end.
                    raise utils.PdfReadError(
                        "Unexpected end of stream in inline image data")
                else:
                    stream.seek(-1, 1)
                    data += tok
            else:
                data += tok
        return {"settings": settings, "data": data}

    def _getData(self):
        """
        Serialize ``self.operations`` to bytes, one operation per line.
        """
        newdata = BytesIO()
        for operands, operator in self.operations:
            if operator == b_("INLINE IMAGE"):
                newdata.write(b_("BI"))
                dicttext = BytesIO()
                operands["settings"].writeToStream(dicttext, None)
                # Drop the first and last two bytes of the written dictionary
                # (presumably its "<<" and ">>" delimiters -- confirm against
                # DictionaryObject.writeToStream).
                newdata.write(dicttext.getvalue()[2:-2])
                newdata.write(b_("ID "))
                newdata.write(operands["data"])
                newdata.write(b_("EI"))
            else:
                for op in operands:
                    op.writeToStream(newdata, None)
                    newdata.write(b_(" "))
                newdata.write(b_(operator))
            newdata.write(b_("\n"))
        return newdata.getvalue()

    def _setData(self, value):
        """
        Parse ``value`` and append its operations to ``self.operations``.
        """
        self.__parseContentStream(BytesIO(b_(value)))

    _data = property(_getData, _setData)
2779
2780
class DocumentInformation(DictionaryObject):
    """
    A class representing the basic document metadata provided in a PDF File.
    This class is accessible through
    :meth:`getDocumentInfo()<PyPDF2.PdfFileReader.getDocumentInfo()>`

    Every text property of the document metadata comes in *two* flavours,
    e.g. ``author`` and ``author_raw``.  The non-raw property only ever
    returns a ``TextStringObject`` (or ``None``), which makes it the right
    choice for display.  The raw property returns the stored value as is,
    which can be a ``ByteStringObject`` if PyPDF2 was unable to decode the
    string's text encoding; callers need to be prepared for that, so it is
    accessed less often.
    """

    def __init__(self):
        DictionaryObject.__init__(self)

    def getText(self, key):
        """
        Return the value stored under ``key`` if it is a
        ``TextStringObject``, otherwise ``None``.
        """
        value = self.get(key, None)
        return value if isinstance(value, TextStringObject) else None

    @property
    def title(self):
        """Read-only property accessing the document's **title**.
        Returns a unicode string (``TextStringObject``) or ``None``
        if the title is not specified."""
        return self.getText("/Title")

    @property
    def title_raw(self):
        """The "raw" version of title; can return a ``ByteStringObject``."""
        return self.get("/Title")

    @property
    def author(self):
        """Read-only property accessing the document's **author**.
        Returns a unicode string (``TextStringObject``) or ``None``
        if the author is not specified."""
        return self.getText("/Author")

    @property
    def author_raw(self):
        """The "raw" version of author; can return a ``ByteStringObject``."""
        return self.get("/Author")

    @property
    def subject(self):
        """Read-only property accessing the document's **subject**.
        Returns a unicode string (``TextStringObject``) or ``None``
        if the subject is not specified."""
        return self.getText("/Subject")

    @property
    def subject_raw(self):
        """The "raw" version of subject; can return a ``ByteStringObject``."""
        return self.get("/Subject")

    @property
    def creator(self):
        """Read-only property accessing the document's **creator**. If the
        document was converted to PDF from another format, this is the name of the
        application (e.g. OpenOffice) that created the original document from
        which it was converted. Returns a unicode string (``TextStringObject``)
        or ``None`` if the creator is not specified."""
        return self.getText("/Creator")

    @property
    def creator_raw(self):
        """The "raw" version of creator; can return a ``ByteStringObject``."""
        return self.get("/Creator")

    @property
    def producer(self):
        """Read-only property accessing the document's **producer**.
        If the document was converted to PDF from another format, this is
        the name of the application (for example, OSX Quartz) that converted
        it to PDF. Returns a unicode string (``TextStringObject``)
        or ``None`` if the producer is not specified."""
        return self.getText("/Producer")

    @property
    def producer_raw(self):
        """The "raw" version of producer; can return a ``ByteStringObject``."""
        return self.get("/Producer")
2843
2844
def convertToInt(d, size):
    """
    Decode the byte string ``d`` as a big-endian integer.

    ``d`` is left-padded with eight zero bytes and the last eight bytes of
    the result are unpacked as a signed 64-bit big-endian value.

    :param d: raw bytes holding the number, most significant byte first.
    :param int size: declared width of the field in bytes; it is only
        validated here, the decoding itself works from ``d``.
    :return: the decoded integer.
    :raises utils.PdfReadError: if ``size`` is larger than 8.
    """
    if size > 8:
        raise utils.PdfReadError("invalid size in convertToInt")
    padded = b_("\x00\x00\x00\x00\x00\x00\x00\x00") + b_(d)
    (value,) = struct.unpack(">q", padded[-8:])
    return value
2851
2852# ref: pdf1.8 spec section 3.5.2 algorithm 3.2
# The 32-byte password padding string, written as three literals
# (12 + 14 + 6 bytes) to keep the lines short.
_encryption_padding = (
    b_('\x28\xbf\x4e\x5e\x4e\x75\x8a\x41\x64\x00\x4e\x56')
    + b_('\xff\xfa\x01\x08\x2e\x2e\x00\xb6\xd0\x68\x3e\x80\x2f\x0c')
    + b_('\xa9\xfe\x64\x53\x69\x7a')
)
2856
2857
2858# Implementation of algorithm 3.2 of the PDF standard security handler,
2859# section 3.5.2 of the PDF 1.6 reference.
def _alg32(password, rev, keylen, owner_entry, p_entry, id1_entry, metadata_encrypt=True):
    """
    Compute an encryption key from a password (algorithm 3.2).

    :param password: password string; padded or truncated to 32 bytes.
    :param int rev: security handler revision; revisions >= 3 enable the
        extra hashing steps 6 and 8.
    :param int keylen: number of key bytes to produce.
    :param owner_entry: the encryption dictionary's /O entry; its
        ``original_bytes`` are fed to the hash.
    :param int p_entry: the /P permissions value.  Both the signed form
        (e.g. ``-4``) and the equivalent unsigned 32-bit form
        (e.g. ``4294967292``) are accepted and hash identically.
    :param id1_entry: first element of the file identifier array; its
        ``original_bytes`` are fed to the hash.
    :param bool metadata_encrypt: when false and ``rev >= 3``, four 0xFF
        bytes are added to the hash (step 6).
    :return: the first ``keylen`` bytes of the final MD5 digest.
    """
    # 1. Pad or truncate the password string to exactly 32 bytes.  If the
    # password string is more than 32 bytes long, use only its first 32 bytes;
    # if it is less than 32 bytes long, pad it by appending the required number
    # of additional bytes from the beginning of the padding string
    # (_encryption_padding).
    password = b_((str_(password) + str_(_encryption_padding))[:32])
    # 2. Initialize the MD5 hash function and pass the result of step 1 as
    # input to this function.
    import struct
    m = md5(password)
    # 3. Pass the value of the encryption dictionary's /O entry to the MD5 hash
    # function.
    m.update(owner_entry.original_bytes)
    # 4. Treat the value of the /P entry as an unsigned 4-byte integer and pass
    # these bytes to the MD5 hash function, low-order byte first.
    # Masking to 32 bits maps negative (signed) values onto their two's
    # complement unsigned form, so the packed bytes are unchanged for them,
    # while values >= 2**31 no longer raise struct.error.
    p_entry = struct.pack('<I', p_entry & 0xFFFFFFFF)
    m.update(p_entry)
    # 5. Pass the first element of the file's file identifier array to the MD5
    # hash function.
    m.update(id1_entry.original_bytes)
    # 6. (Revision 3 or greater) If document metadata is not being encrypted,
    # pass 4 bytes with the value 0xFFFFFFFF to the MD5 hash function.
    if rev >= 3 and not metadata_encrypt:
        m.update(b_("\xff\xff\xff\xff"))
    # 7. Finish the hash.
    md5_hash = m.digest()
    # 8. (Revision 3 or greater) Do the following 50 times: Take the output
    # from the previous MD5 hash and pass the first n bytes of the output as
    # input into a new MD5 hash, where n is the number of bytes of the
    # encryption key as defined by the value of the encryption dictionary's
    # /Length entry.
    if rev >= 3:
        for _ in range(50):
            md5_hash = md5(md5_hash[:keylen]).digest()
    # 9. Set the encryption key to the first n bytes of the output from the
    # final MD5 hash, where n is always 5 for revision 2 but, for revision 3 or
    # greater, depends on the value of the encryption dictionary's /Length
    # entry.
    return md5_hash[:keylen]
2900
2901
2902# Implementation of algorithm 3.3 of the PDF standard security handler,
2903# section 3.5.2 of the PDF 1.6 reference.
def _alg33(owner_pwd, user_pwd, rev, keylen):
    """
    Compute the value of the encryption dictionary's /O entry
    (algorithm 3.3).

    :param owner_pwd: owner password, used to derive the RC4 key.
    :param user_pwd: user password (text or bytes); padded or truncated to
        32 bytes and then encrypted.
    :param int rev: security handler revision; revisions >= 3 add 19 extra
        RC4 rounds.
    :param int keylen: RC4 key length in bytes.
    :return: the RC4 output to store as /O.
    """
    # steps 1 - 4
    key = _alg33_1(owner_pwd, rev, keylen)
    # 5. Pad or truncate the user password string as described in step 1 of
    # algorithm 3.2.  The password goes through str_() exactly as in _alg32
    # so that a bytes password can be joined with the padding string.
    user_pwd = b_((str_(user_pwd) + str_(_encryption_padding))[:32])
    # 6. Encrypt the result of step 5, using an RC4 encryption function with
    # the encryption key obtained in step 4.
    val = utils.RC4_encrypt(key, user_pwd)
    # 7. (Revision 3 or greater) Do the following 19 times: Take the output
    # from the previous invocation of the RC4 function and pass it as input to
    # a new invocation of the function; use an encryption key generated by
    # taking each byte of the encryption key obtained in step 4 and performing
    # an XOR operation between that byte and the single-byte value of the
    # iteration counter (from 1 to 19).
    if rev >= 3:
        for i in range(1, 20):
            # Build the round key as a byte string, the same way _alg35 does.
            new_key = b_('').join(b_(chr(ord_(c) ^ i)) for c in key)
            val = utils.RC4_encrypt(new_key, val)
    # 8. Store the output from the final invocation of the RC4 as the value of
    # the /O entry in the encryption dictionary.
    return val
2928
2929
2930# Steps 1-4 of algorithm 3.3
def _alg33_1(password, rev, keylen):
    """
    Derive the RC4 key used by algorithm 3.3 (its steps 1-4).

    :param password: password to derive the key from (text or bytes).  This
        function uses whatever it is given; substituting the user password
        for a missing owner password is not done here.
    :param int rev: security handler revision; revisions >= 3 re-hash the
        digest 50 times.
    :param int keylen: number of key bytes to return.
    :return: the first ``keylen`` bytes of the final MD5 digest.
    """
    # 1. Pad or truncate the owner password string as described in step 1 of
    # algorithm 3.2.  If there is no owner password, use the user password
    # instead.  The password goes through str_() exactly as in _alg32 so that
    # a bytes password can be joined with the padding string.
    password = b_((str_(password) + str_(_encryption_padding))[:32])
    # 2. Initialize the MD5 hash function and pass the result of step 1 as
    # input to this function.
    md5_hash = md5(password).digest()
    # 3. (Revision 3 or greater) Do the following 50 times: Take the output
    # from the previous MD5 hash and pass it as input into a new MD5 hash.
    if rev >= 3:
        for _ in range(50):
            md5_hash = md5(md5_hash).digest()
    # 4. Create an RC4 encryption key using the first n bytes of the output
    # from the final MD5 hash, where n is always 5 for revision 2 but, for
    # revision 3 or greater, depends on the value of the encryption
    # dictionary's /Length entry.
    return md5_hash[:keylen]
2951
2952
2953# Implementation of algorithm 3.4 of the PDF standard security handler,
2954# section 3.5.2 of the PDF 1.6 reference.
def _alg34(password, owner_entry, p_entry, id1_entry):
    """
    Compute the /U entry for a revision 2 security handler (algorithm 3.4).

    :param password: user password.
    :param owner_entry: the encryption dictionary's /O entry.
    :param p_entry: the /P permissions value.
    :param id1_entry: first element of the file identifier array.
    :return: a ``(U, key)`` tuple: the RC4-encrypted padding string and the
        encryption key that produced it.
    """
    # Step 1: derive the key with algorithm 3.2, fixed at revision 2 and a
    # 5-byte key.
    rc4_key = _alg32(password, 2, 5, owner_entry, p_entry, id1_entry)
    # Steps 2-3: the /U value is the 32-byte padding string encrypted with
    # that key.
    return utils.RC4_encrypt(rc4_key, _encryption_padding), rc4_key
2966
2967
# Implementation of algorithm 3.5 of the PDF standard security handler,
# section 3.5.2 of the PDF 1.6 reference.
def _alg35(password, rev, keylen, owner_entry, p_entry, id1_entry, metadata_encrypt):
    """
    Compute the /U entry for a revision 3 or greater security handler
    (algorithm 3.5).

    :param password: user password.
    :param int rev: security handler revision, forwarded to ``_alg32``.
    :param int keylen: encryption key length in bytes.
    :param owner_entry: the encryption dictionary's /O entry.
    :param p_entry: the /P permissions value.
    :param id1_entry: first element of the file identifier array; its
        ``original_bytes`` are hashed.
    :param metadata_encrypt: accepted but not used by this function.
    :return: a ``(U, key)`` tuple: the 32-byte /U value (16 bytes of RC4
        output followed by 16 null bytes) and the encryption key.
    """
    # NOTE(review): metadata_encrypt is not forwarded to _alg32, which thus
    # runs with its default.  Forwarding it would change the derived key, so
    # confirm against the callers before altering this.
    # 1. Create an encryption key based on the user password string, as
    # described in Algorithm 3.2.
    key = _alg32(password, rev, keylen, owner_entry, p_entry, id1_entry)
    # 2-3. Hash the 32-byte padding string followed by the first element of
    # the file identifier array, then finish the hash.
    hasher = md5(_encryption_padding)
    hasher.update(id1_entry.original_bytes)
    # 4. Encrypt the 16-byte digest with RC4 under the key from step 1.
    val = utils.RC4_encrypt(key, hasher.digest())
    # 5. Run 19 more RC4 rounds, each one over the previous output and with a
    # key made by XOR-ing every byte of the original key (from step 1) with
    # the round counter (1 to 19).
    for counter in range(1, 20):
        round_key = b_('').join(b_(chr(ord_(byte) ^ counter)) for byte in key)
        val = utils.RC4_encrypt(round_key, val)
    # 6. Append 16 bytes of arbitrary padding to the final RC4 output to get
    # the 32-byte /U value.  Null bytes are used as the padding; per the
    # original implementer's note this seems to match other implementations.
    return val + (b_('\x00') * 16), key
3005