1# A part of pdfrw (https://github.com/pmaupin/pdfrw)
2# Copyright (C) 2006-2015 Patrick Maupin, Austin, Texas
3# Copyright (C) 2012-2015 Nerijus Mika
4# MIT license -- See LICENSE.txt for details
5
6'''
7The PdfReader class reads an entire PDF file into memory and
8parses the top-level container objects.  (It does not parse
9into streams.)  The object subclasses PdfDict, and the
10document pages are stored in a list in the pages attribute
11of the object.
12'''
13import gc
14import binascii
15import collections
16import itertools
17
18from .errors import PdfParseError, log
19from .tokens import PdfTokens
20from .objects import PdfDict, PdfArray, PdfName, PdfObject, PdfIndirect
21from .uncompress import uncompress
22from . import crypt
23from .py23_diffs import convert_load, convert_store, iteritems
24
25
26class PdfReader(PdfDict):
27
28    def findindirect(self, objnum, gennum, PdfIndirect=PdfIndirect, int=int):
29        ''' Return a previously loaded indirect object, or create
30            a placeholder for it.
31        '''
32        key = int(objnum), int(gennum)
33        result = self.indirect_objects.get(key)
34        if result is None:
35            self.indirect_objects[key] = result = PdfIndirect(key)
36            self.deferred_objects.add(key)
37            result._loader = self.loadindirect
38        return result
39
40    def readarray(self, source, PdfArray=PdfArray):
41        ''' Found a [ token.  Parse the tokens after that.
42        '''
43        specialget = self.special.get
44        result = []
45        pop = result.pop
46        append = result.append
47
48        for value in source:
49            if value in ']R':
50                if value == ']':
51                    break
52                generation = pop()
53                value = self.findindirect(pop(), generation)
54            else:
55                func = specialget(value)
56                if func is not None:
57                    value = func(source)
58            append(value)
59        return PdfArray(result)
60
61    def readdict(self, source, PdfDict=PdfDict):
62        ''' Found a << token.  Parse the tokens after that.
63        '''
64        specialget = self.special.get
65        result = PdfDict()
66        next = source.next
67
68        tok = next()
69        while tok != '>>':
70            if not tok.startswith('/'):
71                source.error('Expected PDF /name object')
72                tok = next()
73                continue
74            key = tok
75            value = next()
76            func = specialget(value)
77            if func is not None:
78                value = func(source)
79                tok = next()
80            else:
81                tok = next()
82                if value.isdigit() and tok.isdigit():
83                    tok2 = next()
84                    if tok2 != 'R':
85                        source.error('Expected "R" following two integers')
86                        tok = tok2
87                        continue
88                    value = self.findindirect(value, tok)
89                    tok = next()
90            result[key] = value
91        return result
92
93    def empty_obj(self, source, PdfObject=PdfObject):
94        ''' Some silly git put an empty object in the
95            file.  Back up so the caller sees the endobj.
96        '''
97        source.floc = source.tokstart
98
99    def badtoken(self, source):
100        ''' Didn't see that coming.
101        '''
102        source.exception('Unexpected delimiter')
103
104    def findstream(self, obj, tok, source, len=len):
105        ''' Figure out if there is a content stream
106            following an object, and return the start
107            pointer to the content stream if so.
108
109            (We can't read it yet, because we might not
110            know how long it is, because Length might
111            be an indirect object.)
112        '''
113
114        fdata = source.fdata
115        startstream = source.tokstart + len(tok)
116        gotcr = fdata[startstream] == '\r'
117        startstream += gotcr
118        gotlf = fdata[startstream] == '\n'
119        startstream += gotlf
120        if not gotlf:
121            if not gotcr:
122                source.error(r'stream keyword not followed by \n')
123            else:
124                source.warning(r"stream keyword terminated "
125                               r"by \r without \n")
126        return startstream
127
    def readstream(self, obj, startstream, source, exact_required=False,
                   streamending='endstream endobj'.split(), int=int):
        ''' Attach the stream data starting at startstream to obj.

            The data is first sliced out of source.fdata using
            obj.Length.  If the two tokens read right after that slice
            equal streamending, the job is done.  Otherwise, when
            exact_required is true, source.exception is called; if not,
            the method searches for the endstream keyword itself and
            tries to work out what is wrong with the length, reporting
            each case through source.error / source.warning.

            Returns None in every case; results are stored on obj.
        '''
        fdata = source.fdata
        length = int(obj.Length)
        # Position the tokenizer where the stream should end.
        source.floc = target_endstream = startstream + length
        # presumably the next two tokens, as a list -- confirm in PdfTokens
        endit = source.multiple(2)
        obj._stream = fdata[startstream:target_endstream]
        if endit == streamending:
            return

        if exact_required:
            source.exception('Expected endstream endobj')

        # The length attribute does not match the distance between the
        # stream and endstream keywords.

        # TODO:  Extract maxstream from dictionary of object offsets
        # and use rfind instead of find.
        maxstream = len(fdata) - 20
        endstream = fdata.find('endstream', startstream, maxstream)
        # Point the tokenizer at the stream start before reporting.
        source.floc = startstream
        room = endstream - startstream
        if endstream < 0:
            source.error('Could not find endstream')
            return
        # Length is one more than the room available and the stream
        # keyword was followed by \r\n: re-slice starting one byte
        # earlier so that the \n becomes the first stream byte.
        if (length == room + 1 and
                fdata[startstream - 2:startstream] == '\r\n'):
            source.warning(r"stream keyword terminated by \r without \n")
            obj._stream = fdata[startstream - 1:target_endstream - 1]
            return
        source.floc = endstream
        if length > room:
            source.error('stream /Length attribute (%d) appears to '
                         'be too big (size %d) -- adjusting',
                         length, room)
            # NOTE(review): assigned through obj.stream here, not
            # obj._stream -- presumably the public attribute also
            # refreshes /Length; confirm in PdfDict.
            obj.stream = fdata[startstream:endstream]
            return
        # Non-whitespace between the declared end and the endstream
        # keyword means the declared length was too small.
        if fdata[target_endstream:endstream].rstrip():
            source.error('stream /Length attribute (%d) appears to '
                         'be too small (size %d) -- adjusting',
                         length, room)
            obj.stream = fdata[startstream:endstream]
            return
        # The length is plausible, so the trouble is in what follows
        # the endstream keyword.
        endobj = fdata.find('endobj', endstream, maxstream)
        if endobj < 0:
            source.error('Could not find endobj after endstream')
            return
        if fdata[endstream:endobj].rstrip() != 'endstream':
            source.error('Unexpected data between endstream and endobj')
            return
        source.error('Illegal endstream/endobj combination')
179
180    def loadindirect(self, key, PdfDict=PdfDict,
181                     isinstance=isinstance):
182        result = self.indirect_objects.get(key)
183        if not isinstance(result, PdfIndirect):
184            return result
185        source = self.source
186        offset = int(self.source.obj_offsets.get(key, '0'))
187        if not offset:
188            source.warning("Did not find PDF object %s", key)
189            return None
190
191        # Read the object header and validate it
192        objnum, gennum = key
193        source.floc = offset
194        objid = source.multiple(3)
195        ok = len(objid) == 3
196        ok = ok and objid[0].isdigit() and int(objid[0]) == objnum
197        ok = ok and objid[1].isdigit() and int(objid[1]) == gennum
198        ok = ok and objid[2] == 'obj'
199        if not ok:
200            source.floc = offset
201            source.next()
202            objheader = '%d %d obj' % (objnum, gennum)
203            fdata = source.fdata
204            offset2 = (fdata.find('\n' + objheader) + 1 or
205                       fdata.find('\r' + objheader) + 1)
206            if (not offset2 or
207                    fdata.find(fdata[offset2 - 1] + objheader, offset2) > 0):
208                source.warning("Expected indirect object '%s'", objheader)
209                return None
210            source.warning("Indirect object %s found at incorrect "
211                           "offset %d (expected offset %d)",
212                           objheader, offset2, offset)
213            source.floc = offset2 + len(objheader)
214
215        # Read the object, and call special code if it starts
216        # an array or dictionary
217        obj = source.next()
218        func = self.special.get(obj)
219        if func is not None:
220            obj = func(source)
221
222        self.indirect_objects[key] = obj
223        self.deferred_objects.remove(key)
224
225        # Mark the object as indirect, and
226        # just return it if it is a simple object.
227        obj.indirect = key
228        tok = source.next()
229        if tok == 'endobj':
230            return obj
231
232        # Should be a stream.  Either that or it's broken.
233        isdict = isinstance(obj, PdfDict)
234        if isdict and tok == 'stream':
235            self.readstream(obj, self.findstream(obj, tok, source), source)
236            return obj
237
238        # Houston, we have a problem, but let's see if it
239        # is easily fixable.  Leaving out a space before endobj
240        # is apparently an easy mistake to make on generation
241        # (Because it won't be noticed unless you are specifically
242        # generating an indirect object that doesn't end with any
243        # sort of delimiter.)  It is so common that things like
244        # okular just handle it.
245
246        if isinstance(obj, PdfObject) and obj.endswith('endobj'):
247            source.error('No space or delimiter before endobj')
248            obj = PdfObject(obj[:-6])
249        else:
250            source.error("Expected 'endobj'%s token",
251                         isdict and " or 'stream'" or '')
252            obj = PdfObject('')
253
254        obj.indirect = key
255        self.indirect_objects[key] = obj
256        return obj
257
258    def read_all(self):
259        deferred = self.deferred_objects
260        prev = set()
261        while 1:
262            new = deferred - prev
263            if not new:
264                break
265            prev |= deferred
266            for key in new:
267                self.loadindirect(key)
268
269    def decrypt_all(self):
270        self.read_all()
271
272        if self.crypt_filters is not None:
273            crypt.decrypt_objects(
274                self.indirect_objects.values(), self.stream_crypt_filter,
275                self.crypt_filters)
276
277    def uncompress(self):
278        self.read_all()
279
280        uncompress(self.indirect_objects.values())
281
282    def load_stream_objects(self, object_streams):
283        # read object streams
284        objs = []
285        for num in object_streams:
286            obj = self.findindirect(num, 0).real_value()
287            assert obj.Type == '/ObjStm'
288            objs.append(obj)
289
290        # read objects from stream
291        if objs:
292            # Decrypt
293            if self.crypt_filters is not None:
294                crypt.decrypt_objects(
295                    objs, self.stream_crypt_filter, self.crypt_filters)
296
297            # Decompress
298            uncompress(objs)
299
300            for obj in objs:
301                objsource = PdfTokens(obj.stream, 0, False)
302                next = objsource.next
303                offsets = []
304                firstoffset = int(obj.First)
305                while objsource.floc < firstoffset:
306                    offsets.append((int(next()), firstoffset + int(next())))
307                for num, offset in offsets:
308                    # Read the object, and call special code if it starts
309                    # an array or dictionary
310                    objsource.floc = offset
311                    sobj = next()
312                    func = self.special.get(sobj)
313                    if func is not None:
314                        sobj = func(objsource)
315
316                    key = (num, 0)
317                    self.indirect_objects[key] = sobj
318                    if key in self.deferred_objects:
319                        self.deferred_objects.remove(key)
320
321                    # Mark the object as indirect, and
322                    # add it to the list of streams if it starts a stream
323                    sobj.indirect = key
324
325    def findxref(self, fdata):
326        ''' Find the cross reference section at the end of a file
327        '''
328        startloc = fdata.rfind('startxref')
329        if startloc < 0:
330            raise PdfParseError('Did not find "startxref" at end of file')
331        source = PdfTokens(fdata, startloc, False, self.verbose)
332        tok = source.next()
333        assert tok == 'startxref'  # (We just checked this...)
334        tableloc = source.next_default()
335        if not tableloc.isdigit():
336            source.exception('Expected table location')
337        if source.next_default().rstrip().lstrip('%') != 'EOF':
338            source.exception('Expected %%EOF')
339        return startloc, PdfTokens(fdata, int(tableloc), True, self.verbose)
340
341    def parse_xref_stream(self, source, int=int, range=range,
342                          enumerate=enumerate, islice=itertools.islice,
343                          defaultdict=collections.defaultdict,
344                          hexlify=binascii.hexlify):
345        ''' Parse (one of) the cross-reference file section(s)
346        '''
347
348        def readint(s, lengths):
349            offset = 0
350            for length in itertools.cycle(lengths):
351                next = offset + length
352                yield int(hexlify(s[offset:next]), 16) if length else None
353                offset = next
354
355        setdefault = source.obj_offsets.setdefault
356        next = source.next
357        # check for xref stream object
358        objid = source.multiple(3)
359        ok = len(objid) == 3
360        ok = ok and objid[0].isdigit()
361        ok = ok and objid[1] == 'obj'
362        ok = ok and objid[2] == '<<'
363        if not ok:
364            source.exception('Expected xref stream start')
365        obj = self.readdict(source)
366        if obj.Type != PdfName.XRef:
367            source.exception('Expected dict type of /XRef')
368        tok = next()
369        self.readstream(obj, self.findstream(obj, tok, source), source, True)
370        old_strm = obj.stream
371        if not uncompress([obj], True):
372            source.exception('Could not decompress Xref stream')
373        stream = obj.stream
374        # Fix for issue #76 -- goofy compressed xref stream
375        # that is NOT ACTUALLY COMPRESSED
376        stream = stream if stream is not old_strm else convert_store(old_strm)
377        num_pairs = obj.Index or PdfArray(['0', obj.Size])
378        num_pairs = [int(x) for x in num_pairs]
379        num_pairs = zip(num_pairs[0::2], num_pairs[1::2])
380        entry_sizes = [int(x) for x in obj.W]
381        if len(entry_sizes) != 3:
382            source.exception('Invalid entry size')
383        object_streams = defaultdict(list)
384        get = readint(stream, entry_sizes)
385        for objnum, size in num_pairs:
386            for cnt in range(size):
387                xtype, p1, p2 = islice(get, 3)
388                if xtype in (1, None):
389                    if p1:
390                        setdefault((objnum, p2 or 0), p1)
391                elif xtype == 2:
392                    object_streams[p1].append((objnum, p2))
393                objnum += 1
394
395        obj.private.object_streams = object_streams
396        return obj
397
398    def parse_xref_table(self, source, int=int, range=range):
399        ''' Parse (one of) the cross-reference file section(s)
400        '''
401        setdefault = source.obj_offsets.setdefault
402        next = source.next
403        # plain xref table
404        start = source.floc
405        try:
406            while 1:
407                tok = next()
408                if tok == 'trailer':
409                    return
410                startobj = int(tok)
411                for objnum in range(startobj, startobj + int(next())):
412                    offset = int(next())
413                    generation = int(next())
414                    inuse = next()
415                    if inuse == 'n':
416                        if offset != 0:
417                            setdefault((objnum, generation), offset)
418                    elif inuse != 'f':
419                        raise ValueError
420        except:
421            pass
422        try:
423            # Table formatted incorrectly.
424            # See if we can figure it out anyway.
425            end = source.fdata.rindex('trailer', start)
426            table = source.fdata[start:end].splitlines()
427            for line in table:
428                tokens = line.split()
429                if len(tokens) == 2:
430                    objnum = int(tokens[0])
431                elif len(tokens) == 3:
432                    offset, generation, inuse = (int(tokens[0]),
433                                                 int(tokens[1]), tokens[2])
434                    if offset != 0 and inuse == 'n':
435                        setdefault((objnum, generation), offset)
436                    objnum += 1
437                elif tokens:
438                    log.error('Invalid line in xref table: %s' %
439                              repr(line))
440                    raise ValueError
441            log.warning('Badly formatted xref table')
442            source.floc = end
443            next()
444        except:
445            source.floc = start
446            source.exception('Invalid table format')
447
448    def parsexref(self, source):
449        ''' Parse (one of) the cross-reference file section(s)
450        '''
451        next = source.next
452        try:
453            tok = next()
454        except StopIteration:
455            tok = ''
456        if tok.isdigit():
457            return self.parse_xref_stream(source), True
458        elif tok == 'xref':
459            self.parse_xref_table(source)
460            tok = next()
461            if tok != '<<':
462                source.exception('Expected "<<" starting catalog')
463            return self.readdict(source), False
464        else:
465            source.exception('Expected "xref" keyword or xref stream object')
466
467    def readpages(self, node):
468        pagename = PdfName.Page
469        pagesname = PdfName.Pages
470        catalogname = PdfName.Catalog
471        typename = PdfName.Type
472        kidname = PdfName.Kids
473
474        try:
475            result = []
476            stack = [node]
477            append = result.append
478            pop = stack.pop
479            while stack:
480                node = pop()
481                nodetype = node[typename]
482                if nodetype == pagename:
483                    append(node)
484                elif nodetype == pagesname:
485                    stack.extend(reversed(node[kidname]))
486                elif nodetype == catalogname:
487                    stack.append(node[pagesname])
488                else:
489                    log.error('Expected /Page or /Pages dictionary, got %s' %
490                            repr(node))
491            return result
492        except (AttributeError, TypeError) as s:
493            log.error('Invalid page tree: %s' % s)
494            return []
495
496    def _parse_encrypt_info(self, source, password, trailer):
497        """Check password and initialize crypt filters."""
498        # Create and check password key
499        key = crypt.create_key(password, trailer)
500
501        if not crypt.check_user_password(key, trailer):
502            source.warning('User password does not validate')
503
504        # Create default crypt filters
505        private = self.private
506        crypt_filters = self.crypt_filters
507        version = int(trailer.Encrypt.V or 0)
508        if version in (1, 2):
509            crypt_filter = crypt.RC4CryptFilter(key)
510            private.stream_crypt_filter = crypt_filter
511            private.string_crypt_filter = crypt_filter
512        elif version == 4:
513            if PdfName.CF in trailer.Encrypt:
514                for name, params in iteritems(trailer.Encrypt.CF):
515                    if name == PdfName.Identity:
516                        continue
517
518                    cfm = params.CFM
519                    if cfm == PdfName.AESV2:
520                        crypt_filters[name] = crypt.AESCryptFilter(key)
521                    elif cfm == PdfName.V2:
522                        crypt_filters[name] = crypt.RC4CryptFilter(key)
523                    else:
524                        source.warning(
525                            'Unsupported crypt filter: {}, {}'.format(
526                                name, cfm))
527
528            # Read default stream filter
529            if PdfName.StmF in trailer.Encrypt:
530                name = trailer.Encrypt.StmF
531                if name in crypt_filters:
532                    private.stream_crypt_filter = crypt_filters[name]
533                else:
534                    source.warning(
535                        'Invalid crypt filter name in /StmF:'
536                        ' {}'.format(name))
537
538            # Read default string filter
539            if PdfName.StrF in trailer.Encrypt:
540                name = trailer.Encrypt.StrF
541                if name in crypt_filters:
542                    private.string_crypt_filter = crypt_filters[name]
543                else:
544                    source.warning(
545                        'Invalid crypt filter name in /StrF:'
546                        ' {}'.format(name))
547        else:
548            source.warning(
549                'Unsupported Encrypt version: {}'.format(version))
550
551    def __init__(self, fname=None, fdata=None, decompress=False,
552                 decrypt=False, password='', disable_gc=True, verbose=True):
553        self.private.verbose = verbose
554
555        # Runs a lot faster with GC off.
556        disable_gc = disable_gc and gc.isenabled()
557        if disable_gc:
558            gc.disable()
559
560        try:
561            if fname is not None:
562                assert fdata is None
563                # Allow reading preexisting streams like pyPdf
564                if hasattr(fname, 'read'):
565                    fdata = fname.read()
566                else:
567                    try:
568                        f = open(fname, 'rb')
569                        fdata = f.read()
570                        f.close()
571                    except IOError:
572                        raise PdfParseError('Could not read PDF file %s' %
573                                            fname)
574
575            assert fdata is not None
576            fdata = convert_load(fdata)
577
578            if not fdata.startswith('%PDF-'):
579                startloc = fdata.find('%PDF-')
580                if startloc >= 0:
581                    log.warning('PDF header not at beginning of file')
582                else:
583                    lines = fdata.lstrip().splitlines()
584                    if not lines:
585                        raise PdfParseError('Empty PDF file!')
586                    raise PdfParseError('Invalid PDF header: %s' %
587                                        repr(lines[0]))
588
589            self.private.version = fdata[5:8]
590
591            endloc = fdata.rfind('%EOF')
592            if endloc < 0:
593                raise PdfParseError('EOF mark not found: %s' %
594                                    repr(fdata[-20:]))
595            endloc += 6
596            junk = fdata[endloc:]
597            fdata = fdata[:endloc]
598            if junk.rstrip('\00').strip():
599                log.warning('Extra data at end of file')
600
601            private = self.private
602            private.indirect_objects = {}
603            private.deferred_objects = set()
604            private.special = {'<<': self.readdict,
605                               '[': self.readarray,
606                               'endobj': self.empty_obj,
607                               }
608            for tok in r'\ ( ) < > { } ] >> %'.split():
609                self.special[tok] = self.badtoken
610
611            startloc, source = self.findxref(fdata)
612            private.source = source
613
614            # Find all the xref tables/streams, and
615            # then deal with them backwards.
616            xref_list = []
617            while 1:
618                source.obj_offsets = {}
619                trailer, is_stream = self.parsexref(source)
620                prev = trailer.Prev
621                if prev is None:
622                    token = source.next()
623                    if token != 'startxref' and not xref_list:
624                        source.warning('Expected "startxref" '
625                                       'at end of xref table')
626                    break
627                xref_list.append((source.obj_offsets, trailer, is_stream))
628                source.floc = int(prev)
629
630            # Handle document encryption
631            private.crypt_filters = None
632            if decrypt and PdfName.Encrypt in trailer:
633                identity_filter = crypt.IdentityCryptFilter()
634                crypt_filters = {
635                    PdfName.Identity: identity_filter
636                }
637                private.crypt_filters = crypt_filters
638                private.stream_crypt_filter = identity_filter
639                private.string_crypt_filter = identity_filter
640
641                if not crypt.HAS_CRYPTO:
642                    raise PdfParseError(
643                        'Install PyCrypto to enable encryption support')
644
645                self._parse_encrypt_info(source, password, trailer)
646
647            if is_stream:
648                self.load_stream_objects(trailer.object_streams)
649
650            while xref_list:
651                later_offsets, later_trailer, is_stream = xref_list.pop()
652                source.obj_offsets.update(later_offsets)
653                if is_stream:
654                    trailer.update(later_trailer)
655                    self.load_stream_objects(later_trailer.object_streams)
656                else:
657                    trailer = later_trailer
658
659            trailer.Prev = None
660
661            if (trailer.Version and
662                    float(trailer.Version) > float(self.version)):
663                self.private.version = trailer.Version
664
665            if decrypt:
666                self.decrypt_all()
667                trailer.Encrypt = None
668
669            if is_stream:
670                self.Root = trailer.Root
671                self.Info = trailer.Info
672                self.ID = trailer.ID
673                self.Size = trailer.Size
674                self.Encrypt = trailer.Encrypt
675            else:
676                self.update(trailer)
677
678            # self.read_all_indirect(source)
679            private.pages = self.readpages(self.Root)
680            if decompress:
681                self.uncompress()
682
683            # For compatibility with pyPdf
684            private.numPages = len(self.pages)
685        finally:
686            if disable_gc:
687                gc.enable()
688
689    # For compatibility with pyPdf
690    def getPage(self, pagenum):
691        return self.pages[pagenum]
692