1import zlib
2import logging
3from .lzw import lzwdecode
4from .ascii85 import ascii85decode
5from .ascii85 import asciihexdecode
6from .runlength import rldecode
7from .ccitt import ccittfaxdecode
8from .psparser import PSException
9from .psparser import PSObject
10from .psparser import LIT
11from . import settings
12from .utils import apply_png_predictor
13from .utils import isnumber
14
15
# Module-level logger, named after this module.
log = logging.getLogger(__name__)

# Literal for the /Crypt filter; PDFStream.decode() rejects it as unsupported.
LITERAL_CRYPT = LIT('Crypt')

# Filter names recognised by PDFStream.decode(). Each tuple holds the full
# filter name followed by its abbreviation, where one is listed here
# (see the PDF reference, section 4.8.6 "Inline Images").
LITERALS_FLATE_DECODE = (LIT('FlateDecode'), LIT('Fl'))
LITERALS_LZW_DECODE = (LIT('LZWDecode'), LIT('LZW'))
LITERALS_ASCII85_DECODE = (LIT('ASCII85Decode'), LIT('A85'))
LITERALS_ASCIIHEX_DECODE = (LIT('ASCIIHexDecode'), LIT('AHx'))
LITERALS_RUNLENGTH_DECODE = (LIT('RunLengthDecode'), LIT('RL'))
LITERALS_CCITTFAX_DECODE = (LIT('CCITTFaxDecode'), LIT('CCF'))
LITERALS_DCT_DECODE = (LIT('DCTDecode'), LIT('DCT'))
# Only the full name is listed for JBIG2.
LITERALS_JBIG2_DECODE = (LIT('JBIG2Decode'),)
29
30
class PDFObject(PSObject):
    """Common base class for the PDF object types defined in this module."""
33
34
class PDFException(PSException):
    """Base class for the PDF-specific exceptions defined in this module."""
37
38
class PDFTypeError(PDFException):
    """Raised in strict mode when a resolved object has an unexpected type."""
41
42
class PDFValueError(PDFException):
    """Raised for an invalid value, e.g. object id 0 in strict mode."""
45
46
class PDFObjectNotFound(PDFException):
    """Signals a missing object; PDFObjRef.resolve() catches this error."""
49
50
class PDFNotImplementedError(PDFException):
    """Raised for unsupported features such as unknown filters or predictors."""
53
54
class PDFObjRef(PDFObject):
    """Indirect reference to an object owned by a document.

    Keeps the owning document and the object id. The third constructor
    argument is accepted but not stored.
    """

    def __init__(self, doc, objid, _):
        # Object id 0 is rejected only in strict mode; otherwise the
        # reference is still created.
        if objid == 0 and settings.STRICT:
            raise PDFValueError('PDF object id cannot be 0.')
        self.objid = objid
        self.doc = doc

    def __repr__(self):
        return '<PDFObjRef:%d>' % self.objid

    def resolve(self, default=None):
        """Return the referenced object, or *default* if it is not found."""
        try:
            target = self.doc.getobj(self.objid)
        except PDFObjectNotFound:
            return default
        return target
73
74
def resolve1(x, default=None):
    """Follow indirect references until a direct object is reached.

    Only the top level is resolved: a returned array or dictionary may
    still contain indirect objects inside. *default* is handed to each
    reference's resolve() call.
    """
    obj = x
    while True:
        if not isinstance(obj, PDFObjRef):
            return obj
        obj = obj.resolve(default=default)
84
85
def resolve_all(x, default=None):
    """Resolve *x* and, recursively, everything nested inside it.

    The result contains no indirect references. Lists are rebuilt as new
    lists, while dictionaries are updated in place and returned.
    This procedure might be slow.
    """
    obj = resolve1(x, default=default)
    if isinstance(obj, list):
        return [resolve_all(item, default=default) for item in obj]
    if isinstance(obj, dict):
        # Only existing keys are reassigned, so iterating while
        # updating does not change the dictionary's size.
        for key in obj:
            obj[key] = resolve_all(obj[key], default=default)
    return obj
100
101
def decipher_all(decipher, objid, genno, x):
    """Apply *decipher* to every byte string found inside *x*.

    Byte strings are replaced by ``decipher(objid, genno, value)``.
    Lists are rebuilt as new lists, dictionaries are updated in place,
    and any other value is returned untouched.
    """
    def walk(node):
        if isinstance(node, bytes):
            return decipher(objid, genno, node)
        if isinstance(node, list):
            return [walk(item) for item in node]
        if isinstance(node, dict):
            # Existing keys only are reassigned, so the size is stable.
            for key in node:
                node[key] = walk(node[key])
        return node

    return walk(x)
113
114
def int_value(x):
    """Resolve *x* and return it when it is an ``int``.

    For any other type, raise PDFTypeError in strict mode and return 0
    otherwise.
    """
    value = resolve1(x)
    if isinstance(value, int):
        return value
    if settings.STRICT:
        raise PDFTypeError('Integer required: %r' % value)
    return 0
122
123
def float_value(x):
    """Resolve *x* and return it when it is a ``float``.

    For any other type, raise PDFTypeError in strict mode and return 0.0
    otherwise.
    """
    value = resolve1(x)
    if isinstance(value, float):
        return value
    if settings.STRICT:
        raise PDFTypeError('Float required: %r' % value)
    return 0.0
131
132
def num_value(x):
    """Resolve *x* and return it when ``isnumber`` accepts it.

    Otherwise raise PDFTypeError in strict mode and return 0 in lenient
    mode.
    """
    value = resolve1(x)
    if isnumber(value):
        return value
    if settings.STRICT:
        raise PDFTypeError('Int or Float required: %r' % value)
    return 0
140
141
def uint_value(x, n_bits):
    """Resolve *x* and interpret it as an unsigned *n_bits*-wide number.

    The value is first passed through ``int_value``. Non-negative values
    are returned unchanged; negative values are mapped to their
    two's-complement unsigned equivalent by adding ``2**n_bits``.
    """
    value = int_value(x)
    # Zero is already a valid unsigned value. Treating it like a negative
    # number would yield 2**n_bits, which does not fit in n_bits bits.
    if value >= 0:
        return value
    return value + 2 ** n_bits
149
150
def str_value(x):
    """Resolve *x* and return it when it is a ``bytes`` string.

    For any other type, raise PDFTypeError in strict mode; in lenient
    mode return an empty byte string so that the result is ``bytes`` on
    every path.
    """
    value = resolve1(x)
    if isinstance(value, bytes):
        return value
    if settings.STRICT:
        raise PDFTypeError('String required: %r' % value)
    # The fallback must be bytes, like the success path; a text '' would
    # break callers that combine the result with other byte strings.
    return b''
158
159
def list_value(x):
    """Resolve *x* and return it when it is a ``list`` or ``tuple``.

    For any other type, raise PDFTypeError in strict mode and return a
    new empty list otherwise.
    """
    value = resolve1(x)
    if isinstance(value, (list, tuple)):
        return value
    if settings.STRICT:
        raise PDFTypeError('List required: %r' % value)
    return []
167
168
def dict_value(x):
    """Resolve *x* and return it when it is a ``dict``.

    For any other type, strict mode logs an error and raises
    PDFTypeError; lenient mode returns a new empty dict.
    """
    value = resolve1(x)
    if isinstance(value, dict):
        return value
    if settings.STRICT:
        log.error('PDFTypeError : Dict required: %r', value)
        raise PDFTypeError('Dict required: %r' % value)
    return {}
177
178
def stream_value(x):
    """Resolve *x* and return it when it is a ``PDFStream``.

    For any other type, raise PDFTypeError in strict mode and return an
    empty stream (no attributes, no data) otherwise.
    """
    value = resolve1(x)
    if isinstance(value, PDFStream):
        return value
    if settings.STRICT:
        raise PDFTypeError('PDFStream required: %r' % value)
    return PDFStream({}, b'')
186
187
class PDFStream(PDFObject):
    """A PDF stream object: an attribute dictionary plus its data bytes.

    Until decode() runs, ``rawdata`` holds the bytes given to the
    constructor and ``data`` is None. After a successful decode(),
    ``data`` holds the decoded bytes and ``rawdata`` is None.
    """

    def __init__(self, attrs, rawdata, decipher=None):
        """Store the stream dictionary, its raw bytes and an optional decipher.

        When *decipher* is given, decode() calls it as
        ``decipher(objid, genno, data, attrs)`` before applying any filter.
        """
        assert isinstance(attrs, dict), str(type(attrs))
        self.attrs = attrs
        self.rawdata = rawdata
        self.decipher = decipher
        self.data = None
        self.objid = None
        self.genno = None

    def set_objid(self, objid, genno):
        """Record the object id and generation number of this stream."""
        self.objid = objid
        self.genno = genno

    def __repr__(self):
        # Report the length of whichever buffer is currently populated.
        if self.data is None:
            assert self.rawdata is not None
            return '<PDFStream(%r): raw=%d, %r>' % \
                   (self.objid, len(self.rawdata), self.attrs)
        else:
            assert self.data is not None
            return '<PDFStream(%r): len=%d, %r>' % \
                   (self.objid, len(self.data), self.attrs)

    def __contains__(self, name):
        return name in self.attrs

    def __getitem__(self, name):
        return self.attrs[name]

    def get(self, name, default=None):
        """Return attribute *name*, or *default* when it is absent."""
        return self.attrs.get(name, default)

    def get_any(self, names, default=None):
        """Return the value of the first of *names* present in the attributes.

        *default* is returned when none of the names is present.
        """
        for name in names:
            if name in self.attrs:
                return self.attrs[name]
        return default

    def get_filters(self):
        """Return the stream's filters as a list of ``(filter, params)`` pairs.

        The filter comes from the ``F`` or ``Filter`` attribute and the
        parameters from ``DP``, ``DecodeParms`` or ``FDecodeParms``. A
        single filter or a single parameter value is expanded so that
        every filter is paired with a parameter entry.

        Raises PDFException in strict mode when the parameter list and the
        filter list differ in length.
        """
        filters = self.get_any(('F', 'Filter'))
        params = self.get_any(('DP', 'DecodeParms', 'FDecodeParms'), {})
        if not filters:
            return []
        if not isinstance(filters, list):
            filters = [filters]
        if not isinstance(params, list):
            # Make sure the parameters list is the same as filters.
            params = [params] * len(filters)
        if len(params) != len(filters):
            if settings.STRICT:
                raise PDFException("Parameters len filter mismatch")
            if len(params) < len(filters):
                # zip() below stops at the shorter sequence, which would
                # silently drop the trailing filters and leave the data
                # only partly decoded. Pad with empty parameters instead;
                # a new list is built so the attribute value is untouched.
                params = params + [{}] * (len(filters) - len(params))
        # resolve filter if possible
        _filters = []
        for fltr in filters:
            if hasattr(fltr, 'resolve'):
                # NOTE(review): assumes the reference resolves to a
                # sequence whose first element is the filter - confirm.
                fltr = fltr.resolve()[0]
            _filters.append(fltr)
        # return list solves https://github.com/pdfminer/pdfminer.six/issues/15
        return list(zip(_filters, params))

    def decode(self):
        """Decipher and decode ``rawdata`` into ``data``.

        Applies the decipher (if any), then each filter in order, and
        after each filter the predictor named in that filter's
        parameters. On success ``data`` is set and ``rawdata`` is
        cleared. Must only be called while the stream is still undecoded.

        Raises PDFException in strict mode when zlib decompression fails,
        and PDFNotImplementedError for the /Crypt filter, an unknown
        filter or an unsupported predictor.
        """
        assert self.data is None \
               and self.rawdata is not None, str((self.data, self.rawdata))
        data = self.rawdata
        if self.decipher:
            # Handle encryption
            data = self.decipher(self.objid, self.genno, data, self.attrs)
        # With no filters the loop body never runs and the (possibly
        # deciphered) bytes are stored unchanged.
        for (f, params) in self.get_filters():
            if f in LITERALS_FLATE_DECODE:
                # will get errors if the document is encrypted.
                try:
                    data = zlib.decompress(data)
                except zlib.error as e:
                    if settings.STRICT:
                        error_msg = 'Invalid zlib bytes: {!r}, {!r}'\
                            .format(e, data)
                        raise PDFException(error_msg)
                    # Lenient mode: carry on with empty data.
                    data = b''
            elif f in LITERALS_LZW_DECODE:
                data = lzwdecode(data)
            elif f in LITERALS_ASCII85_DECODE:
                data = ascii85decode(data)
            elif f in LITERALS_ASCIIHEX_DECODE:
                data = asciihexdecode(data)
            elif f in LITERALS_RUNLENGTH_DECODE:
                data = rldecode(data)
            elif f in LITERALS_CCITTFAX_DECODE:
                data = ccittfaxdecode(data, params)
            elif f in LITERALS_DCT_DECODE:
                # This is probably a JPG stream
                # it does not need to be decoded twice.
                # Just return the stream to the user.
                pass
            elif f in LITERALS_JBIG2_DECODE:
                # Passed through unchanged, like DCT data.
                pass
            elif f == LITERAL_CRYPT:
                # not yet..
                raise PDFNotImplementedError('/Crypt filter is unsupported')
            else:
                raise PDFNotImplementedError('Unsupported filter: %r' % f)
            # apply predictors
            if params and 'Predictor' in params:
                pred = int_value(params['Predictor'])
                if pred == 1:
                    # no predictor
                    pass
                elif 10 <= pred:
                    # PNG predictor
                    colors = int_value(params.get('Colors', 1))
                    columns = int_value(params.get('Columns', 1))
                    raw_bits_per_component = params.get('BitsPerComponent', 8)
                    bitspercomponent = int_value(raw_bits_per_component)
                    data = apply_png_predictor(pred, colors, columns,
                                               bitspercomponent, data)
                else:
                    error_msg = 'Unsupported predictor: %r' % pred
                    raise PDFNotImplementedError(error_msg)
        self.data = data
        self.rawdata = None

    def get_data(self):
        """Return the decoded data, decoding first if ``data`` is still None."""
        if self.data is None:
            self.decode()
        return self.data

    def get_rawdata(self):
        """Return the raw bytes; this is None once decode() has succeeded."""
        return self.rawdata
324