import zlib
import logging
from .lzw import lzwdecode
from .ascii85 import ascii85decode
from .ascii85 import asciihexdecode
from .runlength import rldecode
from .ccitt import ccittfaxdecode
from .psparser import PSException
from .psparser import PSObject
from .psparser import LIT
from . import settings
from .utils import apply_png_predictor
from .utils import isnumber


log = logging.getLogger(__name__)

LITERAL_CRYPT = LIT('Crypt')

# Abbreviation of Filter names in PDF 4.8.6. "Inline Images"
LITERALS_FLATE_DECODE = (LIT('FlateDecode'), LIT('Fl'))
LITERALS_LZW_DECODE = (LIT('LZWDecode'), LIT('LZW'))
LITERALS_ASCII85_DECODE = (LIT('ASCII85Decode'), LIT('A85'))
LITERALS_ASCIIHEX_DECODE = (LIT('ASCIIHexDecode'), LIT('AHx'))
LITERALS_RUNLENGTH_DECODE = (LIT('RunLengthDecode'), LIT('RL'))
LITERALS_CCITTFAX_DECODE = (LIT('CCITTFaxDecode'), LIT('CCF'))
LITERALS_DCT_DECODE = (LIT('DCTDecode'), LIT('DCT'))
LITERALS_JBIG2_DECODE = (LIT('JBIG2Decode'),)


class PDFObject(PSObject):
    """Base class for the PDF object types defined in this module."""


class PDFException(PSException):
    """Base class for the PDF errors defined in this module."""


class PDFTypeError(PDFException):
    """Raised in strict mode when a value has an unexpected type."""


class PDFValueError(PDFException):
    """Raised in strict mode when a value is not acceptable."""


class PDFObjectNotFound(PDFException):
    """Caught by ``PDFObjRef.resolve`` when an object lookup fails."""


class PDFNotImplementedError(PDFException):
    """Raised for stream filters or predictors that are not supported."""


class PDFObjRef(PDFObject):
    """Indirect reference to an object that is looked up via ``doc.getobj``."""

    def __init__(self, doc, objid, _):
        """Store the owning document and the referenced object id.

        The third positional argument is accepted but ignored.

        Raises:
            PDFValueError: if ``objid`` is 0 and ``settings.STRICT`` is set.
        """
        if objid == 0:
            if settings.STRICT:
                raise PDFValueError('PDF object id cannot be 0.')
        self.doc = doc
        self.objid = objid

    def __repr__(self):
        return '<PDFObjRef:%d>' % (self.objid)

    def resolve(self, default=None):
        """Return the referenced object, or ``default`` if it is not found."""
        try:
            return self.doc.getobj(self.objid)
        except PDFObjectNotFound:
            return default


def resolve1(x, default=None):
    """Resolves an object.

    If this is an array or dictionary, it may still contain
    some indirect objects inside.
    """
    while isinstance(x, PDFObjRef):
        x = x.resolve(default=default)
    return x


def resolve_all(x, default=None):
    """Recursively resolves the given object and all the internals.

    Make sure there is no indirect reference within the nested object.
    This procedure might be slow.

    Lists are rebuilt as new lists; dictionaries are updated in place.
    """
    while isinstance(x, PDFObjRef):
        x = x.resolve(default=default)
    if isinstance(x, list):
        x = [resolve_all(v, default=default) for v in x]
    elif isinstance(x, dict):
        # Only existing keys are reassigned, so the dict size never changes
        # while it is being iterated.
        for (k, v) in x.items():
            x[k] = resolve_all(v, default=default)
    return x


def decipher_all(decipher, objid, genno, x):
    """Recursively deciphers the given object.

    ``decipher`` is called as ``decipher(objid, genno, data)`` for every
    ``bytes`` value found. Lists are rebuilt as new lists; dictionaries are
    updated in place. Any other value is returned unchanged.
    """
    if isinstance(x, bytes):
        return decipher(objid, genno, x)
    if isinstance(x, list):
        x = [decipher_all(decipher, objid, genno, v) for v in x]
    elif isinstance(x, dict):
        for (k, v) in x.items():
            x[k] = decipher_all(decipher, objid, genno, v)
    return x


def int_value(x):
    """Resolve ``x`` and return it if it is an ``int``.

    Otherwise raise PDFTypeError in strict mode, or return 0.
    """
    x = resolve1(x)
    if not isinstance(x, int):
        if settings.STRICT:
            raise PDFTypeError('Integer required: %r' % x)
        return 0
    return x


def float_value(x):
    """Resolve ``x`` and return it if it is a ``float``.

    Otherwise raise PDFTypeError in strict mode, or return 0.0.
    """
    x = resolve1(x)
    if not isinstance(x, float):
        if settings.STRICT:
            raise PDFTypeError('Float required: %r' % x)
        return 0.0
    return x


def num_value(x):
    """Resolve ``x`` and return it if ``isnumber`` accepts it.

    Otherwise raise PDFTypeError in strict mode, or return 0.
    """
    x = resolve1(x)
    if not isnumber(x):
        if settings.STRICT:
            raise PDFTypeError('Int or Float required: %r' % x)
        return 0
    return x


def uint_value(x, n_bits):
    """Resolve number and interpret it as a two's-complement unsigned number

    Non-negative values are returned unchanged; negative values have
    ``2**n_bits`` added to them.
    """
    x = int_value(x)
    # Zero is already a valid unsigned value and must not be wrapped
    # around to 2**n_bits.
    if x >= 0:
        return x
    return x + 2**n_bits


def str_value(x):
    """Resolve ``x`` and return it if it is a ``bytes`` object.

    Otherwise raise PDFTypeError in strict mode, or return ``b''`` so that
    the fallback has the same type as a valid result.
    """
    x = resolve1(x)
    if not isinstance(x, bytes):
        if settings.STRICT:
            raise PDFTypeError('String required: %r' % x)
        return b''
    return x


def list_value(x):
    """Resolve ``x`` and return it if it is a ``list`` or ``tuple``.

    Otherwise raise PDFTypeError in strict mode, or return ``[]``.
    """
    x = resolve1(x)
    if not isinstance(x, (list, tuple)):
        if settings.STRICT:
            raise PDFTypeError('List required: %r' % x)
        return []
    return x


def dict_value(x):
    """Resolve ``x`` and return it if it is a ``dict``.

    Otherwise log and raise PDFTypeError in strict mode, or return ``{}``.
    """
    x = resolve1(x)
    if not isinstance(x, dict):
        if settings.STRICT:
            log.error('PDFTypeError : Dict required: %r', x)
            raise PDFTypeError('Dict required: %r' % x)
        return {}
    return x


def stream_value(x):
    """Resolve ``x`` and return it if it is a ``PDFStream``.

    Otherwise raise PDFTypeError in strict mode, or return an empty stream.
    """
    x = resolve1(x)
    if not isinstance(x, PDFStream):
        if settings.STRICT:
            raise PDFTypeError('PDFStream required: %r' % x)
        return PDFStream({}, b'')
    return x


class PDFStream(PDFObject):
    """A stream object: an attribute dictionary plus a byte payload.

    The payload is held in ``rawdata`` until ``decode`` runs; afterwards the
    decoded bytes are held in ``data`` and ``rawdata`` is set to None.
    """

    def __init__(self, attrs, rawdata, decipher=None):
        """Create a stream.

        Args:
            attrs: the stream dictionary; must be a ``dict``.
            rawdata: the undecoded stream payload.
            decipher: optional callable used by ``decode``, invoked as
                ``decipher(objid, genno, data, attrs)``.
        """
        assert isinstance(attrs, dict), str(type(attrs))
        self.attrs = attrs
        self.rawdata = rawdata
        self.decipher = decipher
        self.data = None
        self.objid = None
        self.genno = None

    def set_objid(self, objid, genno):
        """Record the object id and generation number of this stream."""
        self.objid = objid
        self.genno = genno

    def __repr__(self):
        if self.data is None:
            assert self.rawdata is not None
            return '<PDFStream(%r): raw=%d, %r>' % \
                (self.objid, len(self.rawdata), self.attrs)
        else:
            assert self.data is not None
            return '<PDFStream(%r): len=%d, %r>' % \
                (self.objid, len(self.data), self.attrs)

    def __contains__(self, name):
        return name in self.attrs

    def __getitem__(self, name):
        return self.attrs[name]

    def get(self, name, default=None):
        """Return ``attrs[name]``, or ``default`` if the key is absent."""
        return self.attrs.get(name, default)

    def get_any(self, names, default=None):
        """Return the value of the first key in ``names`` found in ``attrs``.

        Returns ``default`` if none of the keys is present.
        """
        for name in names:
            if name in self.attrs:
                return self.attrs[name]
        return default

    def get_filters(self):
        """Return the stream filters as a list of ``(filter, params)`` pairs.

        Returns an empty list when no filter entry is present.

        Raises:
            PDFException: in strict mode, if the number of parameter entries
                differs from the number of filters.
        """
        filters = self.get_any(('F', 'Filter'))
        params = self.get_any(('DP', 'DecodeParms', 'FDecodeParms'), {})
        if not filters:
            return []
        if not isinstance(filters, list):
            filters = [filters]
        if not isinstance(params, list):
            # Make sure the parameters list is the same as filters.
            params = [params] * len(filters)
        if settings.STRICT and len(params) != len(filters):
            raise PDFException("Parameters len filter mismatch")
        # resolve filter if possible
        _filters = []
        for fltr in filters:
            if hasattr(fltr, 'resolve'):
                # NOTE(review): this assumes the reference resolves to a
                # sequence whose first item is the filter name - confirm.
                fltr = fltr.resolve()[0]
            _filters.append(fltr)
        # return list solves https://github.com/pdfminer/pdfminer.six/issues/15
        return list(zip(_filters, params))

    def decode(self):
        """Decipher and decode ``rawdata`` into ``data``.

        Filters from ``get_filters`` are applied in order, each followed by
        its predictor if its parameters contain one. DCTDecode and
        JBIG2Decode payloads are left as they are. On success ``data``
        holds the result and ``rawdata`` is set to None.

        Raises:
            PDFException: in strict mode, if zlib decompression fails.
            PDFNotImplementedError: for the Crypt filter, an unknown filter,
                or an unsupported predictor.
        """
        assert self.data is None \
            and self.rawdata is not None, str((self.data, self.rawdata))
        data = self.rawdata
        if self.decipher:
            # Handle encryption
            data = self.decipher(self.objid, self.genno, data, self.attrs)
        # With no filters the loop body never runs and the (possibly
        # deciphered) payload is stored unchanged.
        for (f, params) in self.get_filters():
            if f in LITERALS_FLATE_DECODE:
                # will get errors if the document is encrypted.
                try:
                    data = zlib.decompress(data)
                except zlib.error as e:
                    if settings.STRICT:
                        error_msg = 'Invalid zlib bytes: {!r}, {!r}'\
                            .format(e, data)
                        raise PDFException(error_msg)
                    # Non-strict mode: fall back to an empty payload.
                    data = b''
            elif f in LITERALS_LZW_DECODE:
                data = lzwdecode(data)
            elif f in LITERALS_ASCII85_DECODE:
                data = ascii85decode(data)
            elif f in LITERALS_ASCIIHEX_DECODE:
                data = asciihexdecode(data)
            elif f in LITERALS_RUNLENGTH_DECODE:
                data = rldecode(data)
            elif f in LITERALS_CCITTFAX_DECODE:
                data = ccittfaxdecode(data, params)
            elif f in LITERALS_DCT_DECODE:
                # This is probably a JPG stream
                # it does not need to be decoded twice.
                # Just return the stream to the user.
                pass
            elif f in LITERALS_JBIG2_DECODE:
                pass
            elif f == LITERAL_CRYPT:
                # not yet..
                raise PDFNotImplementedError('/Crypt filter is unsupported')
            else:
                raise PDFNotImplementedError('Unsupported filter: %r' % f)
            # apply predictors
            if params and 'Predictor' in params:
                pred = int_value(params['Predictor'])
                if pred == 1:
                    # no predictor
                    pass
                elif 10 <= pred:
                    # PNG predictor
                    colors = int_value(params.get('Colors', 1))
                    columns = int_value(params.get('Columns', 1))
                    raw_bits_per_component = params.get('BitsPerComponent', 8)
                    bitspercomponent = int_value(raw_bits_per_component)
                    data = apply_png_predictor(pred, colors, columns,
                                               bitspercomponent, data)
                else:
                    error_msg = 'Unsupported predictor: %r' % pred
                    raise PDFNotImplementedError(error_msg)
        self.data = data
        self.rawdata = None

    def get_data(self):
        """Return the decoded payload, decoding it first if necessary."""
        if self.data is None:
            self.decode()
        return self.data

    def get_rawdata(self):
        """Return the undecoded payload (None once ``decode`` has run)."""
        return self.rawdata