"""Parser for the "marshal" file format.

This file is adapted from pypy/lib_pypy/_marshal.py.

This module contains functions that can read Python values stored in a binary
format (only loading is implemented here). The format is specific to Python,
but independent of machine architecture issues (e.g., you can write a Python
value to a file on a PC, transport the file to a Sun, and read it back there).
Details of the format may change between Python versions.
"""

import contextlib
import struct
import sys

from pytype import utils


TYPE_NULL = 0x30  # '0'
TYPE_NONE = 0x4e  # 'N'
TYPE_FALSE = 0x46  # 'F'
TYPE_TRUE = 0x54  # 'T'
TYPE_STOPITER = 0x53  # 'S'
TYPE_ELLIPSIS = 0x2e  # '.'
TYPE_INT = 0x69  # 'i'
TYPE_INT64 = 0x49  # 'I'
TYPE_FLOAT = 0x66  # 'f'
TYPE_BINARY_FLOAT = 0x67  # 'g'
TYPE_COMPLEX = 0x78  # 'x'
TYPE_BINARY_COMPLEX = 0x79  # 'y'
TYPE_LONG = 0x6c  # 'l'
TYPE_STRING = 0x73  # 's'
TYPE_INTERNED = 0x74  # 't'
TYPE_STRINGREF = 0x52  # 'R'
TYPE_TUPLE = 0x28  # '('
TYPE_LIST = 0x5b  # '['
TYPE_DICT = 0x7b  # '{'
TYPE_CODE = 0x63  # 'c'
TYPE_UNICODE = 0x75  # 'u'
TYPE_UNKNOWN = 0x3f  # '?', CPython uses this for error reporting
TYPE_SET = 0x3c  # '<'
TYPE_FROZENSET = 0x3e  # '>'
TYPE_REF = 0x72  # 'r'
TYPE_ASCII = 0x61  # 'a'
TYPE_ASCII_INTERNED = 0x41  # 'A'
TYPE_SMALL_TUPLE = 0x29  # ')'
TYPE_SHORT_ASCII = 0x7a  # 'z'
TYPE_SHORT_ASCII_INTERNED = 0x5a  # 'Z'

# Masks and values used by FORMAT_VALUE opcode.
FVC_MASK = 0x3
FVC_NONE = 0x0
FVC_STR = 0x1
FVC_REPR = 0x2
FVC_ASCII = 0x3
FVS_MASK = 0x4
FVS_HAVE_SPEC = 0x4

# Flag used by CALL_FUNCTION_EX
CALL_FUNCTION_EX_HAS_KWARGS = 0x1

# Flags used by MAKE_FUNCTION
MAKE_FUNCTION_HAS_POS_DEFAULTS = 0x1
MAKE_FUNCTION_HAS_KW_DEFAULTS = 0x2
MAKE_FUNCTION_HAS_ANNOTATIONS = 0x4
MAKE_FUNCTION_HAS_FREE_VARS = 0x8

# Or-ing this flag to one of the codes above will cause the decoded value to
# be stored in a reference table for later lookup. This feature was added in
# Python 3.4.
REF = 0x80


class _NULL:
  """Used internally, e.g. as a sentinel in dictionary entry lists."""


class CodeType:
  """Version-agnostic types.CodeType.

  Stores the fields of a code object as plain `co_*` attributes, plus the
  `python_version` the code object was loaded for.
  """

  # for co_flags:
  CO_OPTIMIZED = 0x0001
  CO_NEWLOCALS = 0x0002
  CO_VARARGS = 0x0004
  CO_VARKEYWORDS = 0x0008
  CO_NESTED = 0x0010
  CO_GENERATOR = 0x0020
  CO_NOFREE = 0x0040
  CO_COROUTINE = 0x0080
  CO_ITERABLE_COROUTINE = 0x0100
  CO_ASYNC_GENERATOR = 0x0200
  CO_FUTURE_DIVISION = 0x2000
  CO_FUTURE_ABSOLUTE_IMPORT = 0x4000
  CO_FUTURE_WITH_STATEMENT = 0x8000
  CO_FUTURE_PRINT_FUNCTION = 0x10000
  CO_FUTURE_UNICODE_LITERALS = 0x20000

  def __init__(self, argcount, posonlyargcount, kwonlyargcount, nlocals,
               stacksize, flags, code, consts, names, varnames, filename,
               name, firstlineno, lnotab, freevars, cellvars, python_version):
    """Store the given values on the corresponding `co_*` attributes.

    Only nlocals, stacksize, flags (int) and filename (bytes or str) are
    type-checked, via assertions; every other argument is stored as given.
    """
    assert isinstance(nlocals, int)
    assert isinstance(stacksize, int)
    assert isinstance(flags, int)
    assert isinstance(filename, (bytes, str))
    self.co_argcount = argcount
    self.co_posonlyargcount = posonlyargcount
    self.co_kwonlyargcount = kwonlyargcount
    self.co_nlocals = nlocals
    self.co_stacksize = stacksize
    self.co_flags = flags
    self.co_code = code
    self.co_consts = consts
    self.co_names = names
    self.co_varnames = varnames
    self.co_filename = filename
    self.co_name = name
    self.co_firstlineno = firstlineno
    self.co_lnotab = lnotab
    self.co_freevars = freevars
    self.co_cellvars = cellvars
    self.python_version = python_version  # This field is not in types.CodeType.

  def __repr__(self):
    return f'<code: {self.co_name}>'


class _LoadMarshal:
  """Stateful loader for marshalled files.

  Holds the input buffer, a read cursor (`bufpos`), the reference table used
  by REF-flagged values (`refs`) and the table of interned strings used by
  TYPE_STRINGREF (`_stringtable`).
  """

  def __init__(self, data, python_version):
    self.bufstr = data
    self.bufpos = 0
    self.python_version = python_version
    self.refs = []
    self._stringtable = []
    # When running under Python 3 and analyzing Python 2, whether load_string
    # should convert bytes to native strings.
    # NOTE: nothing in this file reads this flag; load_string always returns
    # bytes. It is only toggled by keep_bytes().
    self._keep_bytes = False

  @contextlib.contextmanager
  def keep_bytes(self):
    """Context manager that sets `_keep_bytes` to True for its duration.

    The previous value is restored on exit, including when the body raises.

    Yields:
      None.
    """
    old = self._keep_bytes
    self._keep_bytes = True
    try:
      yield
    finally:
      self._keep_bytes = old

  def eof(self):
    """Return True if we reached the end of the stream."""
    return self.bufpos == len(self.bufstr)

  def load(self):
    """Load an encoded Python data structure.

    Reads one type-code byte and dispatches to the matching load_* method.

    Returns:
      The decoded value.

    Raises:
      ValueError: If the type code has no entry in the dispatch table.
      EOFError: If the buffer ends prematurely. Any IndexError raised while
        decoding (including an out-of-range reference index) is also reported
        as EOFError.
    """
    c = ord('?')  # make pylint happy
    try:
      c = self._read_byte()
      if c & REF:
        # This element might recursively contain other elements, which
        # themselves store things in the refs table. So we need to determine the
        # index position *before* reading the contents of this element.
        idx = self._reserve_ref()
        result = _LoadMarshal.dispatch[c & ~REF](self)
        self.refs[idx] = result
      else:
        result = _LoadMarshal.dispatch[c](self)
      return result
    except KeyError as e:
      raise ValueError(f'bad marshal code: {chr(c)!r} ({c:02x})') from e
    except IndexError as e:
      raise EOFError() from e

  def _read(self, n):
    """Read n bytes as a string.

    Args:
      n: Number of bytes to read; must not be negative.

    Returns:
      The slice of the buffer covering the next n bytes.

    Raises:
      ValueError: If n is negative (corrupt length field).
      EOFError: If fewer than n bytes remain in the buffer.
    """
    if n < 0:
      # A negative length would silently move the cursor backwards.
      raise ValueError('bad marshal data (negative length: %d)' % n)
    pos = self.bufpos
    self.bufpos += n
    if self.bufpos > len(self.bufstr):
      raise EOFError()
    return self.bufstr[pos : self.bufpos]

  def _read_byte(self):
    """Read an unsigned byte.

    Running off the end of the buffer raises IndexError, which load() turns
    into EOFError.
    """
    pos = self.bufpos
    self.bufpos += 1
    return self.bufstr[pos]

  def _read_short(self):
    """Read a signed 16 bit word."""
    lo = self._read_byte()
    hi = self._read_byte()
    x = lo | (hi << 8)
    if x & 0x8000:
      # sign extension
      x -= 0x10000
    return x

  def _read_long(self):
    """Read a signed 32 bit word."""
    return int.from_bytes(self._read(4), 'little', signed=True)

  def _read_long64(self):
    """Read a signed 64 bit integer."""
    return int.from_bytes(self._read(8), 'little', signed=True)

  def _reserve_ref(self):
    """Reserve one entry in the reference table.

    This is done before reading an element, because reading an element and
    all its subelements might change the size of the reference table.

    Returns:
      Reserved index position in the reference table.
    """
    # See r_ref_reserve in Python-3.4/Python/marshal.c
    idx = len(self.refs)
    self.refs.append(None)
    return idx

  # pylint: disable=missing-docstring
  # This is a bunch of small methods with self-explanatory names.

  def load_null(self):
    return _NULL

  def load_none(self):
    return None

  def load_true(self):
    return True

  def load_false(self):
    return False

  def load_stopiter(self):
    return StopIteration

  def load_ellipsis(self):
    return Ellipsis

  def load_int(self):
    return self._read_long()

  def load_int64(self):
    return self._read_long64()

  def load_long(self):
    """Load a variable length integer."""
    # The sign of `size` carries the sign of the number; abs(size) is the
    # number of 16-bit words that follow, each contributing 15 bits.
    size = self._read_long()
    x = 0
    for i in range(abs(size)):
      d = self._read_short()
      x |= d << (i * 15)
    return x if size >= 0 else -x

  def load_float(self):
    """Load a float stored as a one-byte length followed by its text form."""
    n = self._read_byte()
    s = self._read(n)
    return float(s)

  def load_binary_float(self):
    """Load a float stored as 8 bytes, little-endian."""
    binary = self._read(8)
    return struct.unpack('<d', binary)[0]

  def load_complex(self):
    """Load a complex stored as two length-prefixed text floats (real, imag)."""
    n = self._read_byte()
    s = self._read(n)
    real = float(s)
    n = self._read_byte()
    s = self._read(n)
    imag = float(s)
    return complex(real, imag)

  def load_binary_complex(self):
    """Load a complex stored as two 8-byte little-endian doubles."""
    binary = self._read(16)
    # Explicit '<' so decoding does not depend on the host byte order,
    # consistent with load_binary_float.
    return complex(*struct.unpack('<dd', binary))

  def load_string(self):
    """Load a length-prefixed byte string; always returns bytes."""
    n = self._read_long()
    s = bytes(self._read(n))
    return s

  def load_interned(self):
    """Load an interned string and record it for later TYPE_STRINGREF use."""
    n = self._read_long()
    s = self._read(n)
    ret = sys.intern(utils.native_str(s))
    self._stringtable.append(ret)
    return ret

  def load_stringref(self):
    """Return a previously interned string by its index in the string table."""
    n = self._read_long()
    return self._stringtable[n]

  def load_unicode(self):
    n = self._read_long()
    s = self._read(n)
    # We need to convert bytes to a unicode string.
    # We use the 'backslashreplace' error mode in order to handle non-utf8
    # backslash-escaped string literals correctly.
    s = s.decode('utf8', 'backslashreplace')
    return s

  def load_ascii(self):
    n = self._read_long()
    return utils.native_str(self._read(n))

  def load_short_ascii(self):
    # Same as load_ascii, but the length prefix is a single byte.
    n = self._read_byte()
    return utils.native_str(self._read(n))

  def load_tuple(self):
    return tuple(self.load_list())

  def load_small_tuple(self):
    # Same as load_tuple, but the element count is a single byte.
    n = self._read_byte()
    return tuple(self.load() for _ in range(n))

  def load_list(self):
    n = self._read_long()
    return [self.load() for _ in range(n)]

  def load_dict(self):
    """Load key/value pairs until a TYPE_NULL key terminates the dict."""
    d = {}
    while True:
      key = self.load()
      if key is _NULL:
        break
      value = self.load()
      d[key] = value
    return d

  def load_code(self):
    """Load a Python code object.

    The set of fields read depends on `self.python_version`; fields that are
    absent in the given version (posonlyargcount, kwonlyargcount) are set to
    -1.

    Returns:
      A CodeType instance.
    """
    argcount = self._read_long()
    # Python 3.8+ has positional only arguments.
    if self.python_version >= (3, 8):
      posonlyargcount = self._read_long()
    else:
      posonlyargcount = -1
    if self.python_version[0] >= 3:
      kwonlyargcount = self._read_long()
    else:
      kwonlyargcount = -1
    nlocals = self._read_long()
    stacksize = self._read_long()
    flags = self._read_long()
    with self.keep_bytes():
      # The code field is a 'string of raw compiled bytecode'
      # (https://docs.python.org/3/library/inspect.html#types-and-members).
      code = self.load()
    consts = self.load()
    names = self.load()
    varnames = self.load()
    freevars = self.load()
    cellvars = self.load()
    filename = self.load()
    name = self.load()
    firstlineno = self._read_long()
    with self.keep_bytes():
      # lnotab, from
      # https://github.com/python/cpython/blob/master/Objects/lnotab_notes.txt:
      # 'an array of unsigned bytes disguised as a Python bytes object'.
      lnotab = self.load()
    return CodeType(argcount, posonlyargcount, kwonlyargcount, nlocals,
                    stacksize, flags, code, consts, names, varnames, filename,
                    name, firstlineno, lnotab, freevars, cellvars,
                    self.python_version)

  def load_set(self):
    n = self._read_long()
    return {self.load() for _ in range(n)}

  def load_frozenset(self):
    n = self._read_long()
    return frozenset(self.load() for _ in range(n))

  def load_ref(self):
    """Return a previously loaded REF-flagged value by its table index."""
    n = self._read_long()
    return self.refs[n]

  # pylint: enable=missing-docstring

  # Maps a type-code byte (with the REF flag already masked off) to the
  # unbound loader method that decodes that kind of value.
  dispatch = {
      TYPE_ASCII: load_ascii,
      TYPE_ASCII_INTERNED: load_ascii,
      TYPE_BINARY_COMPLEX: load_binary_complex,
      TYPE_BINARY_FLOAT: load_binary_float,
      TYPE_CODE: load_code,
      TYPE_COMPLEX: load_complex,
      TYPE_DICT: load_dict,
      TYPE_ELLIPSIS: load_ellipsis,
      TYPE_FALSE: load_false,
      TYPE_FLOAT: load_float,
      TYPE_FROZENSET: load_frozenset,
      TYPE_INT64: load_int64,
      TYPE_INT: load_int,
      TYPE_INTERNED: load_interned,
      TYPE_LIST: load_list,
      TYPE_LONG: load_long,
      TYPE_NONE: load_none,
      TYPE_NULL: load_null,
      TYPE_REF: load_ref,
      TYPE_SET: load_set,
      TYPE_SHORT_ASCII: load_short_ascii,
      TYPE_SHORT_ASCII_INTERNED: load_short_ascii,
      TYPE_SMALL_TUPLE: load_small_tuple,
      TYPE_STOPITER: load_stopiter,
      TYPE_STRING: load_string,
      TYPE_STRINGREF: load_stringref,
      TYPE_TRUE: load_true,
      TYPE_TUPLE: load_tuple,
      TYPE_UNICODE: load_unicode,
  }


def loads(s, python_version):
  """Decode a single marshalled value from a buffer.

  Args:
    s: The marshalled data (indexing it must yield ints, e.g. bytes).
    python_version: Version tuple of the Python that produced the data; it
      selects which code-object fields are read.

  Returns:
    The decoded value.

  Raises:
    BufferError: If data remains in the buffer after the value was decoded.
    ValueError: If the data contains an unknown type code.
    EOFError: If the data ends prematurely.
  """
  um = _LoadMarshal(s, python_version)
  result = um.load()
  if not um.eof():
    raise BufferError('trailing bytes in marshal data')
  return result