1"""Parser for the "marshal" file format.
2
3This file is adapted from pypy/lib_pypy/_marshal.py.
4
5This module contains functions that can read and write Python values in a binary
6format. The format is specific to Python, but independent of machine
architecture issues (e.g., you can write a Python value to a file on a PC,
transport the file to a Sun, and read it back there).
9Details of the format may change between Python versions.
10"""
11
12import contextlib
13import struct
14import sys
15
16from pytype import utils
17
18
# Marshal type codes. _LoadMarshal.load() reads one of these bytes (optionally
# or-ed with REF, see below) in front of every value and uses it to look up the
# loader method in _LoadMarshal.dispatch. The trailing comment on each line
# shows the code as an ASCII character.
TYPE_NULL = 0x30  # '0'
TYPE_NONE = 0x4e  # 'N'
TYPE_FALSE = 0x46  # 'F'
TYPE_TRUE = 0x54  # 'T'
TYPE_STOPITER = 0x53  # 'S'
TYPE_ELLIPSIS = 0x2e  # '.'
TYPE_INT = 0x69  # 'i'
TYPE_INT64 = 0x49  # 'I'
TYPE_FLOAT = 0x66  # 'f'
TYPE_BINARY_FLOAT = 0x67  # 'g'
TYPE_COMPLEX = 0x78  # 'x'
TYPE_BINARY_COMPLEX = 0x79  # 'y'
TYPE_LONG = 0x6c  # 'l'
TYPE_STRING = 0x73  # 's'
TYPE_INTERNED = 0x74  # 't'
TYPE_STRINGREF = 0x52  # 'R'
TYPE_TUPLE = 0x28  # '('
TYPE_LIST = 0x5b  # '['
TYPE_DICT = 0x7b  # '{'
TYPE_CODE = 0x63  # 'c'
TYPE_UNICODE = 0x75  # 'u'
TYPE_UNKNOWN = 0x3f  # '?', CPython uses this for error reporting
TYPE_SET = 0x3c  # '<'
TYPE_FROZENSET = 0x3e  # '>'
TYPE_REF = 0x72  # 'r'
TYPE_ASCII = 0x61  # 'a'
TYPE_ASCII_INTERNED = 0x41  # 'A'
TYPE_SMALL_TUPLE = 0x29  # ')'
TYPE_SHORT_ASCII = 0x7a  # 'z'
TYPE_SHORT_ASCII_INTERNED = 0x5a  # 'Z'

# Masks and values used by FORMAT_VALUE opcode.
# FVC_* values are selected with FVC_MASK; FVS_HAVE_SPEC is selected with
# FVS_MASK.
FVC_MASK = 0x3
FVC_NONE = 0x0
FVC_STR = 0x1
FVC_REPR = 0x2
FVC_ASCII = 0x3
FVS_MASK = 0x4
FVS_HAVE_SPEC = 0x4

# Flag used by CALL_FUNCTION_EX
CALL_FUNCTION_EX_HAS_KWARGS = 0x1

# Flags used by MAKE_FUNCTION
MAKE_FUNCTION_HAS_POS_DEFAULTS = 0x1
MAKE_FUNCTION_HAS_KW_DEFAULTS = 0x2
MAKE_FUNCTION_HAS_ANNOTATIONS = 0x4
MAKE_FUNCTION_HAS_FREE_VARS = 0x8

# Or-ing this flag into one of the TYPE_* codes above will cause the decoded
# value to be stored in a reference table (_LoadMarshal.refs) for later lookup
# through a TYPE_REF entry. This feature was added in Python 3.4.
REF = 0x80
72
73
class _NULL:
  """Sentinel produced for TYPE_NULL entries.

  Used internally, e.g. as the marker that terminates the key/value entry list
  of a marshalled dictionary. The class object itself is the sentinel; it is
  compared by identity and never instantiated by the loader.
  """
76
77
class CodeType:
  """Version-agnostic types.CodeType.

  A plain record holding the fields of a code object under their usual `co_*`
  names, plus the Python version the code object belongs to.
  """

  # Bit values for co_flags.
  CO_OPTIMIZED = 1 << 0
  CO_NEWLOCALS = 1 << 1
  CO_VARARGS = 1 << 2
  CO_VARKEYWORDS = 1 << 3
  CO_NESTED = 1 << 4
  CO_GENERATOR = 1 << 5
  CO_NOFREE = 1 << 6
  CO_COROUTINE = 1 << 7
  CO_ITERABLE_COROUTINE = 1 << 8
  CO_ASYNC_GENERATOR = 1 << 9
  CO_FUTURE_DIVISION = 1 << 13
  CO_FUTURE_ABSOLUTE_IMPORT = 1 << 14
  CO_FUTURE_WITH_STATEMENT = 1 << 15
  CO_FUTURE_PRINT_FUNCTION = 1 << 16
  CO_FUTURE_UNICODE_LITERALS = 1 << 17

  def __init__(self, argcount, posonlyargcount, kwonlyargcount, nlocals,
               stacksize, flags, code, consts, names, varnames, filename,
               name, firstlineno, lnotab, freevars, cellvars, python_version):
    """Store every argument on the instance under its `co_*` attribute name.

    nlocals, stacksize and flags must be ints and filename must be bytes or
    str; this is checked with assertions.
    """
    # Sanity checks, in the same order as the parameters appear.
    for int_field in (nlocals, stacksize, flags):
      assert isinstance(int_field, int)
    assert isinstance(filename, (bytes, str))

    # Argument counts.
    self.co_argcount = argcount
    self.co_posonlyargcount = posonlyargcount
    self.co_kwonlyargcount = kwonlyargcount

    # Frame layout and flags.
    self.co_nlocals = nlocals
    self.co_stacksize = stacksize
    self.co_flags = flags

    # Bytecode and the tables it refers to.
    self.co_code = code
    self.co_consts = consts
    self.co_names = names
    self.co_varnames = varnames
    self.co_freevars = freevars
    self.co_cellvars = cellvars

    # Source location information.
    self.co_filename = filename
    self.co_name = name
    self.co_firstlineno = firstlineno
    self.co_lnotab = lnotab

    # This field is not in types.CodeType.
    self.python_version = python_version

  def __repr__(self):
    return '<code: {}>'.format(self.co_name)
125
126
class _LoadMarshal:
  """Stateful loader for marshalled files.

  The loader keeps a cursor into an in-memory buffer and decodes one value per
  call to load(). Every value starts with a type code byte that selects one of
  the load_* methods through the `dispatch` table.

  Attributes:
    bufstr: The marshalled data. Indexing it must yield ints and slicing it
      must yield something accepted by bytes() (e.g. a bytes object).
    bufpos: Offset into bufstr of the next unread byte.
    python_version: Version tuple, e.g. (3, 8), of the Python that produced
      the data. It decides which fields load_code() expects.
    refs: Values whose type code carried the REF flag, in the order their
      slots were reserved. TYPE_REF entries index into this list.
  """

  def __init__(self, data, python_version):
    self.bufstr = data
    self.bufpos = 0
    self.python_version = python_version
    self.refs = []
    # Strings decoded by load_interned(), indexed by TYPE_STRINGREF entries.
    self._stringtable = []
    # When running under Python 3 and analyzing Python 2, whether load_string
    # should convert bytes to native strings.
    # NOTE(review): no loader in this class currently reads this flag;
    # load_string() always returns bytes.
    self._keep_bytes = False

  @contextlib.contextmanager
  def keep_bytes(self):
    """Set the `_keep_bytes` flag for the duration of a `with` block.

    The previous value is restored on exit, also when the block raises, so a
    failed load cannot leave the flag stuck at True.

    Yields:
      Nothing.
    """
    old = self._keep_bytes
    self._keep_bytes = True
    try:
      yield
    finally:
      self._keep_bytes = old

  def eof(self):
    """Return True if we reached the end of the stream."""
    return self.bufpos == len(self.bufstr)

  def load(self):
    """Load an encoded Python data structure.

    Reads the type code byte at the cursor, then hands over to the matching
    load_* method, which may call load() recursively for nested values.

    Returns:
      The decoded value.

    Raises:
      ValueError: If the type code is not in the dispatch table, or if the
        data contains a negative length field.
      EOFError: If the data ends prematurely. Any IndexError raised while
        decoding (including an out-of-range TYPE_REF or TYPE_STRINGREF index)
        is reported as EOFError as well.
    """
    c = ord('?')  # make pylint happy
    try:
      c = self._read_byte()
      if c & REF:
        # This element might recursively contain other elements, which
        # themselves store things in the refs table. So we need to determine the
        # index position *before* reading the contents of this element.
        idx = self._reserve_ref()
        result = _LoadMarshal.dispatch[c & ~REF](self)
        self.refs[idx] = result
      else:
        result = _LoadMarshal.dispatch[c](self)
      return result
    except KeyError as e:
      raise ValueError('bad marshal code: %r (%02x)' % (chr(c), c)) from e
    except IndexError as e:
      raise EOFError() from e

  def _read(self, n):
    """Read n bytes from the buffer and advance the cursor.

    Args:
      n: Number of bytes to read. Must not be negative.

    Returns:
      The slice of the buffer that was read.

    Raises:
      ValueError: If n is negative, which only happens for corrupt data.
      EOFError: If fewer than n bytes are left.
    """
    if n < 0:
      # A negative length would move the cursor backwards and make the loader
      # re-read bytes it has already consumed.
      raise ValueError('bad marshal data (negative size: %d)' % n)
    pos = self.bufpos
    self.bufpos += n
    if self.bufpos > len(self.bufstr):
      raise EOFError()
    return self.bufstr[pos : self.bufpos]

  def _read_byte(self):
    """Read an unsigned byte.

    Raises IndexError when the cursor is past the end of the buffer; load()
    converts that into EOFError.
    """
    pos = self.bufpos
    self.bufpos += 1
    return self.bufstr[pos]

  def _read_short(self):
    """Read a signed 16 bit word (little-endian)."""
    return int.from_bytes(self._read(2), 'little', signed=True)

  def _read_long(self):
    """Read a signed 32 bit word (little-endian)."""
    return int.from_bytes(self._read(4), 'little', signed=True)

  def _read_long64(self):
    """Read a signed 64 bit integer (little-endian)."""
    return int.from_bytes(self._read(8), 'little', signed=True)

  def _reserve_ref(self):
    """Reserve one entry in the reference table.

    This is done before reading an element, because reading an element and
    all its subelements might change the size of the reference table.

    Returns:
      Reserved index position in the reference table.
    """
    # See r_ref_reserve in Python-3.4/Python/marshal.c
    idx = len(self.refs)
    self.refs.append(None)
    return idx

  def load_null(self):
    """Return the _NULL sentinel class."""
    return _NULL

  def load_none(self):
    """Return None."""
    return None

  def load_true(self):
    """Return True."""
    return True

  def load_false(self):
    """Return False."""
    return False

  def load_stopiter(self):
    """Return the StopIteration class."""
    return StopIteration

  def load_ellipsis(self):
    """Return Ellipsis."""
    return Ellipsis

  def load_int(self):
    """Load a signed 32 bit integer."""
    return self._read_long()

  def load_int64(self):
    """Load a signed 64 bit integer."""
    return self._read_long64()

  def load_long(self):
    """Load a variable length integer.

    The value is stored as a signed 32 bit digit count followed by that many
    16 bit words, each contributing 15 bits, least significant digit first.
    The sign of the count is the sign of the result.
    """
    size = self._read_long()
    x = 0
    for i in range(abs(size)):
      d = self._read_short()
      x |= d<<(i*15)
    return x if size >= 0 else -x

  def load_float(self):
    """Load a float stored as text with a one-byte length prefix."""
    n = self._read_byte()
    s = self._read(n)
    return float(s)

  def load_binary_float(self):
    """Load a float stored as 8 bytes, little-endian."""
    binary = self._read(8)
    return struct.unpack('<d', binary)[0]

  def load_complex(self):
    """Load a complex number stored as two length-prefixed text floats."""
    n = self._read_byte()
    s = self._read(n)
    real = float(s)
    n = self._read_byte()
    s = self._read(n)
    imag = float(s)
    return complex(real, imag)

  def load_binary_complex(self):
    """Load a complex number stored as two 8-byte little-endian floats."""
    binary = self._read(16)
    # Explicit '<' so the result does not depend on the host byte order,
    # matching load_binary_float().
    return complex(*struct.unpack('<dd', binary))

  def load_string(self):
    """Load a byte string with a 32 bit length prefix; returns bytes."""
    n = self._read_long()
    s = bytes(self._read(n))
    return s

  def load_interned(self):
    """Load an interned string and record it for later TYPE_STRINGREF use."""
    n = self._read_long()
    s = self._read(n)
    ret = sys.intern(utils.native_str(s))
    self._stringtable.append(ret)
    return ret

  def load_stringref(self):
    """Return the previously interned string with the stored index."""
    n = self._read_long()
    return self._stringtable[n]

  def load_unicode(self):
    """Load a UTF-8 encoded string with a 32 bit length prefix."""
    n = self._read_long()
    s = self._read(n)
    # We need to convert bytes to a unicode string.
    # We use the 'backslashreplace' error mode in order to handle non-utf8
    # backslash-escaped string literals correctly.
    s = s.decode('utf8', 'backslashreplace')
    return s

  def load_ascii(self):
    """Load a string with a 32 bit length prefix via utils.native_str."""
    n = self._read_long()
    return utils.native_str(self._read(n))

  def load_short_ascii(self):
    """Load a string with a one-byte length prefix via utils.native_str."""
    n = self._read_byte()
    return utils.native_str(self._read(n))

  def load_tuple(self):
    """Load a tuple with a 32 bit element count."""
    return tuple(self.load_list())

  def load_small_tuple(self):
    """Load a tuple with a one-byte element count."""
    n = self._read_byte()
    return tuple([self.load() for _ in range(n)])

  def load_list(self):
    """Load a list with a 32 bit element count."""
    n = self._read_long()
    return [self.load() for _ in range(n)]

  def load_dict(self):
    """Load a dict stored as key/value pairs terminated by a TYPE_NULL key."""
    d = {}
    while True:
      key = self.load()
      if key is _NULL:
        break
      value = self.load()
      d[key] = value
    return d

  def load_code(self):
    """Load a Python code object.

    The fields are read in the order they are stored. Which count fields are
    present depends on self.python_version; absent ones are reported as -1.

    Returns:
      A CodeType instance.
    """
    argcount = self._read_long()
    # Python 3.8+ has positional only arguments.
    if self.python_version >= (3, 8):
      posonlyargcount = self._read_long()
    else:
      posonlyargcount = -1
    if self.python_version[0] >= 3:
      kwonlyargcount = self._read_long()
    else:
      kwonlyargcount = -1
    nlocals = self._read_long()
    stacksize = self._read_long()
    flags = self._read_long()
    with self.keep_bytes():
      # The code field is a 'string of raw compiled bytecode'
      # (https://docs.python.org/3/library/inspect.html#types-and-members).
      code = self.load()
    consts = self.load()
    names = self.load()
    varnames = self.load()
    freevars = self.load()
    cellvars = self.load()
    filename = self.load()
    name = self.load()
    firstlineno = self._read_long()
    with self.keep_bytes():
      # lnotab, from
      # https://github.com/python/cpython/blob/master/Objects/lnotab_notes.txt:
      # 'an array of unsigned bytes disguised as a Python bytes object'.
      lnotab = self.load()
    return CodeType(argcount, posonlyargcount, kwonlyargcount, nlocals,
                    stacksize, flags, code, consts, names, varnames, filename,
                    name, firstlineno, lnotab, freevars, cellvars,
                    self.python_version)

  def load_set(self):
    """Load a set with a 32 bit element count."""
    n = self._read_long()
    return {self.load() for _ in range(n)}

  def load_frozenset(self):
    """Load a frozenset with a 32 bit element count."""
    n = self._read_long()
    return frozenset([self.load() for _ in range(n)])

  def load_ref(self):
    """Return the entry of the reference table with the stored index."""
    n = self._read_long()
    return self.refs[n]

  # Maps a type code (with the REF flag already stripped) to the method that
  # decodes the value following it. The *_INTERNED ASCII codes share the
  # loaders of their non-interned counterparts.
  dispatch = {
      TYPE_ASCII: load_ascii,
      TYPE_ASCII_INTERNED: load_ascii,
      TYPE_BINARY_COMPLEX: load_binary_complex,
      TYPE_BINARY_FLOAT: load_binary_float,
      TYPE_CODE: load_code,
      TYPE_COMPLEX: load_complex,
      TYPE_DICT: load_dict,
      TYPE_ELLIPSIS: load_ellipsis,
      TYPE_FALSE: load_false,
      TYPE_FLOAT: load_float,
      TYPE_FROZENSET: load_frozenset,
      TYPE_INT64: load_int64,
      TYPE_INT: load_int,
      TYPE_INTERNED: load_interned,
      TYPE_LIST: load_list,
      TYPE_LONG: load_long,
      TYPE_NONE: load_none,
      TYPE_NULL: load_null,
      TYPE_REF: load_ref,
      TYPE_SET: load_set,
      TYPE_SHORT_ASCII: load_short_ascii,
      TYPE_SHORT_ASCII_INTERNED: load_short_ascii,
      TYPE_SMALL_TUPLE: load_small_tuple,
      TYPE_STOPITER: load_stopiter,
      TYPE_STRING: load_string,
      TYPE_STRINGREF: load_stringref,
      TYPE_TRUE: load_true,
      TYPE_TUPLE: load_tuple,
      TYPE_UNICODE: load_unicode,
  }
434
435
def loads(s, python_version):
  """Decode the single marshalled value stored in `s`.

  Args:
    s: Buffer holding the marshalled data.
    python_version: Version tuple of the Python that produced the data; it is
      passed on to the loader.

  Returns:
    The decoded value.

  Raises:
    BufferError: If bytes remain in `s` after the value has been decoded.
  """
  loader = _LoadMarshal(s, python_version)
  value = loader.load()
  if loader.eof():
    return value
  raise BufferError('trailing bytes in marshal data')
442