/* Peephole optimizations for bytecode compiler. */

#include "Python.h"

#include "Python-ast.h"
#include "node.h"
#include "ast.h"
#include "code.h"
#include "symtable.h"
#include "opcode.h"
#include "wordcode_helpers.h"

#define UNCONDITIONAL_JUMP(op)  (op==JUMP_ABSOLUTE || op==JUMP_FORWARD)
#define CONDITIONAL_JUMP(op) (op==POP_JUMP_IF_FALSE || op==POP_JUMP_IF_TRUE \
    || op==JUMP_IF_FALSE_OR_POP || op==JUMP_IF_TRUE_OR_POP)
#define ABSOLUTE_JUMP(op) (op==JUMP_ABSOLUTE \
    || op==POP_JUMP_IF_FALSE || op==POP_JUMP_IF_TRUE \
    || op==JUMP_IF_FALSE_OR_POP || op==JUMP_IF_TRUE_OR_POP || op==JUMP_IF_NOT_EXC_MATCH)
#define JUMPS_ON_TRUE(op) (op==POP_JUMP_IF_TRUE || op==JUMP_IF_TRUE_OR_POP)
#define GETJUMPTGT(arr, i) (get_arg(arr, i) / sizeof(_Py_CODEUNIT) + \
        (ABSOLUTE_JUMP(_Py_OPCODE(arr[i])) ? 0 : i+1))
#define ISBASICBLOCK(blocks, start, end) \
    (blocks[start]==blocks[end])
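/* Jump opargs are byte offsets; GETJUMPTGT converts the target into a
   code-unit index (relative jumps are measured from the instruction that
   follows).  ISBASICBLOCK is true when start and end fall in the same
   basic block, i.e. no jump target lies between them. */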


/* Scans back N consecutive LOAD_CONST instructions, skipping NOPs,
   returns the index of the Nth-last LOAD_CONST's first EXTENDED_ARG prefix
   (or of the LOAD_CONST itself if it has none).
   Callers are responsible for checking CONST_STACK_LEN beforehand.
*/
static Py_ssize_t
lastn_const_start(const _Py_CODEUNIT *codestr, Py_ssize_t i, Py_ssize_t n)
{
    assert(n > 0);
    for (;;) {
        i--;
        assert(i >= 0);
        if (_Py_OPCODE(codestr[i]) == LOAD_CONST) {
            if (!--n) {
                while (i > 0 && _Py_OPCODE(codestr[i-1]) == EXTENDED_ARG) {
                    i--;
                }
                return i;
            }
        }
        else {
            assert(_Py_OPCODE(codestr[i]) == EXTENDED_ARG);
        }
    }
}

/* Scans through EXTENDED_ARGs, seeking the index of the effective opcode */
static Py_ssize_t
find_op(const _Py_CODEUNIT *codestr, Py_ssize_t codelen, Py_ssize_t i)
{
    while (i < codelen && _Py_OPCODE(codestr[i]) == EXTENDED_ARG) {
        i++;
    }
    return i;
}

/* Given the index of the effective opcode,
   scan back to construct the oparg with EXTENDED_ARG */
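/* At most three EXTENDED_ARG prefixes can precede an instruction
   (opargs are limited to 32 bits), hence the fixed unrolling below. */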
static unsigned int
get_arg(const _Py_CODEUNIT *codestr, Py_ssize_t i)
{
    _Py_CODEUNIT word;
    unsigned int oparg = _Py_OPARG(codestr[i]);
    if (i >= 1 && _Py_OPCODE(word = codestr[i-1]) == EXTENDED_ARG) {
        oparg |= _Py_OPARG(word) << 8;
        if (i >= 2 && _Py_OPCODE(word = codestr[i-2]) == EXTENDED_ARG) {
            oparg |= _Py_OPARG(word) << 16;
            if (i >= 3 && _Py_OPCODE(word = codestr[i-3]) == EXTENDED_ARG) {
                oparg |= _Py_OPARG(word) << 24;
            }
        }
    }
    return oparg;
}

/* Fill the region with NOPs. */
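/* memset() writes the NOP opcode into both bytes of every code unit in
   the region; that is harmless because NOP ignores its oparg. */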
static void
fill_nops(_Py_CODEUNIT *codestr, Py_ssize_t start, Py_ssize_t end)
{
    memset(codestr + start, NOP, (end - start) * sizeof(_Py_CODEUNIT));
}

/* Given the index of the effective opcode,
   attempt to replace the argument, taking into account EXTENDED_ARG.
   Returns -1 on failure, or the new op index on success */
static Py_ssize_t
set_arg(_Py_CODEUNIT *codestr, Py_ssize_t i, unsigned int oparg)
{
    unsigned int curarg = get_arg(codestr, i);
    int curilen, newilen;
    if (curarg == oparg)
        return i;
    curilen = instrsize(curarg);
    newilen = instrsize(oparg);
    if (curilen < newilen) {
        return -1;
    }

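    /* Rewrite in place, starting at the first EXTENDED_ARG prefix; any
       code units freed by a shorter encoding are filled with NOPs. */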
    write_op_arg(codestr + i + 1 - curilen, _Py_OPCODE(codestr[i]), oparg, newilen);
    fill_nops(codestr, i + 1 - curilen + newilen, i + 1);
    return i-curilen+newilen;
}

/* Attempt to write op/arg at end of specified region of memory.
   Preceding memory in the region is overwritten with NOPs.
   Returns -1 on failure, op index on success */
static Py_ssize_t
copy_op_arg(_Py_CODEUNIT *codestr, Py_ssize_t i, unsigned char op,
            unsigned int oparg, Py_ssize_t maxi)
{
    int ilen = instrsize(oparg);
    if (i + ilen > maxi) {
        return -1;
    }
    write_op_arg(codestr + maxi - ilen, op, oparg, ilen);
    fill_nops(codestr, i, maxi - ilen);
    return maxi - 1;
}

/* Replace LOAD_CONST c1, LOAD_CONST c2 ... LOAD_CONST cn, BUILD_TUPLE n
   with    LOAD_CONST (c1, c2, ... cn).
   The consts table must still be in list form so that the
   new constant (c1, c2, ... cn) can be appended.
   Called with codestr pointing to the first LOAD_CONST.
*/
static Py_ssize_t
fold_tuple_on_constants(_Py_CODEUNIT *codestr, Py_ssize_t codelen,
                        Py_ssize_t c_start, Py_ssize_t opcode_end,
                        PyObject *consts, int n)
{
    /* Pre-conditions */
    assert(PyList_CheckExact(consts));

    /* Buildup new tuple of constants */
    PyObject *newconst = PyTuple_New(n);
    if (newconst == NULL) {
        return -1;
    }

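    /* Walk the n LOAD_CONST instructions starting at c_start (find_op skips
       any EXTENDED_ARG prefixes) and copy each constant into the new tuple. */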
    for (Py_ssize_t i = 0, pos = c_start; i < n; i++, pos++) {
        assert(pos < opcode_end);
        pos = find_op(codestr, codelen, pos);
        assert(_Py_OPCODE(codestr[pos]) == LOAD_CONST);

        unsigned int arg = get_arg(codestr, pos);
        PyObject *constant = PyList_GET_ITEM(consts, arg);
        Py_INCREF(constant);
        PyTuple_SET_ITEM(newconst, i, constant);
    }

    Py_ssize_t index = PyList_GET_SIZE(consts);
#if SIZEOF_SIZE_T > SIZEOF_INT
    if ((size_t)index >= UINT_MAX - 1) {
        Py_DECREF(newconst);
        PyErr_SetString(PyExc_OverflowError, "too many constants");
        return -1;
    }
#endif

    /* Append folded constant onto consts */
    if (PyList_Append(consts, newconst)) {
        Py_DECREF(newconst);
        return -1;
    }
    Py_DECREF(newconst);

    return copy_op_arg(codestr, c_start, LOAD_CONST,
                       (unsigned int)index, opcode_end);
}

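/* Map each instruction index to a basic-block number.  Instructions i and j
   belong to the same basic block iff blocks[i] == blocks[j]. */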
static unsigned int *
markblocks(_Py_CODEUNIT *code, Py_ssize_t len)
{
    unsigned int *blocks = PyMem_New(unsigned int, len);
    int i, j, opcode, blockcnt = 0;

    if (blocks == NULL) {
        PyErr_NoMemory();
        return NULL;
    }
    memset(blocks, 0, len*sizeof(int));

    /* Mark labels in the first pass */
    for (i = 0; i < len; i++) {
        opcode = _Py_OPCODE(code[i]);
        switch (opcode) {
            case FOR_ITER:
            case JUMP_FORWARD:
            case JUMP_IF_FALSE_OR_POP:
            case JUMP_IF_TRUE_OR_POP:
            case POP_JUMP_IF_FALSE:
            case POP_JUMP_IF_TRUE:
            case JUMP_IF_NOT_EXC_MATCH:
            case JUMP_ABSOLUTE:
            case SETUP_FINALLY:
            case SETUP_WITH:
            case SETUP_ASYNC_WITH:
                j = GETJUMPTGT(code, i);
                assert(j < len);
                blocks[j] = 1;
                break;
        }
    }
    /* Build block numbers in the second pass */
    for (i = 0; i < len; i++) {
        blockcnt += blocks[i];          /* increment blockcnt over labels */
        blocks[i] = blockcnt;
    }
    return blocks;
}

/* Perform basic peephole optimizations to components of a code object.
   The consts object should still be in list form to allow new constants
   to be appended.

   To keep the optimizer simple, it bails when the lineno table has complex
   encoding for gaps >= 255.

   Optimizations are restricted to simple transformations occurring within a
   single basic block.  All transformations keep the code size the same or
   smaller.  For those that reduce size, the gaps are initially filled with
   NOPs.  Later those NOPs are removed and the jump addresses retargeted in
   a single pass. */

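/* For example, an unconditional jump whose target is a RETURN_VALUE is
   rewritten below into RETURN_VALUE followed by NOPs; the final pass then
   strips the NOPs and retargets the remaining jumps. */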
PyObject *
PyCode_Optimize(PyObject *code, PyObject* consts, PyObject *names,
                PyObject *lnotab_obj)
{
    Py_ssize_t h, i, nexti, op_start, tgt;
    unsigned int j, nops;
    unsigned char opcode, nextop;
    _Py_CODEUNIT *codestr = NULL;
    unsigned char *lnotab;
    unsigned int cum_orig_offset, last_offset;
    Py_ssize_t tabsiz;
    // Count runs of consecutive LOAD_CONSTs
    unsigned int cumlc = 0, lastlc = 0;
    unsigned int *blocks = NULL;

    /* Bail out if an exception is set */
    if (PyErr_Occurred())
        goto exitError;

    /* Bypass optimization when the lnotab table is too complex */
    assert(PyBytes_Check(lnotab_obj));
    lnotab = (unsigned char*)PyBytes_AS_STRING(lnotab_obj);
    tabsiz = PyBytes_GET_SIZE(lnotab_obj);
    assert(tabsiz == 0 || Py_REFCNT(lnotab_obj) == 1);

    /* Don't optimize if lnotab contains instruction pointer delta larger
       than +255 (encoded as multiple bytes), just to keep the peephole optimizer
       simple. The optimizer leaves line number deltas unchanged. */

    for (i = 0; i < tabsiz; i += 2) {
        if (lnotab[i] == 255) {
            goto exitUnchanged;
        }
    }

    assert(PyBytes_Check(code));
    Py_ssize_t codesize = PyBytes_GET_SIZE(code);
    assert(codesize % sizeof(_Py_CODEUNIT) == 0);
    Py_ssize_t codelen = codesize / sizeof(_Py_CODEUNIT);
    if (codelen > INT_MAX) {
        /* Python assembler is limited to INT_MAX: see assembler.a_offset in
           compile.c. */
        goto exitUnchanged;
    }

    /* Make a modifiable copy of the code string */
    codestr = (_Py_CODEUNIT *)PyMem_Malloc(codesize);
    if (codestr == NULL) {
        PyErr_NoMemory();
        goto exitError;
    }
    memcpy(codestr, PyBytes_AS_STRING(code), codesize);

    blocks = markblocks(codestr, codelen);
    if (blocks == NULL)
        goto exitError;
    assert(PyList_Check(consts));

    for (i=find_op(codestr, codelen, 0) ; i<codelen ; i=nexti) {
        opcode = _Py_OPCODE(codestr[i]);
        op_start = i;
        while (op_start >= 1 && _Py_OPCODE(codestr[op_start-1]) == EXTENDED_ARG) {
            op_start--;
        }

        nexti = i + 1;
        while (nexti < codelen && _Py_OPCODE(codestr[nexti]) == EXTENDED_ARG)
            nexti++;
        nextop = nexti < codelen ? _Py_OPCODE(codestr[nexti]) : 0;

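        /* Here op_start..i spans the current instruction including any
           EXTENDED_ARG prefixes, nexti is the index of the next effective
           opcode, and nextop is that opcode (or 0 at the end of the code). */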
        lastlc = cumlc;
        cumlc = 0;

        switch (opcode) {
                /* Skip over LOAD_CONST trueconst
                   POP_JUMP_IF_FALSE xx.  This improves
                   "while 1" performance.  */
            case LOAD_CONST:
                cumlc = lastlc + 1;
                if (nextop != POP_JUMP_IF_FALSE  ||
                    !ISBASICBLOCK(blocks, op_start, i + 1)) {
                    break;
                }
                PyObject* cnt = PyList_GET_ITEM(consts, get_arg(codestr, i));
                int is_true = PyObject_IsTrue(cnt);
                if (is_true == -1) {
                    goto exitError;
                }
                if (is_true == 1) {
                    fill_nops(codestr, op_start, nexti + 1);
                    cumlc = 0;
                }
                break;

                /* Try to fold tuples of constants.
                   Skip over BUILD_SEQN 1 UNPACK_SEQN 1.
                   Replace BUILD_SEQN 2 UNPACK_SEQN 2 with ROT2.
                   Replace BUILD_SEQN 3 UNPACK_SEQN 3 with ROT3 ROT2. */
            case BUILD_TUPLE:
                j = get_arg(codestr, i);
                if (j > 0 && lastlc >= j) {
                    h = lastn_const_start(codestr, op_start, j);
                    if (ISBASICBLOCK(blocks, h, op_start)) {
                        h = fold_tuple_on_constants(codestr, codelen,
                                                    h, i+1, consts, j);
                        break;
                    }
                }
                if (nextop != UNPACK_SEQUENCE  ||
                    !ISBASICBLOCK(blocks, op_start, i + 1) ||
                    j != get_arg(codestr, nexti))
                    break;
                if (j < 2) {
                    fill_nops(codestr, op_start, nexti + 1);
                } else if (j == 2) {
                    codestr[op_start] = PACKOPARG(ROT_TWO, 0);
                    fill_nops(codestr, op_start + 1, nexti + 1);
                } else if (j == 3) {
                    codestr[op_start] = PACKOPARG(ROT_THREE, 0);
                    codestr[op_start + 1] = PACKOPARG(ROT_TWO, 0);
                    fill_nops(codestr, op_start + 2, nexti + 1);
                }
                break;

                /* Simplify conditional jump to conditional jump where the
                   result of the first test implies the success of a similar
                   test or the failure of the opposite test.
                   Arises in code like:
                   "a and b or c"
                   "(a and b) and c"
                   "(a or b) or c"
                   "(a or b) and c"
                   x:JUMP_IF_FALSE_OR_POP y   y:JUMP_IF_FALSE_OR_POP z
                      -->  x:JUMP_IF_FALSE_OR_POP z
                   x:JUMP_IF_FALSE_OR_POP y   y:JUMP_IF_TRUE_OR_POP z
                      -->  x:POP_JUMP_IF_FALSE y+1
                   where y+1 is the instruction following the second test.
                */
            case JUMP_IF_FALSE_OR_POP:
            case JUMP_IF_TRUE_OR_POP:
                h = get_arg(codestr, i) / sizeof(_Py_CODEUNIT);
                tgt = find_op(codestr, codelen, h);

                j = _Py_OPCODE(codestr[tgt]);
                if (CONDITIONAL_JUMP(j)) {
                    /* NOTE: all possible jumps here are absolute. */
                    if (JUMPS_ON_TRUE(j) == JUMPS_ON_TRUE(opcode)) {
                        /* The second jump will be taken iff the first is.
                           The current opcode inherits its target's
                           stack effect */
                        h = set_arg(codestr, i, get_arg(codestr, tgt));
                    } else {
                        /* The second jump is not taken if the first is (so
                           jump past it), and all conditional jumps pop their
                           argument when they're not taken (so change the
                           first jump to pop its argument when it's taken). */
                        Py_ssize_t arg = (tgt + 1);
                        /* cannot overflow: codelen <= INT_MAX */
                        assert((size_t)arg <= UINT_MAX / sizeof(_Py_CODEUNIT));
                        arg *= sizeof(_Py_CODEUNIT);
                        h = set_arg(codestr, i, (unsigned int)arg);
                        j = opcode == JUMP_IF_TRUE_OR_POP ?
                            POP_JUMP_IF_TRUE : POP_JUMP_IF_FALSE;
                    }

                    if (h >= 0) {
                        nexti = h;
                        codestr[nexti] = PACKOPARG(j, _Py_OPARG(codestr[nexti]));
                        break;
                    }
                }
                /* Intentional fallthrough */

                /* Replace jumps to unconditional jumps */
            case POP_JUMP_IF_FALSE:
            case POP_JUMP_IF_TRUE:
            case JUMP_FORWARD:
            case JUMP_ABSOLUTE:
                h = GETJUMPTGT(codestr, i);
                tgt = find_op(codestr, codelen, h);
                /* Replace JUMP_* to a RETURN into just a RETURN */
                if (UNCONDITIONAL_JUMP(opcode) &&
                    _Py_OPCODE(codestr[tgt]) == RETURN_VALUE) {
                    codestr[op_start] = PACKOPARG(RETURN_VALUE, 0);
                    fill_nops(codestr, op_start + 1, i + 1);
                } else if (UNCONDITIONAL_JUMP(_Py_OPCODE(codestr[tgt]))) {
                    size_t arg = GETJUMPTGT(codestr, tgt);
                    if (opcode == JUMP_FORWARD) { /* JMP_ABS can go backwards */
                        opcode = JUMP_ABSOLUTE;
                    } else if (!ABSOLUTE_JUMP(opcode)) {
                        if (arg < (size_t)(i + 1)) {
                            break;           /* No backward relative jumps */
                        }
                        arg -= i + 1;          /* Calc relative jump addr */
                    }
                    /* cannot overflow: codelen <= INT_MAX */
                    assert(arg <= (UINT_MAX / sizeof(_Py_CODEUNIT)));
                    arg *= sizeof(_Py_CODEUNIT);
                    copy_op_arg(codestr, op_start, opcode,
                                (unsigned int)arg, i + 1);
                }
                break;

                /* Remove unreachable ops after RETURN */
            case RETURN_VALUE:
                h = i + 1;
                while (h < codelen && ISBASICBLOCK(blocks, i, h))
                {
                    /* Leave SETUP_FINALLY and RERAISE in place to help find block limits. */
                    if (_Py_OPCODE(codestr[h]) == SETUP_FINALLY || _Py_OPCODE(codestr[h]) == RERAISE) {
                        while (h > i + 1 &&
                               _Py_OPCODE(codestr[h - 1]) == EXTENDED_ARG)
                        {
                            h--;
                        }
                        break;
                    }
                    h++;
                }
                if (h > i + 1) {
                    fill_nops(codestr, i + 1, h);
                    nexti = find_op(codestr, codelen, h);
                }
                break;
        }
    }

    /* Fixup lnotab */
    for (i = 0, nops = 0; i < codelen; i++) {
        size_t block = (size_t)i - nops;
        /* cannot overflow: codelen <= INT_MAX */
        assert(block <= UINT_MAX);
        /* original code offset => new code offset */
        blocks[i] = (unsigned int)block;
        if (_Py_OPCODE(codestr[i]) == NOP) {
            nops++;
        }
    }
    cum_orig_offset = 0;
    last_offset = 0;
    for (i=0 ; i < tabsiz ; i+=2) {
        unsigned int offset_delta, new_offset;
        cum_orig_offset += lnotab[i];
        assert(cum_orig_offset % sizeof(_Py_CODEUNIT) == 0);
        new_offset = blocks[cum_orig_offset / sizeof(_Py_CODEUNIT)] *
                sizeof(_Py_CODEUNIT);
        offset_delta = new_offset - last_offset;
        assert(offset_delta <= 255);
        lnotab[i] = (unsigned char)offset_delta;
        last_offset = new_offset;
    }

    /* Remove NOPs and fixup jump targets */
    for (op_start = i = h = 0; i < codelen; i++, op_start = i) {
        j = _Py_OPARG(codestr[i]);
        while (_Py_OPCODE(codestr[i]) == EXTENDED_ARG) {
            i++;
            j = j<<8 | _Py_OPARG(codestr[i]);
        }
        opcode = _Py_OPCODE(codestr[i]);
        switch (opcode) {
            case NOP:continue;

            case JUMP_ABSOLUTE:
            case POP_JUMP_IF_FALSE:
            case POP_JUMP_IF_TRUE:
            case JUMP_IF_FALSE_OR_POP:
            case JUMP_IF_TRUE_OR_POP:
            case JUMP_IF_NOT_EXC_MATCH:
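                /* blocks[] now maps old code-unit offsets to new ones; map the
                   absolute target and convert it back to a byte offset. */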
                j = blocks[j / sizeof(_Py_CODEUNIT)] * sizeof(_Py_CODEUNIT);
                break;

            case FOR_ITER:
            case JUMP_FORWARD:
            case SETUP_FINALLY:
            case SETUP_WITH:
            case SETUP_ASYNC_WITH:
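                /* Relative target: recompute the distance between the new
                   position of the old target and the new position of the
                   instruction that follows this one. */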
                j = blocks[j / sizeof(_Py_CODEUNIT) + i + 1] - blocks[i] - 1;
                j *= sizeof(_Py_CODEUNIT);
                break;
        }
        Py_ssize_t ilen = i - op_start + 1;
        if (instrsize(j) > ilen) {
            goto exitUnchanged;
        }
        /* If instrsize(j) < ilen, we'll emit EXTENDED_ARG 0 */
        if (ilen > 4) {
            /* Can only happen when PyCode_Optimize() is called with
               malformed bytecode. */
            goto exitUnchanged;
        }
        write_op_arg(codestr + h, opcode, j, (int)ilen);
        h += ilen;
    }
    assert(h + (Py_ssize_t)nops == codelen);

    PyMem_Free(blocks);
    code = PyBytes_FromStringAndSize((char *)codestr, h * sizeof(_Py_CODEUNIT));
    PyMem_Free(codestr);
    return code;

 exitError:
    code = NULL;

 exitUnchanged:
    Py_XINCREF(code);
    PyMem_Free(blocks);
    PyMem_Free(codestr);
    return code;
}