/*
 * %CopyrightBegin%
 *
 * Copyright Ericsson AB 2020-2020. All Rights Reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 *
 * %CopyrightEnd%
 */

#include <string>
#include <vector>
#include <unordered_map>
#include <map>

#ifndef ASMJIT_ASMJIT_H_INCLUDED
#    include <asmjit/asmjit.hpp>
#endif

extern "C"
{
#ifdef HAVE_CONFIG_H
#    include "config.h"
#endif

#include "sys.h"
#include "erl_vm.h"
#include "global.h"
#include "beam_catches.h"

#include "beam_asm.h"
}

#include "beam_jit_common.hpp"

using namespace asmjit;

class BeamAssembler : public ErrorHandler {
protected:
    /* Holds code and relocation information. */
    CodeHolder code;

    /* TODO: Want to change this to x86::Builder in order to be able to patch
     * the correct I into the code after code generation */
    x86::Assembler a;

    FileLogger logger;

    Section *rodata = nullptr;

    /* * * * * * * * * */

    /* Points at x_reg_array inside an ErtsSchedulerRegisters struct, allowing
     * the aux_regs field to be addressed with an 8-bit displacement. */
    const x86::Gp registers = x86::rbx;

#ifdef NATIVE_ERLANG_STACK
    /* The Erlang stack pointer. Note that it is an alias for RSP and is
     * therefore invalid while we're running on the runtime stack. */
    const x86::Gp E = x86::rsp;

    /* Cached copy of Erlang stack pointer used to speed up stack switches when
     * we know that the runtime doesn't read or modify the Erlang stack.
     *
     * If we find ourselves pressed for registers in the future, we could save
     * this in the same slot as `registers` as that can be trivially recomputed
     * from the top of the runtime stack. */
    const x86::Gp E_saved = x86::r12;

#else
    const x86::Gp E = x86::r12;
#endif

    const x86::Gp c_p = x86::r13;
    const x86::Gp FCALLS = x86::r14;
    const x86::Gp HTOP = x86::r15;

    /* Local copy of the active code index.
     *
     * This is set to ERTS_SAVE_CALLS_CODE_IX when save_calls is active, which
     * routes us to a common handler routine that calls save_calls before
     * jumping to the actual code. */
    const x86::Gp active_code_ix = x86::rbp;

#ifdef ERTS_MSACC_EXTENDED_STATES
    const x86::Mem erts_msacc_cache = getSchedulerRegRef(
            offsetof(ErtsSchedulerRegisters, aux_regs.d.erts_msacc_cache));
#endif

    /* * * * * * * * * */
#ifdef WIN32
    const x86::Gp ARG1 = x86::rcx;
    const x86::Gp ARG2 = x86::rdx;
    const x86::Gp ARG3 = x86::r8;
    const x86::Gp ARG4 = x86::r9;
    const x86::Gp ARG5 = x86::r10;
    const x86::Gp ARG6 = x86::r11;

    const x86::Gp ARG1d = x86::ecx;
    const x86::Gp ARG2d = x86::edx;
    const x86::Gp ARG3d = x86::r8d;
    const x86::Gp ARG4d = x86::r9d;
    const x86::Gp ARG5d = x86::r10d;
    const x86::Gp ARG6d = x86::r11d;
#else
    const x86::Gp ARG1 = x86::rdi;
    const x86::Gp ARG2 = x86::rsi;
    const x86::Gp ARG3 = x86::rdx;
    const x86::Gp ARG4 = x86::rcx;
    const x86::Gp ARG5 = x86::r8;
    const x86::Gp ARG6 = x86::r9;

    const x86::Gp ARG1d = x86::edi;
    const x86::Gp ARG2d = x86::esi;
    const x86::Gp ARG3d = x86::edx;
    const x86::Gp ARG4d = x86::ecx;
    const x86::Gp ARG5d = x86::r8d;
    const x86::Gp ARG6d = x86::r9d;
#endif

    const x86::Gp RET = x86::rax;
    const x86::Gp RETd = x86::eax;
    const x86::Gp RETb = x86::al;

    const x86::Mem TMP_MEM1q = getSchedulerRegRef(
            offsetof(ErtsSchedulerRegisters, aux_regs.d.TMP_MEM[0]));
    const x86::Mem TMP_MEM2q = getSchedulerRegRef(
            offsetof(ErtsSchedulerRegisters, aux_regs.d.TMP_MEM[1]));
    const x86::Mem TMP_MEM3q = getSchedulerRegRef(
            offsetof(ErtsSchedulerRegisters, aux_regs.d.TMP_MEM[2]));
    const x86::Mem TMP_MEM4q = getSchedulerRegRef(
            offsetof(ErtsSchedulerRegisters, aux_regs.d.TMP_MEM[3]));
    const x86::Mem TMP_MEM5q = getSchedulerRegRef(
            offsetof(ErtsSchedulerRegisters, aux_regs.d.TMP_MEM[4]));

    const x86::Mem TMP_MEM1d = getSchedulerRegRef(
            offsetof(ErtsSchedulerRegisters, aux_regs.d.TMP_MEM[0]),
            sizeof(Uint32));
    const x86::Mem TMP_MEM2d = getSchedulerRegRef(
            offsetof(ErtsSchedulerRegisters, aux_regs.d.TMP_MEM[1]),
            sizeof(Uint32));
    const x86::Mem TMP_MEM3d = getSchedulerRegRef(
            offsetof(ErtsSchedulerRegisters, aux_regs.d.TMP_MEM[2]),
            sizeof(Uint32));
    const x86::Mem TMP_MEM4d = getSchedulerRegRef(
            offsetof(ErtsSchedulerRegisters, aux_regs.d.TMP_MEM[3]),
            sizeof(Uint32));
    const x86::Mem TMP_MEM5d = getSchedulerRegRef(
            offsetof(ErtsSchedulerRegisters, aux_regs.d.TMP_MEM[4]),
            sizeof(Uint32));
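
    /* Illustrative sketch (not taken from any particular call site): the
     * scratch slots above can be used to preserve a value across code that
     * clobbers every general-purpose register, e.g.
     *
     *     a.mov(TMP_MEM1q, RET);
     *     ... register-clobbering code ...
     *     a.mov(RET, TMP_MEM1q);
     */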

    enum Distance { dShort, dLong };

public:
    static bool hasCpuFeature(uint32_t featureId);

    BeamAssembler() : code() {
        /* Setup with default code info */
        Error err = code.init(hostEnvironment());
        ERTS_ASSERT(!err && "Failed to init codeHolder");

        err = code.newSection(&rodata,
                              ".rodata",
                              SIZE_MAX,
                              Section::kFlagConst,
                              8);
        ERTS_ASSERT(!err && "Failed to create .rodata section");

        err = code.attach(&a);

        ERTS_ASSERT(!err && "Failed to attach codeHolder");
#ifdef DEBUG
        a.addValidationOptions(BaseEmitter::kValidationOptionAssembler);
#endif
        a.addEncodingOptions(BaseEmitter::kEncodingOptionOptimizeForSize);
        code.setErrorHandler(this);
    }

    BeamAssembler(const std::string &log) : BeamAssembler() {
        if (erts_jit_asm_dump) {
            setLogger(log + ".asm");
        }
    }

    ~BeamAssembler() {
        if (logger.file())
            fclose(logger.file());
    }

    void *getBaseAddress() {
        ASSERT(code.hasBaseAddress());
        return (void *)code.baseAddress();
    }

    size_t getOffset() {
        return a.offset();
    }

protected:
    void _codegen(JitAllocator *allocator,
                  const void **executable_ptr,
                  void **writable_ptr) {
        Error err = code.flatten();
        ERTS_ASSERT(!err && "Could not flatten code");
        err = code.resolveUnresolvedLinks();
        ERTS_ASSERT(!err && "Could not resolve all links");

        /* Verify that all labels are bound */
#ifdef DEBUG
        for (auto e : code.labelEntries()) {
            if (!e->isBound()) {
                erts_exit(ERTS_ABORT_EXIT, "Label %s is not bound", e->name());
            }
        }
#endif

        err = allocator->alloc(const_cast<void **>(executable_ptr),
                               writable_ptr,
                               code.codeSize() + 16);

        if (err == ErrorCode::kErrorTooManyHandles) {
            ERTS_ASSERT(!"Failed to allocate module code: "
                         "out of file descriptors");
        } else if (err) {
            ERTS_ASSERT(!"Failed to allocate module code");
        }

        code.relocateToBase((uint64_t)*executable_ptr);
        code.copyFlattenedData(*writable_ptr,
                               code.codeSize(),
                               CodeHolder::kCopyPadSectionBuffer);
#ifdef DEBUG
        if (FileLogger *l = dynamic_cast<FileLogger *>(code.logger()))
            if (FILE *f = l->file())
                fprintf(f, "; CODE_SIZE: %zd\n", code.codeSize());
#endif
    }

    void *getCode(Label label) {
        ASSERT(label.isValid());
        return (char *)getBaseAddress() + code.labelOffsetFromBase(label);
    }

    byte *getCode(char *labelName) {
        return (byte *)getCode(code.labelByName(labelName, strlen(labelName)));
    }

    void handleError(Error err, const char *message, BaseEmitter *origin) {
        comment(message);
        fflush(logger.file());
        ASSERT(0 && "Failed to encode instruction");
    }

    constexpr x86::Mem getRuntimeStackRef() const {
        int base = offsetof(ErtsSchedulerRegisters, aux_regs.d.runtime_stack);

        return getSchedulerRegRef(base);
    }

#if !defined(NATIVE_ERLANG_STACK)
#    ifdef JIT_HARD_DEBUG
    constexpr x86::Mem getInitialSPRef() const {
        int base = offsetof(ErtsSchedulerRegisters, initial_sp);

        return getSchedulerRegRef(base);
    }
#    endif

    constexpr x86::Mem getCPRef() const {
        return x86::qword_ptr(E);
    }
#endif

    constexpr x86::Mem getSchedulerRegRef(int offset,
                                          size_t size = sizeof(UWord)) const {
        const int x_reg_offset =
                offsetof(ErtsSchedulerRegisters, x_reg_array.d);

        /* The entire aux_regs field should be addressable with an 8-bit
         * displacement. */
        ERTS_CT_ASSERT(x_reg_offset <= 128);

        return x86::Mem(registers, offset - x_reg_offset, size);
    }
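
    /* A worked example of the displacement arithmetic above (illustrative
     * only; the concrete offsets depend on the ErtsSchedulerRegisters
     * layout): `registers` points at x_reg_array.d, so getXRef(0) becomes
     * `[rbx + 0]`, while a field that sits `x_reg_offset` bytes before the
     * X register array gets a small negative displacement, e.g.
     *
     *     getSchedulerRegRef(0) == x86::Mem(rbx, -x_reg_offset, 8)
     *
     * which is why aux_regs has to stay within an 8-bit displacement of
     * x_reg_array. */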

    constexpr x86::Mem getFRef(int index, size_t size = sizeof(UWord)) const {
        int base = offsetof(ErtsSchedulerRegisters, f_reg_array.d);
        int offset = index * sizeof(FloatDef);

        ASSERT(index >= 0 && index <= 1023);
        return getSchedulerRegRef(base + offset, size);
    }

    constexpr x86::Mem getXRef(int index, size_t size = sizeof(UWord)) const {
        int base = offsetof(ErtsSchedulerRegisters, x_reg_array.d);
        int offset = index * sizeof(Eterm);

        ASSERT(index >= 0 && index < ERTS_X_REGS_ALLOCATED);
        return getSchedulerRegRef(base + offset, size);
    }

    constexpr x86::Mem getYRef(int index, size_t size = sizeof(UWord)) const {
        ASSERT(index >= 0 && index <= 1023);

#ifdef NATIVE_ERLANG_STACK
        return x86::Mem(E, index * sizeof(Eterm), size);
#else
        return x86::Mem(E, (index + CP_SIZE) * sizeof(Eterm), size);
#endif
    }

    constexpr x86::Mem getCARRef(x86::Gp Src,
                                 size_t size = sizeof(UWord)) const {
        return x86::Mem(Src, -TAG_PRIMARY_LIST, size);
    }

    constexpr x86::Mem getCDRRef(x86::Gp Src,
                                 size_t size = sizeof(UWord)) const {
        return x86::Mem(Src, -TAG_PRIMARY_LIST + sizeof(Eterm), size);
    }
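
    /* Illustrative sketch of how the two accessors above fold the list tag
     * into the displacement (assuming the usual 2-bit primary tags where
     * TAG_PRIMARY_LIST == 1 and sizeof(Eterm) == 8):
     *
     *     x86::Gp cell = ARG1;          // holds a tagged cons pointer
     *     a.mov(ARG2, getCARRef(cell)); // head: [cell - 1]
     *     a.mov(ARG3, getCDRRef(cell)); // tail: [cell - 1 + 8]
     *
     * No separate untagging instruction is needed. */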

    void align_erlang_cp() {
        /* Align so that the current address forms a valid CP. */
        ERTS_CT_ASSERT(_CPMASK == 3);
        a.align(kAlignCode, 4);
        ASSERT(is_CP(a.offset()));
    }

    void load_x_reg_array(x86::Gp reg) {
        /* By definition. */
        a.mov(reg, registers);
    }

    void load_erl_bits_state(x86::Gp reg) {
        int offset =
                offsetof(ErtsSchedulerRegisters, aux_regs.d.erl_bits_state);

        a.lea(reg, getSchedulerRegRef(offset));
    }

    /* Ensure that the Erlang stack is used and the redzone is unused.
     * We combine those tests to minimize the number of instructions.
     */
    void emit_assert_redzone_unused() {
#ifdef JIT_HARD_DEBUG
        const int REDZONE_BYTES = S_REDZONE * sizeof(Eterm);
        Label ok = a.newLabel(), crash = a.newLabel();

        /* We modify the stack pointer itself to avoid having to spill into a
         * register, into TMP_MEM, or onto the stack. */
        a.sub(E, imm(REDZONE_BYTES));
        a.cmp(HTOP, E);
        a.short_().ja(crash);
        a.cmp(E, x86::qword_ptr(c_p, offsetof(Process, hend)));
        a.short_().jle(ok);

        a.bind(crash);
        a.ud2();

        a.bind(ok);
        a.add(E, imm(REDZONE_BYTES));
#endif
    }

    /*
     * Calls an Erlang function.
     */
    template<typename Any>
    void erlang_call(Any Target, const x86::Gp &spill) {
#ifdef NATIVE_ERLANG_STACK
        /* We use the Erlang stack as the native stack. We can use a
         * native `call` instruction. */
        emit_assert_redzone_unused();
        aligned_call(Target);
#else
        Label next = a.newLabel();

        /* Save the return CP on the stack. */
        a.lea(spill, x86::qword_ptr(next));
        a.mov(getCPRef(), spill);

        a.jmp(Target);

        /* Need to align this label in order for it to be recognized as
         * is_CP. */
        align_erlang_cp();
        a.bind(next);
#endif
    }

    /*
     * Calls the given address in a shared fragment, ensuring that the
     * redzone is unused and that the return address forms a valid
     * CP.
     */
    template<typename Any>
    void fragment_call(Any Target) {
        emit_assert_redzone_unused();

#if defined(JIT_HARD_DEBUG) && !defined(NATIVE_ERLANG_STACK)
        /* Verify that the stack has not grown. */
        Label next = a.newLabel();
        a.cmp(x86::rsp, getInitialSPRef());
        a.short_().je(next);
        a.ud2();
        a.bind(next);
#endif

        aligned_call(Target);
    }

    /*
     * Calls the given function pointer. In a debug build with
     * JIT_HARD_DEBUG defined, it will be enforced that the redzone is
     * unused.
     *
     * The return address will NOT be aligned, and thus will not form a
     * valid CP. That means the called code must not scan the stack in
     * any way: for example, it must not throw an exception, trigger a
     * garbage collection, or cause a context switch.
     */
    void safe_fragment_call(void (*Target)()) {
        emit_assert_redzone_unused();
        a.call(imm(Target));
    }

    template<typename FuncPtr>
    void aligned_call(FuncPtr(*target)) {
        /* Calls to absolute addresses (encoded in the address table) are
         * always 6 bytes long. */
        aligned_call(imm(target), 6);
    }

    void aligned_call(Label target) {
        /* Relative calls are always 5 bytes long. */
        aligned_call(target, 5);
    }

    template<typename OperandType>
    void aligned_call(OperandType target) {
        /* Other calls are variable size. While it would be nice to use this
         * method for pointer/label calls too, `asmjit` writes relocations into
         * the code buffer itself and overwriting them causes all kinds of
         * havoc. */
        size_t call_offset, call_size;

        call_offset = a.offset();
        a.call(target);

        call_size = a.offset() - call_offset;
        a.setOffset(call_offset);

        aligned_call(target, call_size);
    }

    /* Calls the given address, ensuring that the return address forms a valid
     * CP. */
    template<typename OperandType>
    void aligned_call(OperandType target, size_t size) {
        /* The return address must be 4-byte aligned to form a valid CP, so
         * we'll align according to the size of the call instruction. */
        ssize_t next_address = (a.offset() + size);

        ERTS_CT_ASSERT(_CPMASK == 3);
        if (next_address % 4) {
            ssize_t nop_count = 4 - next_address % 4;

            a.embed(nops[nop_count - 1], nop_count);
        }

#ifdef JIT_HARD_DEBUG
        /* TODO: When frame pointers are in place, assert (at runtime) that the
         * destination has a `push rbp; mov rbp, rsp` sequence. */
#endif

        a.call(target);
        ASSERT(is_CP(a.offset()));
    }
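
    /* A worked example of the padding above (numbers are illustrative): if
     * the current offset is 11 and the call encodes to 5 bytes, the return
     * address lands on 16, which is already 4-byte aligned and thus a valid
     * CP, so nothing extra is emitted. If the offset were 10, the return
     * address would land on 15, so a single one-byte NOP (nops[0]) is
     * embedded first to push it to 16. */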

    /* Canned instruction sequences for multi-byte NOPs */
    static const uint8_t *nops[3];
    static const uint8_t nop1[1];
    static const uint8_t nop2[2];
    static const uint8_t nop3[3];

    void runtime_call(x86::Gp func, unsigned args) {
        ASSERT(args < 5);

        emit_assert_runtime_stack();

#ifdef WIN32
        a.sub(x86::rsp, imm(4 * sizeof(UWord)));
        a.call(func);
        a.add(x86::rsp, imm(4 * sizeof(UWord)));
#else
        a.call(func);
#endif
    }

    template<typename T>
    struct function_arity;
    template<typename T, typename... Args>
    struct function_arity<T(Args...)>
            : std::integral_constant<int, sizeof...(Args)> {};
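
    /* For example, `function_arity<Eterm(Process *, Eterm)>()` evaluates to
     * 2 at compile time; the template below uses this to reject calls whose
     * stated arity doesn't match the C function's actual signature. */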

    template<int expected_arity, typename T>
    void runtime_call(T(*func)) {
        static_assert(expected_arity == function_arity<T>());

        emit_assert_runtime_stack();

#ifdef WIN32
        unsigned pushed;
        switch (expected_arity) {
        case 6:
        case 5:
            /* We push ARG6 to keep the stack aligned even when we only have 5
             * arguments. It does no harm, and is slightly more compact than
             * sub/push/sub. */
            a.push(ARG6);
            a.push(ARG5);
            a.sub(x86::rsp, imm(4 * sizeof(UWord)));
            pushed = 6;
            break;
        default:
            a.sub(x86::rsp, imm(4 * sizeof(UWord)));
            pushed = 4;
        }

#endif

        a.call(imm(func));

#ifdef WIN32
        a.add(x86::rsp, imm(pushed * sizeof(UWord)));
#endif
    }
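
    /* Illustrative use of the checked variant above (the callee name and
     * signature are hypothetical):
     *
     *     extern "C" Eterm my_helper(Process *c_p, Eterm term);
     *
     *     a.mov(ARG1, c_p);
     *     a.mov(ARG2, RET);
     *     runtime_call<2>(my_helper);   // an arity mismatch fails to compile
     *
     * On Win64 the wrapper also reserves the 32-byte shadow space that the
     * calling convention requires. */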

    template<typename T>
    void abs_jmp(T(*addr)) {
        a.jmp(imm(addr));
    }

    /* Explicitly position-independent absolute jump, for use in fragments that
     * need to be memcpy'd for performance reasons (e.g. export entries) */
    template<typename T>
    void pic_jmp(T(*addr)) {
        a.mov(ARG6, imm(addr));
        a.jmp(ARG6);
    }

    constexpr x86::Mem getArgRef(const ArgVal &val,
                                 size_t size = sizeof(UWord)) const {
        switch (val.getType()) {
        case ArgVal::TYPE::l:
            return getFRef(val.getValue(), size);
        case ArgVal::TYPE::x:
            return getXRef(val.getValue(), size);
        case ArgVal::TYPE::y:
            return getYRef(val.getValue(), size);
        default:
            ERTS_ASSERT(!"NYI");
            return x86::Mem();
        }
    }

    /* Returns the current code address for the export entry in `Src`.
     *
     * Export tracing, save_calls, etc. are implemented by shared fragments
     * that assume that the export entry is in RET, so we have to copy it over
     * if it isn't there already. */
    x86::Mem emit_setup_export_call(const x86::Gp &Src) {
        return emit_setup_export_call(Src, active_code_ix);
    }

    x86::Mem emit_setup_export_call(const x86::Gp &Src,
                                    const x86::Gp &CodeIndex) {
        if (RET != Src) {
            a.mov(RET, Src);
        }

        return x86::qword_ptr(RET, CodeIndex, 3, offsetof(Export, addresses));
    }
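
    /* The memory operand returned above is, in effect (assuming 8-byte code
     * pointers, hence the scale shift of 3):
     *
     *     qword [RET + CodeIndex * 8 + offsetof(Export, addresses)]
     *
     * i.e. the code address for the active code index, suitable as the
     * target of an indirect call or jump. */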

    /* Discards a continuation pointer, including the frame pointer if
     * applicable. */
    void emit_discard_cp() {
        emit_assert_erlang_stack();

        a.add(x86::rsp, imm(CP_SIZE * sizeof(Eterm)));
    }

    void emit_assert_runtime_stack() {
#ifdef JIT_HARD_DEBUG
        Label crash = a.newLabel(), next = a.newLabel();

#    ifdef NATIVE_ERLANG_STACK
        /* Ensure that we are using the runtime stack. */
        int end_offs, start_offs;

        end_offs = offsetof(ErtsSchedulerRegisters, runtime_stack_end);
        start_offs = offsetof(ErtsSchedulerRegisters, runtime_stack_start);

        a.cmp(E, getSchedulerRegRef(end_offs));
        a.short_().jbe(crash);
        a.cmp(E, getSchedulerRegRef(start_offs));
        a.short_().ja(crash);
#    endif

        /* Are we 16-byte aligned? */
        a.test(x86::rsp, (16 - 1));
        a.short_().je(next);

        a.bind(crash);
        a.ud2();

        a.bind(next);
#endif
    }

    void emit_assert_erlang_stack() {
#ifdef JIT_HARD_DEBUG
        Label crash = a.newLabel(), next = a.newLabel();

        /* Are we term-aligned? */
        a.test(E, imm(sizeof(Eterm) - 1));
        a.short_().jne(crash);

        a.cmp(E, x86::qword_ptr(c_p, offsetof(Process, heap)));
        a.short_().jl(crash);
        a.cmp(E, x86::qword_ptr(c_p, offsetof(Process, hend)));
        a.short_().jle(next);

        a.bind(crash);
        a.ud2();
        a.bind(next);
#endif
    }

    enum Update : int {
        eStack = (1 << 0),
        eHeap = (1 << 1),
        eReductions = (1 << 2),
        eCodeIndex = (1 << 3)
    };
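
    /* Illustrative pattern for calling into C code (the helper name is
     * hypothetical; the flags depend on what the callee may touch):
     *
     *     emit_enter_runtime<Update::eReductions | Update::eHeap>();
     *
     *     a.mov(ARG1, c_p);
     *     runtime_call<1>(some_c_helper);
     *
     *     emit_leave_runtime<Update::eReductions | Update::eHeap>();
     *
     * The same Spec should be passed to both halves so that exactly the
     * cached state written out on entry is read back on exit. */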

    template<int Spec = 0>
    void emit_enter_runtime() {
        emit_assert_erlang_stack();

        ERTS_CT_ASSERT((Spec & (Update::eReductions | Update::eStack |
                                Update::eHeap)) == Spec);

#ifdef NATIVE_ERLANG_STACK
        if (!(Spec & Update::eStack)) {
            a.mov(E_saved, E);
        }
#endif

        if ((Spec & (Update::eHeap | Update::eStack)) ==
            (Update::eHeap | Update::eStack)) {
            /* To update both heap and stack we use sse instructions like gcc
               -O3 does. Basically it is this function run through gcc -O3:

               struct a { long a; long b; long c; };

               void test(long a, long b, long c, struct a *s) {
                 s->a = a;
                 s->b = b;
                 s->c = c;
               }
            */
            ERTS_CT_ASSERT(offsetof(Process, stop) - offsetof(Process, htop) ==
                           8);
            a.movq(x86::xmm0, HTOP);
            a.movq(x86::xmm1, E);
            if (Spec & Update::eReductions) {
                a.mov(x86::qword_ptr(c_p, offsetof(Process, fcalls)), FCALLS);
            }
            a.punpcklqdq(x86::xmm0, x86::xmm1);
            a.movups(x86::xmmword_ptr(c_p, offsetof(Process, htop)), x86::xmm0);
        } else {
            if ((Spec & Update::eStack)) {
                a.mov(x86::qword_ptr(c_p, offsetof(Process, stop)), E);
            }

            if (Spec & Update::eHeap) {
                a.mov(x86::qword_ptr(c_p, offsetof(Process, htop)), HTOP);
            }

            if (Spec & Update::eReductions) {
                a.mov(x86::qword_ptr(c_p, offsetof(Process, fcalls)), FCALLS);
            }
        }

#ifdef NATIVE_ERLANG_STACK
        a.lea(E, getRuntimeStackRef());
#else
        /* Keeping track of stack alignment across shared fragments would be
         * too much of a maintenance burden, so we stash and align the stack
         * pointer at runtime instead. */
        a.mov(getRuntimeStackRef(), x86::rsp);

        a.sub(x86::rsp, imm(15));
        a.and_(x86::rsp, imm(-16));
#endif
    }

    template<int Spec = 0>
    void emit_leave_runtime() {
        emit_assert_runtime_stack();

        ERTS_CT_ASSERT((Spec & (Update::eReductions | Update::eStack |
                                Update::eHeap | Update::eCodeIndex)) == Spec);

#ifdef NATIVE_ERLANG_STACK
        if (!(Spec & Update::eStack)) {
            a.mov(E, E_saved);
        }
#endif
        if ((Spec & Update::eStack)) {
            a.mov(E, x86::qword_ptr(c_p, offsetof(Process, stop)));
        }

        if (Spec & Update::eHeap) {
            a.mov(HTOP, x86::qword_ptr(c_p, offsetof(Process, htop)));
        }

        if (Spec & Update::eReductions) {
            a.mov(FCALLS, x86::qword_ptr(c_p, offsetof(Process, fcalls)));
        }

        if (Spec & Update::eCodeIndex) {
            /* Updates the local copy of the active code index, retaining
             * save_calls if active. */
            a.mov(ARG1, imm(&the_active_code_index));
            a.mov(ARG1d, x86::dword_ptr(ARG1));

            a.cmp(active_code_ix, imm(ERTS_SAVE_CALLS_CODE_IX));
            a.cmovne(active_code_ix, ARG1);
        }

#if !defined(NATIVE_ERLANG_STACK)
        /* Restore the unaligned stack pointer we saved on enter. */
        a.mov(x86::rsp, getRuntimeStackRef());
#endif
    }

    void emit_is_boxed(Label Fail, x86::Gp Src, Distance dist = dLong) {
        /* Use the shortest possible instruction depending on the source
         * register. */
        if (Src == x86::rax || Src == x86::rdi || Src == x86::rsi ||
            Src == x86::rcx || Src == x86::rdx) {
            a.test(Src.r8(), imm(_TAG_PRIMARY_MASK - TAG_PRIMARY_BOXED));
        } else {
            a.test(Src.r32(), imm(_TAG_PRIMARY_MASK - TAG_PRIMARY_BOXED));
        }
        if (dist == dShort) {
            a.short_().jne(Fail);
        } else {
            a.jne(Fail);
        }
    }
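
    /* A sketch of why the test above works, assuming the usual 2-bit primary
     * tags (header 00, list 01, boxed 10, immediate 11): with
     * _TAG_PRIMARY_MASK == 3 and TAG_PRIMARY_BOXED == 2 the immediate becomes
     * 1, so the instruction simply tests bit 0 of the term. Boxed pointers
     * have bit 0 clear, while lists and immediates have it set and therefore
     * take the `jne` to the failure label. */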

    x86::Gp emit_ptr_val(x86::Gp Dst, x86::Gp Src) {
#if !defined(TAG_LITERAL_PTR)
        return Src;
#else
        if (Dst != Src) {
            a.mov(Dst, Src);
        }

        /* We intentionally skip TAG_PTR_MASK__ here, as we want to use
         * plain `emit_boxed_val` when we know the argument can't be a literal,
         * such as in bit-syntax matching.
         *
         * This comes at very little cost as `emit_boxed_val` nearly always has
         * a displacement. */
        a.and_(Dst, imm(~TAG_LITERAL_PTR));
        return Dst;
#endif
    }

    constexpr x86::Mem emit_boxed_val(x86::Gp Src,
                                      int32_t bytes = 0,
                                      size_t size = sizeof(UWord)) const {
        ASSERT(bytes % sizeof(Eterm) == 0);
        return x86::Mem(Src, bytes - TAG_PRIMARY_BOXED, size);
    }
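
    /* Typical (illustrative) use of the two helpers above: strip the literal
     * tag from a boxed term and read its header word, e.g.
     *
     *     x86::Gp boxed_ptr = emit_ptr_val(ARG1, ARG1);
     *     a.mov(RET, emit_boxed_val(boxed_ptr));
     *
     * The TAG_PRIMARY_BOXED tag is folded into the displacement, so no
     * explicit untagging of the primary tag is needed. */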

    void emit_test_the_non_value(x86::Gp Reg) {
        if (THE_NON_VALUE == 0) {
            a.test(Reg.r32(), Reg.r32());
        } else {
            a.cmp(Reg, imm(THE_NON_VALUE));
        }
    }

    /*
     * Generate the shortest instruction for setting a register to an immediate
     * value. May clear flags.
     */
    void mov_imm(x86::Gp to, Uint value) {
        if (value == 0) {
            /*
             * Generate the shortest instruction to set the register to zero.
             *
             *   48 c7 c0 00 00 00 00    mov    rax, 0
             *   b8 00 00 00 00          mov    eax, 0
             *   31 c0                   xor    eax, eax
             *
             * Thus, "xor eax, eax" is five bytes shorter than "mov rax, 0".
             *
             * Note: xor modifies the flags (it sets ZF and clears CF/OF);
             * mov does not change any flags.
             */
            a.xor_(to.r32(), to.r32());
        } else {
            a.mov(to, imm(value));
        }
    }

public:
    void embed_rodata(const char *labelName, const char *buff, size_t size);
    void embed_bss(const char *labelName, size_t size);

    void embed_zeros(size_t size);

    void setLogger(std::string log) {
        FILE *f = fopen(log.data(), "w+");

        /* FIXME: Don't crash when loading multiple modules with the same name.
         *
         * setLogger(nullptr) disables logging. */
        if (f) {
            setvbuf(f, NULL, _IONBF, 0);
        }

        setLogger(f);
    }

    void setLogger(FILE *log) {
        logger.setFile(log);
        logger.setIndentation(FormatOptions::kIndentationCode, 4);
        code.setLogger(&logger);
    }

    template<typename... Ts>
    void comment(const char *format, Ts... args) {
        if (logger.file()) {
            char buff[1024];
            erts_snprintf(buff, sizeof(buff), format, args...);
            a.commentf("# %s", buff);
        }
    }

    struct AsmRange {
        ErtsCodePtr start;
        ErtsCodePtr stop;
        std::string name;

        /* Not used yet */
        std::string file;
        unsigned line;
    };

    void update_gdb_jit_info(std::string modulename,
                             std::vector<AsmRange> &functions);

    void embed(void *data, uint32_t size) {
        a.embed((char *)data, size);
    }
};

class BeamGlobalAssembler : public BeamAssembler {
    typedef void (BeamGlobalAssembler::*emitFptr)(void);
    typedef void (*fptr)(void);

    /* Please keep this in alphabetical order. */
#define BEAM_GLOBAL_FUNCS(_)                                                   \
    _(arith_compare_shared)                                                    \
    _(arith_eq_shared)                                                         \
    _(bif_nif_epilogue)                                                        \
    _(bif_element_shared)                                                      \
    _(bif_export_trap)                                                         \
    _(bs_add_shared)                                                           \
    _(bs_size_check_shared)                                                    \
    _(bs_fixed_integer_shared)                                                 \
    _(bs_get_tail_shared)                                                      \
    _(call_bif_shared)                                                         \
    _(call_light_bif_shared)                                                   \
    _(call_nif_early)                                                          \
    _(call_nif_shared)                                                         \
    _(catch_end_shared)                                                        \
    _(dispatch_bif)                                                            \
    _(dispatch_nif)                                                            \
    _(dispatch_return)                                                         \
    _(dispatch_save_calls)                                                     \
    _(error_action_code)                                                       \
    _(export_trampoline)                                                       \
    _(garbage_collect)                                                         \
    _(generic_bp_global)                                                       \
    _(generic_bp_local)                                                        \
    _(debug_bp)                                                                \
    _(handle_error_shared_prologue)                                            \
    _(handle_error_shared)                                                     \
    _(handle_element_error)                                                    \
    _(handle_hd_error)                                                         \
    _(i_band_body_shared)                                                      \
    _(i_band_guard_shared)                                                     \
    _(i_bif_body_shared)                                                       \
    _(i_bif_guard_shared)                                                      \
    _(i_bor_body_shared)                                                       \
    _(i_bor_guard_shared)                                                      \
    _(i_bnot_body_shared)                                                      \
    _(i_bnot_guard_shared)                                                     \
    _(i_bsl_guard_shared)                                                      \
    _(i_bsl_body_shared)                                                       \
    _(i_bsr_guard_shared)                                                      \
    _(i_bsr_body_shared)                                                       \
    _(i_bxor_body_shared)                                                      \
    _(i_bxor_guard_shared)                                                     \
    _(i_func_info_shared)                                                      \
    _(i_load_nif_shared)                                                       \
    _(i_length_guard_shared)                                                   \
    _(i_length_body_shared)                                                    \
    _(i_loop_rec_shared)                                                       \
    _(i_new_small_map_lit_shared)                                              \
    _(i_test_yield_shared)                                                     \
    _(increment_body_shared)                                                   \
    _(int_div_rem_body_shared)                                                 \
    _(int_div_rem_guard_shared)                                                \
    _(minus_body_shared)                                                       \
    _(minus_guard_shared)                                                      \
    _(new_map_shared)                                                          \
    _(plus_body_shared)                                                        \
    _(plus_guard_shared)                                                       \
    _(process_main)                                                            \
    _(times_body_shared)                                                       \
    _(times_guard_shared)                                                      \
    _(unary_minus_body_shared)                                                 \
    _(unary_minus_guard_shared)                                                \
    _(update_map_assoc_shared)                                                 \
    _(update_map_exact_guard_shared)                                           \
    _(update_map_exact_body_shared)

/* Labels exported from within process_main */
#define PROCESS_MAIN_LABELS(_)                                                 \
    _(context_switch)                                                          \
    _(context_switch_simplified)                                               \
    _(do_schedule)

#define DECL_ENUM(NAME) NAME,

    enum GlobalLabels : uint32_t {
        BEAM_GLOBAL_FUNCS(DECL_ENUM) PROCESS_MAIN_LABELS(DECL_ENUM)
    };
#undef DECL_ENUM

    static const std::map<GlobalLabels, emitFptr> emitPtrs;
    static const std::map<GlobalLabels, std::string> labelNames;
    std::unordered_map<GlobalLabels, Label> labels;
    std::unordered_map<GlobalLabels, fptr> ptrs;

#define DECL_FUNC(NAME) void emit_##NAME(void);

    BEAM_GLOBAL_FUNCS(DECL_FUNC);
#undef DECL_FUNC

    template<typename T>
    void emit_bitwise_fallback_body(T(*func_ptr), const ErtsCodeMFA *mfa);

    template<typename T>
    void emit_bitwise_fallback_guard(T(*func_ptr));

    x86::Mem emit_i_length_common(Label fail, int state_size);

    void emit_handle_error();

public:
    BeamGlobalAssembler(JitAllocator *allocator);

    void (*get(GlobalLabels lbl))(void) {
        ASSERT(ptrs[lbl]);
        return ptrs[lbl];
    }

#define GET_CODE(NAME)                                                         \
    void (*get_##NAME(void))() {                                               \
        return get(NAME);                                                      \
    }

    BEAM_GLOBAL_FUNCS(GET_CODE)
    PROCESS_MAIN_LABELS(GET_CODE)
#undef GET_CODE
};

class BeamModuleAssembler : public BeamAssembler {
    typedef unsigned BeamLabel;

    /* Map of label number to asmjit Label */
    typedef std::unordered_map<BeamLabel, Label> LabelMap;
    LabelMap labels;

    struct patch {
        Label where;
        int64_t ptr_offs;
        int64_t val_offs;
    };

    struct patch_catch {
        struct patch patch;
        Label handler;
    };
    std::vector<struct patch_catch> catches;

    /* Map of import entry to patch labels and mfa */
    struct patch_import {
        std::vector<struct patch> patches;
        ErtsCodeMFA mfa;
    };
    typedef std::unordered_map<unsigned, struct patch_import> ImportMap;
    ImportMap imports;

    /* Map of fun entry to patch labels */
    struct patch_lambda {
        std::vector<struct patch> patches;
        ErlFunEntry fe;
    };
    typedef std::unordered_map<unsigned, struct patch_lambda> LambdaMap;
    LambdaMap lambdas;

    /* Map of literals to patch labels */
    struct patch_literal {
        std::vector<struct patch> patches;
    };
    typedef std::unordered_map<unsigned, struct patch_literal> LiteralMap;
    LiteralMap literals;

    /* All string patches */
    std::vector<struct patch> strings;

    /* All functions that have been seen so far */
    std::vector<BeamLabel> functions;

    BeamGlobalAssembler *ga;

    /* Used by emit to populate the labelToMFA map */
    Label currLabel;
    unsigned prev_op = 0;
    Label codeHeader;
    Label funcInfo;
    Label funcYield;
    Label genericBPTramp;
    Label on_load;

    Label floatMax;
    Label floatSignMask;

    Eterm mod;

    /* Save the last PC for an error. */
    size_t last_error_offset = 0;

public:
    BeamModuleAssembler(BeamGlobalAssembler *ga,
                        Eterm mod,
                        unsigned num_labels);
    BeamModuleAssembler(BeamGlobalAssembler *ga,
                        Eterm mod,
                        unsigned num_labels,
                        unsigned num_functions);

    bool emit(unsigned op, const std::vector<ArgVal> &args);

    void codegen(JitAllocator *allocator,
                 const void **executable_ptr,
                 void **writable_ptr,
                 const BeamCodeHeader *in_hdr,
                 const BeamCodeHeader **out_exec_hdr,
                 BeamCodeHeader **out_rw_hdr);

    void codegen(JitAllocator *allocator,
                 const void **executable_ptr,
                 void **writable_ptr);

    void codegen(char *buff, size_t len);

    ErtsCodePtr getCode(unsigned label);
    void *getCode(Label label) {
        return BeamAssembler::getCode(label);
    }
    byte *getCode(char *labelName) {
        return BeamAssembler::getCode(labelName);
    }

    Label embed_vararg_rodata(const std::vector<ArgVal> &args, int y_offset);

    unsigned getCodeSize() {
        ASSERT(code.hasBaseAddress());
        return code.codeSize();
    }

    void copyCodeHeader(BeamCodeHeader *hdr);
    BeamCodeHeader *getCodeHeader(void);
    const ErtsCodeInfo *getOnLoad(void);

    unsigned patchCatches(char *rw_base);
    void patchLambda(char *rw_base, unsigned index, BeamInstr I);
    void patchLiteral(char *rw_base, unsigned index, Eterm lit);
    void patchImport(char *rw_base, unsigned index, BeamInstr I);
    void patchStrings(char *rw_base, const byte *string);

protected:
    /* Helpers */
    void emit_gc_test(const ArgVal &Stack,
                      const ArgVal &Heap,
                      const ArgVal &Live);
    void emit_gc_test_preserve(const ArgVal &Need,
                               const ArgVal &Live,
                               x86::Gp term);

    x86::Mem emit_variable_apply(bool includeI);
    x86::Mem emit_fixed_apply(const ArgVal &arity, bool includeI);

    x86::Gp emit_call_fun(const ArgVal &Fun);
    x86::Gp emit_apply_fun(void);

    void emit_is_binary(Label Fail, x86::Gp Src, Label next, Label subbin);

    void emit_get_list(const x86::Gp boxed_ptr,
                       const ArgVal &Hd,
                       const ArgVal &Tl);

    void emit_div_rem(const ArgVal &Fail,
                      const ArgVal &LHS,
                      const ArgVal &RHS,
                      const ErtsCodeMFA *error_mfa);

    void emit_setup_guard_bif(const std::vector<ArgVal> &args,
                              const ArgVal &bif);

    void emit_bif_arg_error(std::vector<ArgVal> args, const ErtsCodeMFA *mfa);
    void emit_error(int code);

    x86::Mem emit_bs_get_integer_prologue(Label next,
                                          Label fail,
                                          int flags,
                                          int size);

    int emit_bs_get_field_size(const ArgVal &Size,
                               int unit,
                               Label Fail,
                               const x86::Gp &out,
                               unsigned max_size = 0);

    void emit_bs_get_utf8(const ArgVal &Ctx, const ArgVal &Fail);
    void emit_bs_get_utf16(const ArgVal &Ctx,
                           const ArgVal &Fail,
                           const ArgVal &Flags);

    void emit_handle_error();
    void emit_handle_error(const ErtsCodeMFA *exp);
    void emit_handle_error(Label I, const ErtsCodeMFA *exp);
    void emit_validate(const ArgVal &arity);
    void emit_bs_skip_bits(const ArgVal &Fail, const ArgVal &Ctx);

    void emit_linear_search(x86::Gp val,
                            const ArgVal &Fail,
                            const std::vector<ArgVal> &args);

    void emit_check_float(Label next, x86::Xmm value);

    void emit_is_small(Label fail, x86::Gp Reg);
    void emit_is_both_small(Label fail, x86::Gp A, x86::Gp B);

    void emit_validate_unicode(Label next, Label fail, x86::Gp value);

    void emit_bif_is_eq_ne_exact_immed(const ArgVal &Src,
                                       const ArgVal &Immed,
                                       const ArgVal &Dst,
                                       Eterm fail_value,
                                       Eterm succ_value);

    void emit_proc_lc_unrequire(void);
    void emit_proc_lc_require(void);

    void emit_nyi(const char *msg);
    void emit_nyi(void);

    void emit_binsearch_nodes(size_t Left,
                              size_t Right,
                              const ArgVal &Fail,
                              const std::vector<ArgVal> &args);

    bool emit_optimized_three_way_select(const ArgVal &Fail,
                                         const std::vector<ArgVal> &args);

#ifdef DEBUG
    void emit_tuple_assertion(const ArgVal &Src, x86::Gp tuple_reg);
#endif

#include "beamasm_protos.h"

    void make_move_patch(x86::Gp to,
                         std::vector<struct patch> &patches,
                         int64_t offset = 0) {
        const int MOV_IMM64_PAYLOAD_OFFSET = 2;
        Label lbl = a.newLabel();

        a.bind(lbl);
        a.long_().mov(to, imm(LLONG_MAX));

        patches.push_back({lbl, MOV_IMM64_PAYLOAD_OFFSET, offset});
    }
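
    /* A sketch of how the patch recorded above is consumed later:
     * `a.long_().mov(to, LLONG_MAX)` forces the 10-byte `mov reg, imm64`
     * encoding (REX.W prefix + opcode + 8-byte payload), so the payload
     * starts 2 bytes into the instruction, hence MOV_IMM64_PAYLOAD_OFFSET.
     * Once the final value is known, the patch* methods of this class
     * overwrite that payload, at the label's offset plus ptr_offs in the
     * writable mapping, with the real pointer adjusted by val_offs. */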

    void make_word_patch(std::vector<struct patch> &patches) {
        Label lbl = a.newLabel();
        UWord word = LLONG_MAX;

        a.bind(lbl);
        a.embed(reinterpret_cast<char *>(&word), sizeof(word));

        patches.push_back({lbl, 0, 0});
    }

    template<typename A, typename B>
    void mov_arg(A to, B from) {
        /* We can't move to or from Y registers when we're on the runtime
         * stack, so we'll conservatively disallow all mov_args in the hopes of
         * finding such bugs sooner. */
        emit_assert_erlang_stack();

        mov_arg(to, from, ARG1);
    }

    template<typename T>
    void cmp_arg(T oper, const ArgVal &val) {
        cmp_arg(oper, val, ARG1);
    }

    void cmp_arg(x86::Mem mem, const ArgVal &val, const x86::Gp &spill) {
        /* Note that the cast to Sint is necessary to handle negative numbers
         * such as NIL. */
        if (val.isImmed() && Support::isInt32((Sint)val.getValue())) {
            a.cmp(mem, imm(val.getValue()));
        } else {
            mov_arg(spill, val);
            a.cmp(mem, spill);
        }
    }

    void cmp_arg(x86::Gp gp, const ArgVal &val, const x86::Gp &spill) {
        if (val.isImmed() && Support::isInt32((Sint)val.getValue())) {
            a.cmp(gp, imm(val.getValue()));
        } else {
            mov_arg(spill, val);
            a.cmp(gp, spill);
        }
    }

    /* Note: May clear flags. */
    void mov_arg(x86::Gp to, const ArgVal &from, const x86::Gp &spill) {
        if (from.isMem()) {
            a.mov(to, getArgRef(from));
        } else if (from.isLiteral()) {
            make_move_patch(to, literals[from.getValue()].patches);
        } else {
            mov_imm(to, from.getValue());
        }
    }

    void mov_arg(x86::Mem to, const ArgVal &from, const x86::Gp &spill) {
        if (from.isImmed()) {
            if (Support::isInt32((Sint)from.getValue())) {
                a.mov(to, imm(from.getValue()));
            } else {
                a.mov(spill, imm(from.getValue()));
                a.mov(to, spill);
            }
        } else {
            mov_arg(spill, from);
            a.mov(to, spill);
        }
    }

    void mov_arg(const ArgVal &to, x86::Gp from, const x86::Gp &spill) {
        (void)spill;

        a.mov(getArgRef(to), from);
    }

    void mov_arg(const ArgVal &to, BeamInstr from, const x86::Gp &spill) {
        if (Support::isInt32((Sint)from)) {
            a.mov(getArgRef(to), imm(from));
        } else {
            a.mov(spill, imm(from));
            mov_arg(to, spill);
        }
    }

    void mov_arg(const ArgVal &to, const ArgVal &from, const x86::Gp &spill) {
        if (from.isMem()) {
            mov_arg(spill, from);
            mov_arg(to, spill);
        } else {
            mov_arg(getArgRef(to), from);
        }
    }
};

void beamasm_update_perf_info(std::string modulename,
                              std::vector<BeamAssembler::AsmRange> &ranges);