1 //! x86 ABI implementation.
2 
3 use super::super::settings as shared_settings;
4 use super::registers::{FPR, GPR, RU};
5 use super::settings as isa_settings;
6 use crate::abi::{legalize_args, ArgAction, ArgAssigner, ValueConversion};
7 use crate::cursor::{Cursor, CursorPosition, EncCursor};
8 use crate::ir;
9 use crate::ir::immediates::Imm64;
10 use crate::ir::stackslot::{StackOffset, StackSize};
11 use crate::ir::types;
12 use crate::ir::{
13     get_probestack_funcref, AbiParam, ArgumentExtension, ArgumentLoc, ArgumentPurpose, InstBuilder,
14     ValueLoc,
15 };
16 use crate::isa::{CallConv, RegClass, RegUnit, TargetIsa};
17 use crate::regalloc::RegisterSet;
18 use crate::result::CodegenResult;
19 use crate::stack_layout::layout_stack;
20 use alloc::borrow::Cow;
21 use core::i32;
22 use target_lexicon::{PointerWidth, Triple};
23 
24 /// Argument registers for x86-64
25 static ARG_GPRS: [RU; 6] = [RU::rdi, RU::rsi, RU::rdx, RU::rcx, RU::r8, RU::r9];
26 
27 /// Return value registers.
28 static RET_GPRS: [RU; 3] = [RU::rax, RU::rdx, RU::rcx];
29 
30 /// Argument registers for x86-64, when using windows fastcall
31 static ARG_GPRS_WIN_FASTCALL_X64: [RU; 4] = [RU::rcx, RU::rdx, RU::r8, RU::r9];
32 
33 /// Return value registers for x86-64, when using windows fastcall
34 static RET_GPRS_WIN_FASTCALL_X64: [RU; 1] = [RU::rax];
35 
36 /// The win64 fastcall ABI uses some shadow stack space, allocated by the caller, that can be used
37 /// by the callee for temporary values.
38 ///
39 /// [1] "Space is allocated on the call stack as a shadow store for callees to save" This shadow
40 /// store contains the parameters which are passed through registers (ARG_GPRS) and is eventually
41 /// used by the callee to save & restore the values of the arguments.
42 ///
43 /// [2] https://blogs.msdn.microsoft.com/oldnewthing/20110302-00/?p=11333 "Although the x64 calling
44 /// convention reserves spill space for parameters, you don’t have to use them as such"
45 const WIN_SHADOW_STACK_SPACE: StackSize = 32;
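// For illustration: the 32 bytes correspond to one 8-byte home slot for each of the four
// fastcall register parameters (rcx, rdx, r8, r9). `Args::new` below seeds `offset` with this
// value for fastcall, so the first stack-passed argument lands at offset 32 rather than 0.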
46 
47 /// Stack alignment requirement for functions.
48 ///
49 /// 16 bytes is the perfect stack alignment, because:
50 ///
51 /// - On Win64, "The primary exceptions are the stack pointer and malloc or alloca memory, which
52 /// are aligned to 16 bytes in order to aid performance".
53 /// - The original 32-bit x86 ELF ABI had a 4-byte aligned stack pointer, but newer versions use a
54 /// 16-byte aligned stack pointer.
55 /// - This allows using aligned loads and stores on SIMD vectors of 16 bytes that are located
56 /// higher up in the stack.
57 const STACK_ALIGNMENT: u32 = 16;
58 
59 #[derive(Clone)]
60 struct Args {
61     pointer_bytes: u8,
62     pointer_bits: u8,
63     pointer_type: ir::Type,
64     gpr: &'static [RU],
65     gpr_used: usize,
66     fpr_limit: usize,
67     fpr_used: usize,
68     offset: u32,
69     call_conv: CallConv,
70     shared_flags: shared_settings::Flags,
71     #[allow(dead_code)]
72     isa_flags: isa_settings::Flags,
73     assigning_returns: bool,
74 }
75 
76 impl Args {
77     fn new(
78         bits: u8,
79         gpr: &'static [RU],
80         fpr_limit: usize,
81         call_conv: CallConv,
82         shared_flags: &shared_settings::Flags,
83         isa_flags: &isa_settings::Flags,
84         assigning_returns: bool,
85     ) -> Self {
86         let offset = if call_conv.extends_windows_fastcall() {
87             WIN_SHADOW_STACK_SPACE
88         } else {
89             0
90         };
91 
92         Self {
93             pointer_bytes: bits / 8,
94             pointer_bits: bits,
95             pointer_type: ir::Type::int(u16::from(bits)).unwrap(),
96             gpr,
97             gpr_used: 0,
98             fpr_limit,
99             fpr_used: 0,
100             offset,
101             call_conv,
102             shared_flags: shared_flags.clone(),
103             isa_flags: isa_flags.clone(),
104             assigning_returns,
105         }
106     }
107 }
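// A rough sketch of how the assigner is constructed for 64-bit System V (see
// `legalize_signature` below): `Args::new(64, &ARG_GPRS[..], 8, call_conv, shared_flags,
// isa_flags, false)`, i.e. six integer argument registers and up to eight XMM registers for
// float arguments, with `offset` starting at 0.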
108 
109 impl ArgAssigner for Args {
110     fn assign(&mut self, arg: &AbiParam) -> ArgAction {
111         if let ArgumentPurpose::StructArgument(size) = arg.purpose {
112             if self.call_conv != CallConv::SystemV {
113                 panic!(
114                     "The sarg argument purpose is not yet implemented for non-systemv call conv {:?}",
115                     self.call_conv,
116                 );
117             }
118             let loc = ArgumentLoc::Stack(self.offset as i32);
119             self.offset += size;
120             debug_assert!(self.offset <= i32::MAX as u32);
121             return ArgAction::AssignAndChangeType(loc, types::SARG_T);
122         }
123 
124         let ty = arg.value_type;
125 
126         if ty.bits() > u16::from(self.pointer_bits) {
127             if !self.assigning_returns && self.call_conv.extends_windows_fastcall() {
128                 // "Any argument that doesn't fit in 8 bytes, or isn't
129                 // 1, 2, 4, or 8 bytes, must be passed by reference"
130                 return ValueConversion::Pointer(self.pointer_type).into();
131             } else if !ty.is_vector() && !ty.is_float() {
132                 // On SystemV large integers and booleans are broken down to fit in a register.
133                 return ValueConversion::IntSplit.into();
134             }
135         }
136 
137     // Vectors should stay in vector registers unless SIMD is not enabled; in that case they are split.
138         if ty.is_vector() {
139             if self.shared_flags.enable_simd() {
140                 let reg = FPR.unit(self.fpr_used);
141                 self.fpr_used += 1;
142                 return ArgumentLoc::Reg(reg).into();
143             }
144             return ValueConversion::VectorSplit.into();
145         }
146 
147         // Small integers are extended to the size of a pointer register, but
148         // only in ABIs that require this. The Baldrdash (SpiderMonkey) ABI
149         // does, but our other supported ABIs on x86 do not.
150         if ty.is_int()
151             && ty.bits() < u16::from(self.pointer_bits)
152             && self.call_conv.extends_baldrdash()
153         {
154             match arg.extension {
155                 ArgumentExtension::None => {}
156                 ArgumentExtension::Uext => return ValueConversion::Uext(self.pointer_type).into(),
157                 ArgumentExtension::Sext => return ValueConversion::Sext(self.pointer_type).into(),
158             }
159         }
160 
161         // Handle special-purpose arguments.
162         if ty.is_int() && self.call_conv.extends_baldrdash() {
163             match arg.purpose {
164                 // This is SpiderMonkey's `WasmTlsReg`.
165                 ArgumentPurpose::VMContext => {
166                     return ArgumentLoc::Reg(if self.pointer_bits == 64 {
167                         RU::r14
168                     } else {
169                         RU::rsi
170                     } as RegUnit)
171                     .into();
172                 }
173                 // This is SpiderMonkey's `WasmTableCallSigReg`.
174                 ArgumentPurpose::SignatureId => {
175                     return ArgumentLoc::Reg(if self.pointer_bits == 64 {
176                         RU::r10
177                     } else {
178                         RU::rcx
179                     } as RegUnit)
180                     .into()
181                 }
182                 _ => {}
183             }
184         }
185 
186         // Try to use a GPR.
187         if !ty.is_float() && self.gpr_used < self.gpr.len() {
188             let reg = self.gpr[self.gpr_used] as RegUnit;
189             self.gpr_used += 1;
190             return ArgumentLoc::Reg(reg).into();
191         }
192 
193         // Try to use an FPR.
194         let fpr_offset = if self.call_conv.extends_windows_fastcall() {
195             // Float and general registers on windows share the same parameter index.
196             // The used register depends entirely on the parameter index: Even if XMM0
197             // is not used for the first parameter, it cannot be used for the second parameter.
198             debug_assert_eq!(self.fpr_limit, self.gpr.len());
199             &mut self.gpr_used
200         } else {
201             &mut self.fpr_used
202         };
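        // For illustration (assuming Windows fastcall): for a signature like `(i64, f64, i64)`,
        // the shared index assigns rcx to parameter 0 and xmm1 to parameter 1 (xmm0 is skipped
        // because index 0 was consumed by the integer argument), then r8 to parameter 2.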
203 
204         if ty.is_float() && *fpr_offset < self.fpr_limit {
205             let reg = FPR.unit(*fpr_offset);
206             *fpr_offset += 1;
207             return ArgumentLoc::Reg(reg).into();
208         }
209 
210         // Assign a stack location.
211         let loc = ArgumentLoc::Stack(self.offset as i32);
212         self.offset += u32::from(self.pointer_bytes);
213         debug_assert!(self.offset <= i32::MAX as u32);
214         loc.into()
215     }
216 }
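// Worked example (assuming 64-bit System V): a signature `(i64, i64, i64, i64, i64, i64, i64,
// f32)` assigns the first six integer arguments to rdi, rsi, rdx, rcx, r8 and r9, the seventh
// integer argument to the stack at offset 0, and the f32 to xmm0; a further stack argument
// would land at offset 8 (one pointer-sized slot per stack argument).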
217 
218 /// Legalize `sig`.
219 pub fn legalize_signature(
220     sig: &mut Cow<ir::Signature>,
221     triple: &Triple,
222     _current: bool,
223     shared_flags: &shared_settings::Flags,
224     isa_flags: &isa_settings::Flags,
225 ) {
226     let bits;
227     let mut args;
228 
229     match triple.pointer_width().unwrap() {
230         PointerWidth::U16 => panic!(),
231         PointerWidth::U32 => {
232             bits = 32;
233             args = Args::new(bits, &[], 0, sig.call_conv, shared_flags, isa_flags, false);
234         }
235         PointerWidth::U64 => {
236             bits = 64;
237             args = if sig.call_conv.extends_windows_fastcall() {
238                 Args::new(
239                     bits,
240                     &ARG_GPRS_WIN_FASTCALL_X64[..],
241                     4,
242                     sig.call_conv,
243                     shared_flags,
244                     isa_flags,
245                     false,
246                 )
247             } else {
248                 Args::new(
249                     bits,
250                     &ARG_GPRS[..],
251                     8,
252                     sig.call_conv,
253                     shared_flags,
254                     isa_flags,
255                     false,
256                 )
257             };
258         }
259     }
260 
261     let (ret_regs, ret_fpr_limit) = if sig.call_conv.extends_windows_fastcall() {
262         // The Windows x64 calling convention only uses XMM0 or RAX for return values
263         (&RET_GPRS_WIN_FASTCALL_X64[..], 1)
264     } else {
265         (&RET_GPRS[..], 2)
266     };
267 
268     let mut rets = Args::new(
269         bits,
270         ret_regs,
271         ret_fpr_limit,
272         sig.call_conv,
273         shared_flags,
274         isa_flags,
275         true,
276     );
277 
278     // If we don't have enough available return registers
279     // to fit all of the return values, we need to backtrack and start
280     // assigning locations all over again with a different strategy. In order to
281     // do that, we need a copy of the original assigner for the returns.
282     let mut backup_rets = rets.clone();
283 
284     if let Some(new_returns) = legalize_args(&sig.returns, &mut rets) {
285         if new_returns
286             .iter()
287             .filter(|r| r.purpose == ArgumentPurpose::Normal)
288             .any(|r| !r.location.is_reg())
289         {
290             // The return values couldn't all fit into available return
291             // registers. Introduce the use of a struct-return parameter.
292             debug_assert!(!sig.uses_struct_return_param());
293 
294             // We're using the first register for the return pointer parameter.
295             let mut ret_ptr_param = AbiParam {
296                 value_type: args.pointer_type,
297                 purpose: ArgumentPurpose::StructReturn,
298                 extension: ArgumentExtension::None,
299                 location: ArgumentLoc::Unassigned,
300                 legalized_to_pointer: false,
301             };
302             match args.assign(&ret_ptr_param) {
303                 ArgAction::Assign(ArgumentLoc::Reg(reg)) => {
304                     ret_ptr_param.location = ArgumentLoc::Reg(reg);
305                     sig.to_mut().params.push(ret_ptr_param);
306                 }
307                 _ => unreachable!("return pointer should always get a register assignment"),
308             }
309 
310             // We're using the first return register for the return pointer (as
311             // System V does).
312             let mut ret_ptr_return = AbiParam {
313                 value_type: args.pointer_type,
314                 purpose: ArgumentPurpose::StructReturn,
315                 extension: ArgumentExtension::None,
316                 location: ArgumentLoc::Unassigned,
317                 legalized_to_pointer: false,
318             };
319             match backup_rets.assign(&ret_ptr_return) {
320                 ArgAction::Assign(ArgumentLoc::Reg(reg)) => {
321                     ret_ptr_return.location = ArgumentLoc::Reg(reg);
322                     sig.to_mut().returns.push(ret_ptr_return);
323                 }
324                 _ => unreachable!("return pointer should always get a register assignment"),
325             }
326 
327             sig.to_mut().returns.retain(|ret| {
328                 // Either this is the return pointer, in which case we want to keep
329                 // it, or else it is already assigned for a reason and does not
330                 // conflict with our return-pointer legalization.
331                 debug_assert_eq!(
332                     ret.location.is_assigned(),
333                     ret.purpose != ArgumentPurpose::Normal
334                 );
335                 ret.location.is_assigned()
336             });
337 
338             if let Some(new_returns) = legalize_args(&sig.returns, &mut backup_rets) {
339                 sig.to_mut().returns = new_returns;
340             }
341         } else {
342             sig.to_mut().returns = new_returns;
343         }
344     }
345 
346     if let Some(new_params) = legalize_args(&sig.params, &mut args) {
347         sig.to_mut().params = new_params;
348     }
349 }
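// For illustration: with the System V return assigner above (rax, rdx, rcx with an FPR limit of
// two), a signature returning four i64 values cannot place them all in registers, so the
// struct-return path runs: an sret pointer parameter is added (taking the first free argument
// register, rdi on System V), the original normal returns are dropped, and the pointer is also
// returned in the first return register (rax).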
350 
351 /// Get register class for a type appearing in a legalized signature.
352 pub fn regclass_for_abi_type(ty: ir::Type) -> RegClass {
353     if ty.is_int() || ty.is_bool() || ty.is_ref() {
354         GPR
355     } else {
356         FPR
357     }
358 }
359 
360 /// Get the set of allocatable registers for `func`.
361 pub fn allocatable_registers(triple: &Triple, flags: &shared_settings::Flags) -> RegisterSet {
362     let mut regs = RegisterSet::new();
363     regs.take(GPR, RU::rsp as RegUnit);
364     regs.take(GPR, RU::rbp as RegUnit);
365 
366     // The 32-bit architecture only has 8 registers in each register class.
367     if triple.pointer_width().unwrap() != PointerWidth::U64 {
368         for i in 8..16 {
369             regs.take(GPR, GPR.unit(i));
370             regs.take(FPR, FPR.unit(i));
371         }
372         if flags.enable_pinned_reg() {
373             unimplemented!("Pinned register not implemented on x86-32.");
374         }
375     } else {
376         // Choose r15 as the pinned register on 64-bit targets: it is non-volatile on native ABIs and
377         // isn't the fixed output register of any instruction.
378         if flags.enable_pinned_reg() {
379             regs.take(GPR, RU::r15 as RegUnit);
380         }
381     }
382 
383     regs
384 }
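// For illustration: on x86-64 this leaves the GPRs rax, rbx, rcx, rdx, rsi, rdi and r8-r15
// (minus r15 when the pinned register is enabled) plus xmm0-xmm15 allocatable; on x86-32 only
// the low eight units of each bank remain, with rsp and rbp likewise excluded.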
385 
386 /// Get the set of callee-saved general-purpose registers.
387 fn callee_saved_gprs(isa: &dyn TargetIsa, call_conv: CallConv) -> &'static [RU] {
388     match isa.triple().pointer_width().unwrap() {
389         PointerWidth::U16 => panic!(),
390         PointerWidth::U32 => &[RU::rbx, RU::rsi, RU::rdi],
391         PointerWidth::U64 => {
392             if call_conv.extends_windows_fastcall() {
393                 // "registers RBX, RBP, RDI, RSI, RSP, R12, R13, R14, R15, and XMM6-15 are
394                 // considered nonvolatile and must be saved and restored by a function that uses
395                 //  them."
396                 // as per https://docs.microsoft.com/en-us/cpp/build/x64-calling-convention
397                 // RSP & RBP are not listed below, since they are restored automatically during
398                 // a function call. If that wasn't the case, function calls (RET) would not work.
399                 &[
400                     RU::rbx,
401                     RU::rdi,
402                     RU::rsi,
403                     RU::r12,
404                     RU::r13,
405                     RU::r14,
406                     RU::r15,
407                 ]
408             } else {
409                 &[RU::rbx, RU::r12, RU::r13, RU::r14, RU::r15]
410             }
411         }
412     }
413 }
414 
415 /// Get the set of callee-saved floating-point (SIMD) registers.
416 fn callee_saved_fprs(isa: &dyn TargetIsa, call_conv: CallConv) -> &'static [RU] {
417     match isa.triple().pointer_width().unwrap() {
418         PointerWidth::U16 => panic!(),
419         PointerWidth::U32 => &[],
420         PointerWidth::U64 => {
421             if call_conv.extends_windows_fastcall() {
422                 // "registers RBX, ... , and XMM6-15 are considered nonvolatile and must be saved
423                 //  and restored by a function that uses them."
424                 // as per https://docs.microsoft.com/en-us/cpp/build/x64-calling-convention as of
425                 // February 5th, 2020.
426                 &[
427                     RU::xmm6,
428                     RU::xmm7,
429                     RU::xmm8,
430                     RU::xmm9,
431                     RU::xmm10,
432                     RU::xmm11,
433                     RU::xmm12,
434                     RU::xmm13,
435                     RU::xmm14,
436                     RU::xmm15,
437                 ]
438             } else {
439                 &[]
440             }
441         }
442     }
443 }
444 
445 /// Get the set of callee-saved registers that are used.
446 fn callee_saved_regs_used(isa: &dyn TargetIsa, func: &ir::Function) -> RegisterSet {
447     let mut all_callee_saved = RegisterSet::empty();
448     for reg in callee_saved_gprs(isa, func.signature.call_conv) {
449         all_callee_saved.free(GPR, *reg as RegUnit);
450     }
451     for reg in callee_saved_fprs(isa, func.signature.call_conv) {
452         all_callee_saved.free(FPR, *reg as RegUnit);
453     }
454 
455     let mut used = RegisterSet::empty();
456     for value_loc in func.locations.values() {
457         // Note that `value_loc` here contains only a single unit of a potentially multi-unit
458         // register. We don't use registers that overlap each other in the x86 ISA, but in others
459         // we do. So this should not be blindly reused.
460         if let ValueLoc::Reg(ru) = *value_loc {
461             if GPR.contains(ru) {
462                 if !used.is_avail(GPR, ru) {
463                     used.free(GPR, ru);
464                 }
465             } else if FPR.contains(ru) {
466                 if !used.is_avail(FPR, ru) {
467                     used.free(FPR, ru);
468                 }
469             }
470         }
471     }
472 
473     // regmove and regfill instructions may temporarily divert values into other registers,
474     // and these are not reflected in `func.locations`. Scan the function for such instructions
475     // and note which callee-saved registers they use.
476     //
477     // TODO: Consider re-evaluating how regmove/regfill/regspill work and whether it's possible
478     // to avoid this step.
479     for block in &func.layout {
480         for inst in func.layout.block_insts(block) {
481             match func.dfg[inst] {
482                 ir::instructions::InstructionData::RegMove { dst, .. }
483                 | ir::instructions::InstructionData::RegFill { dst, .. } => {
484                     if GPR.contains(dst) {
485                         if !used.is_avail(GPR, dst) {
486                             used.free(GPR, dst);
487                         }
488                     } else if FPR.contains(dst) {
489                         if !used.is_avail(FPR, dst) {
490                             used.free(FPR, dst);
491                         }
492                     }
493                 }
494                 _ => (),
495             }
496         }
497     }
498 
499     used.intersect(&all_callee_saved);
500     used
501 }
502 
503 pub fn prologue_epilogue(func: &mut ir::Function, isa: &dyn TargetIsa) -> CodegenResult<()> {
504     match func.signature.call_conv {
505         // For now, just translate fast and cold as system_v.
506         CallConv::Fast | CallConv::Cold | CallConv::SystemV | CallConv::WasmtimeSystemV => {
507             system_v_prologue_epilogue(func, isa)
508         }
509         CallConv::WindowsFastcall | CallConv::WasmtimeFastcall => {
510             fastcall_prologue_epilogue(func, isa)
511         }
512         CallConv::BaldrdashSystemV | CallConv::BaldrdashWindows => {
513             baldrdash_prologue_epilogue(func, isa)
514         }
515         CallConv::Probestack => unimplemented!("probestack calling convention"),
516         CallConv::Baldrdash2020 => unimplemented!("Baldrdash ABI 2020"),
517         CallConv::AppleAarch64 => unreachable!(),
518     }
519 }
520 
521 fn baldrdash_prologue_epilogue(func: &mut ir::Function, isa: &dyn TargetIsa) -> CodegenResult<()> {
522     debug_assert!(
523         !isa.flags().enable_probestack(),
524         "baldrdash does not expect cranelift to emit stack probes"
525     );
526 
527     let word_size = StackSize::from(isa.pointer_bytes());
528     let shadow_store_size = if func.signature.call_conv.extends_windows_fastcall() {
529         WIN_SHADOW_STACK_SPACE
530     } else {
531         0
532     };
533 
534     let bytes =
535         StackSize::from(isa.flags().baldrdash_prologue_words()) * word_size + shadow_store_size;
536 
537     let mut ss = ir::StackSlotData::new(ir::StackSlotKind::IncomingArg, bytes);
538     ss.offset = Some(-(bytes as StackOffset));
539     func.stack_slots.push(ss);
540 
541     let is_leaf = func.is_leaf();
542     layout_stack(&mut func.stack_slots, is_leaf, STACK_ALIGNMENT)?;
543     Ok(())
544 }
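// A rough example of the arithmetic above, assuming `baldrdash_prologue_words()` returns 3,
// 8-byte words, and a non-fastcall convention: `bytes` = 3 * 8 + 0 = 24, so a 24-byte
// incoming-arg stack slot is created at offset -24 before the normal stack layout runs.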
545 
546 /// Implementation of the fastcall-based Win64 calling convention described at [1]
547 /// [1] https://docs.microsoft.com/en-us/cpp/build/x64-calling-convention
548 fn fastcall_prologue_epilogue(func: &mut ir::Function, isa: &dyn TargetIsa) -> CodegenResult<()> {
549     if isa.triple().pointer_width().unwrap() != PointerWidth::U64 {
550         panic!("TODO: windows-fastcall: x86-32 not implemented yet");
551     }
552 
553     // The reserved stack area is composed of:
554     //   return address + frame pointer + all callee-saved registers
555     //
556     // Pushing the return address is an implicit function of the `call`
557     // instruction. Each of the others we will then push explicitly. Then we
558     // will adjust the stack pointer to make room for the rest of the required
559     // space for this frame.
560     let csrs = callee_saved_regs_used(isa, func);
561     let gpsr_stack_size = ((csrs.iter(GPR).len() + 2) * isa.pointer_bytes() as usize) as u32;
562     let fpsr_stack_size = (csrs.iter(FPR).len() * types::F64X2.bytes() as usize) as u32;
563     let mut csr_stack_size = gpsr_stack_size + fpsr_stack_size;
564 
565     // FPRs must be saved with 16-byte alignment; because they follow the GPRs on the stack, align if needed
566     if fpsr_stack_size > 0 {
567         csr_stack_size = (csr_stack_size + 15) & !15;
568     }
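    // Worked example (assumed register counts): with three saved GPRs and two saved FPRs,
    // gpsr_stack_size = (3 + 2) * 8 = 40, fpsr_stack_size = 2 * 16 = 32, and the 72-byte total
    // is rounded up to 80 so the FPR saves can be stored with 16-byte alignment.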
569 
570     func.create_stack_slot(ir::StackSlotData {
571         kind: ir::StackSlotKind::IncomingArg,
572         size: csr_stack_size,
573         offset: Some(-(csr_stack_size as StackOffset)),
574     });
575 
576     let is_leaf = func.is_leaf();
577 
578     // If this is not a leaf function, allocate an explicit stack slot at the end of the frame for the shadow space of this function's callees
579     if !is_leaf {
580         // TODO: eventually use the caller-provided shadow store as spill slot space when laying out the stack
581         func.create_stack_slot(ir::StackSlotData {
582             kind: ir::StackSlotKind::ExplicitSlot,
583             size: WIN_SHADOW_STACK_SPACE,
584             offset: None,
585         });
586     }
587 
588     let total_stack_size = layout_stack(&mut func.stack_slots, is_leaf, STACK_ALIGNMENT)? as i32;
589 
590     // Subtract the size of the saved GPRs from the local size, because pushes are used for those saves
591     let local_stack_size = i64::from(total_stack_size - gpsr_stack_size as i32);
592 
593     // Add CSRs to function signature
594     let reg_type = isa.pointer_type();
595     let sp_arg_index = if fpsr_stack_size > 0 {
596         let sp_arg = ir::AbiParam::special_reg(
597             reg_type,
598             ir::ArgumentPurpose::CalleeSaved,
599             RU::rsp as RegUnit,
600         );
601         let index = func.signature.params.len();
602         func.signature.params.push(sp_arg);
603         Some(index)
604     } else {
605         None
606     };
607     let fp_arg = ir::AbiParam::special_reg(
608         reg_type,
609         ir::ArgumentPurpose::FramePointer,
610         RU::rbp as RegUnit,
611     );
612     func.signature.params.push(fp_arg);
613     func.signature.returns.push(fp_arg);
614 
615     for gp_csr in csrs.iter(GPR) {
616         let csr_arg = ir::AbiParam::special_reg(reg_type, ir::ArgumentPurpose::CalleeSaved, gp_csr);
617         func.signature.params.push(csr_arg);
618         func.signature.returns.push(csr_arg);
619     }
620 
621     for fp_csr in csrs.iter(FPR) {
622         // The calling convention described in
623         // https://docs.microsoft.com/en-us/cpp/build/x64-calling-convention only requires
624         // preserving the low 128 bits of XMM6-XMM15.
625         let csr_arg =
626             ir::AbiParam::special_reg(types::F64X2, ir::ArgumentPurpose::CalleeSaved, fp_csr);
627         func.signature.params.push(csr_arg);
628         func.signature.returns.push(csr_arg);
629     }
630 
631     // Set up the cursor and insert the prologue
632     let entry_block = func.layout.entry_block().expect("missing entry block");
633     let mut pos = EncCursor::new(func, isa).at_first_insertion_point(entry_block);
634     insert_common_prologue(
635         &mut pos,
636         local_stack_size,
637         reg_type,
638         &csrs,
639         sp_arg_index.is_some(),
640         isa,
641     );
642 
643     // Reset the cursor and insert the epilogue
644     let mut pos = pos.at_position(CursorPosition::Nowhere);
645     insert_common_epilogues(&mut pos, local_stack_size, reg_type, &csrs, sp_arg_index);
646 
647     Ok(())
648 }
649 
650 /// Insert a System V-compatible prologue and epilogue.
651 fn system_v_prologue_epilogue(func: &mut ir::Function, isa: &dyn TargetIsa) -> CodegenResult<()> {
652     let pointer_width = isa.triple().pointer_width().unwrap();
653     let word_size = pointer_width.bytes() as usize;
654 
655     let csrs = callee_saved_regs_used(isa, func);
656     assert!(
657         csrs.iter(FPR).len() == 0,
658         "SysV ABI does not have callee-save SIMD registers"
659     );
660 
661     // The reserved stack area is composed of:
662     //   return address + frame pointer + all callee-saved registers
663     //
664     // Pushing the return address is an implicit function of the `call`
665     // instruction. Each of the others we will then push explicitly. Then we
666     // will adjust the stack pointer to make room for the rest of the required
667     // space for this frame.
668     let csr_stack_size = ((csrs.iter(GPR).len() + 2) * word_size) as i32;
669     func.create_stack_slot(ir::StackSlotData {
670         kind: ir::StackSlotKind::IncomingArg,
671         size: csr_stack_size as u32,
672         offset: Some(-csr_stack_size),
673     });
674 
675     let is_leaf = func.is_leaf();
676     let total_stack_size = layout_stack(&mut func.stack_slots, is_leaf, STACK_ALIGNMENT)? as i32;
677     let local_stack_size = i64::from(total_stack_size - csr_stack_size);
678 
679     // Add CSRs to function signature
680     let reg_type = ir::Type::int(u16::from(pointer_width.bits())).unwrap();
681     // On x86-32 all parameters, including vmctx, are passed on the stack, and we need
682     // to extract vmctx from the stack before we can save the frame pointer.
683     let sp_arg_index = if isa.pointer_bits() == 32 {
684         let sp_arg = ir::AbiParam::special_reg(
685             reg_type,
686             ir::ArgumentPurpose::CalleeSaved,
687             RU::rsp as RegUnit,
688         );
689         let index = func.signature.params.len();
690         func.signature.params.push(sp_arg);
691         Some(index)
692     } else {
693         None
694     };
695     let fp_arg = ir::AbiParam::special_reg(
696         reg_type,
697         ir::ArgumentPurpose::FramePointer,
698         RU::rbp as RegUnit,
699     );
700     func.signature.params.push(fp_arg);
701     func.signature.returns.push(fp_arg);
702 
703     for csr in csrs.iter(GPR) {
704         let csr_arg = ir::AbiParam::special_reg(reg_type, ir::ArgumentPurpose::CalleeSaved, csr);
705         func.signature.params.push(csr_arg);
706         func.signature.returns.push(csr_arg);
707     }
708 
709     // Set up the cursor and insert the prologue
710     let entry_block = func.layout.entry_block().expect("missing entry block");
711     let mut pos = EncCursor::new(func, isa).at_first_insertion_point(entry_block);
712     insert_common_prologue(
713         &mut pos,
714         local_stack_size,
715         reg_type,
716         &csrs,
717         sp_arg_index.is_some(),
718         isa,
719     );
720 
721     // Reset the cursor and insert the epilogue
722     let mut pos = pos.at_position(CursorPosition::Nowhere);
723     insert_common_epilogues(&mut pos, local_stack_size, reg_type, &csrs, sp_arg_index);
724 
725     Ok(())
726 }
727 
728 /// Insert the prologue for a given function.
729 /// This is used by common calling conventions such as System V.
730 fn insert_common_prologue(
731     pos: &mut EncCursor,
732     stack_size: i64,
733     reg_type: ir::types::Type,
734     csrs: &RegisterSet,
735     has_sp_param: bool,
736     isa: &dyn TargetIsa,
737 ) {
738     let sp = if has_sp_param {
739         let block = pos.current_block().expect("missing block under cursor");
740         let sp = pos.func.dfg.append_block_param(block, reg_type);
741         pos.func.locations[sp] = ir::ValueLoc::Reg(RU::rsp as RegUnit);
742         Some(sp)
743     } else {
744         None
745     };
746 
747     // If this is a leaf function with zero stack, then there's no need to
748     // insert a stack check since it can't overflow anything and
749     // forward progress is guaranteed so long as loops are handled anyway.
750     //
751     // If this has a stack size it could stack overflow, or if it isn't a leaf
752     // it could be part of a long call chain which we need to check anyway.
753     //
754     // First we look for the stack limit as a special argument to the function,
755     // and failing that we see if a stack limit global value has been provided,
756     // which is interpreted to calculate the stack limit from the arguments
757     // or perhaps constants.
758     if stack_size > 0 || !pos.func.is_leaf() {
759         let scratch = ir::ValueLoc::Reg(RU::rax as RegUnit);
760         let stack_limit_arg = match pos.func.special_param(ArgumentPurpose::StackLimit) {
761             Some(arg) => {
762                 let copy = pos.ins().copy(arg);
763                 pos.func.locations[copy] = scratch;
764                 Some(copy)
765             }
766             None => pos
767                 .func
768                 .stack_limit
769                 .map(|gv| interpret_gv(pos, gv, sp, scratch)),
770         };
771         if let Some(stack_limit_arg) = stack_limit_arg {
772             insert_stack_check(pos, stack_size, stack_limit_arg);
773         }
774     }
775 
776     // Append param to entry block
777     let block = pos.current_block().expect("missing block under cursor");
778     let fp = pos.func.dfg.append_block_param(block, reg_type);
779     pos.func.locations[fp] = ir::ValueLoc::Reg(RU::rbp as RegUnit);
780 
781     pos.ins().x86_push(fp);
782 
783     let mov_sp_inst = pos
784         .ins()
785         .copy_special(RU::rsp as RegUnit, RU::rbp as RegUnit);
786 
787     let mut last_csr_push = None;
788     for reg in csrs.iter(GPR) {
789         // Append param to entry block
790         let csr_arg = pos.func.dfg.append_block_param(block, reg_type);
791 
792         // Assign it a location
793         pos.func.locations[csr_arg] = ir::ValueLoc::Reg(reg);
794         last_csr_push = Some(pos.ins().x86_push(csr_arg));
795     }
796 
797     // Allocate stack frame storage.
798     let mut adjust_sp_inst = None;
799     if stack_size > 0 {
800         if isa.flags().enable_probestack() && stack_size > (1 << isa.flags().probestack_size_log2())
801         {
802             // Emit a stack probe.
803             let rax = RU::rax as RegUnit;
804             let rax_val = ir::ValueLoc::Reg(rax);
805 
806             // The probestack function expects its input in %rax.
807             let arg = pos.ins().iconst(reg_type, stack_size);
808             pos.func.locations[arg] = rax_val;
809 
810             // Call the probestack function.
811             let callee = get_probestack_funcref(pos.func, reg_type, rax, isa);
812 
813             // Make the call.
814             let call = if !isa.flags().is_pic()
815                 && isa.triple().pointer_width().unwrap() == PointerWidth::U64
816                 && !pos.func.dfg.ext_funcs[callee].colocated
817             {
818                 // 64-bit non-PIC non-colocated calls need to be legalized to call_indirect.
819                 // Use r11 as it may be clobbered under all supported calling conventions.
820                 let r11 = RU::r11 as RegUnit;
821                 let sig = pos.func.dfg.ext_funcs[callee].signature;
822                 let addr = pos.ins().func_addr(reg_type, callee);
823                 pos.func.locations[addr] = ir::ValueLoc::Reg(r11);
824                 pos.ins().call_indirect(sig, addr, &[arg])
825             } else {
826                 // Otherwise just do a normal call.
827                 pos.ins().call(callee, &[arg])
828             };
829 
830             // If the probestack function doesn't adjust sp, do it ourselves.
831             if !isa.flags().probestack_func_adjusts_sp() {
832                 let result = pos.func.dfg.inst_results(call)[0];
833                 pos.func.locations[result] = rax_val;
834                 adjust_sp_inst = Some(pos.ins().adjust_sp_down(result));
835             }
836         } else {
837             // Simply decrement the stack pointer.
838             adjust_sp_inst = Some(pos.ins().adjust_sp_down_imm(Imm64::new(stack_size)));
839         }
840     }
841 
842     // With the stack pointer adjusted, save any callee-saved floating-point registers via offset stores.
843     // FPR saves are at the highest addresses of the local frame allocation, immediately following the GPR pushes.
844     let mut last_fpr_save = None;
845 
846     for (i, reg) in csrs.iter(FPR).enumerate() {
847         // Append param to entry block
848         let csr_arg = pos.func.dfg.append_block_param(block, types::F64X2);
849 
850         // Since regalloc has already run, we must assign a location.
851         pos.func.locations[csr_arg] = ir::ValueLoc::Reg(reg);
852 
853         // Offset to where the register is saved relative to RSP, accounting for FPR save alignment
854         let offset = ((i + 1) * types::F64X2.bytes() as usize) as i64
855             + (stack_size % types::F64X2.bytes() as i64);
856 
857         last_fpr_save = Some(pos.ins().store(
858             ir::MemFlags::trusted(),
859             csr_arg,
860             sp.expect("FPR save requires SP param"),
861             (stack_size - offset) as i32,
862         ));
863     }
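    // For illustration of the offset math above (assuming a 64-byte local frame and two saved
    // FPRs): stack_size % 16 == 0, so the saves go at rsp+48 (i == 0, offset 16) and rsp+32
    // (i == 1, offset 32), i.e. at the top of the local allocation just below the GPR pushes.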
864 
865     pos.func.prologue_end = Some(
866         last_fpr_save
867             .or(adjust_sp_inst)
868             .or(last_csr_push)
869             .unwrap_or(mov_sp_inst),
870     );
871 }
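// Roughly, the prologue built above corresponds to the machine sequence:
//
//     push rbp
//     mov  rbp, rsp
//     push <callee-saved GPRs>...
//     sub  rsp, local_stack_size                  ; or a call to the stack probe for large frames
//     movups [rsp + off], <callee-saved FPRs>...  ; fastcall only
//
// with the optional stack-limit check emitted before `push rbp`.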
872 
873 /// Inserts code necessary to calculate `gv`.
874 ///
875 /// Note that this is typically done with `ins().global_value(...)` but that
876 /// requires legalization to run to encode it, and we're running super late
877 /// here in the backend where legalization isn't possible. To get around this
878 /// we manually interpret the `gv` specified and do register allocation for
879 /// intermediate values.
880 ///
881 /// This is an incomplete implementation of loading `GlobalValue` values to get
882 /// compared to the stack pointer, but currently it serves enough functionality
883 /// to get this implemented in `wasmtime` itself. This'll likely get expanded a
884 /// bit over time!
885 fn interpret_gv(
886     pos: &mut EncCursor,
887     gv: ir::GlobalValue,
888     sp: Option<ir::Value>,
889     scratch: ir::ValueLoc,
890 ) -> ir::Value {
891     match pos.func.global_values[gv] {
892         ir::GlobalValueData::VMContext => {
893             let vmctx_index = pos
894                 .func
895                 .signature
896                 .special_param_index(ir::ArgumentPurpose::VMContext)
897                 .expect("no vmcontext parameter found");
898             match pos.func.signature.params[vmctx_index] {
899                 AbiParam {
900                     location: ArgumentLoc::Reg(_),
901                     ..
902                 } => {
903                     let entry = pos.func.layout.entry_block().unwrap();
904                     pos.func.dfg.block_params(entry)[vmctx_index]
905                 }
906                 AbiParam {
907                     location: ArgumentLoc::Stack(offset),
908                     value_type,
909                     ..
910                 } => {
911                     let offset =
912                         offset + i32::from(pos.isa.pointer_bytes() * (1 + vmctx_index as u8));
913                     // The following access can be marked `trusted` because it is a load of an argument. We
914                     // know it is safe because it was safe to write it in preparing this function call.
915                     let ret =
916                         pos.ins()
917                             .load(value_type, ir::MemFlags::trusted(), sp.unwrap(), offset);
918                     pos.func.locations[ret] = scratch;
919                     return ret;
920                 }
921                 AbiParam {
922                     location: ArgumentLoc::Unassigned,
923                     ..
924                 } => unreachable!(),
925             }
926         }
927         ir::GlobalValueData::Load {
928             base,
929             offset,
930             global_type,
931             readonly: _,
932         } => {
933             let base = interpret_gv(pos, base, sp, scratch);
934             let ret = pos
935                 .ins()
936                 .load(global_type, ir::MemFlags::trusted(), base, offset);
937             pos.func.locations[ret] = scratch;
938             return ret;
939         }
940         ref other => panic!("global value for stack limit not supported: {}", other),
941     }
942 }
943 
944 /// Insert a check that generates a trap if the stack pointer goes
945 /// below a value in `stack_limit_arg`.
946 fn insert_stack_check(pos: &mut EncCursor, stack_size: i64, stack_limit_arg: ir::Value) {
947     use crate::ir::condcodes::IntCC;
948 
949     // Our stack pointer, after subtracting `stack_size`, must not be below
950     // `stack_limit_arg`. To do this we're going to add `stack_size` to
951     // `stack_limit_arg` and see if the stack pointer is below that. The
952     // `stack_size + stack_limit_arg` computation might overflow, however, due
953     // to how stack limits may be loaded and set externally to trigger a trap.
954     //
955     // To handle this we'll need an extra comparison to see if the stack
956     // pointer is already below `stack_limit_arg`. Most of the time this
957     // isn't necessary though since the stack limit which triggers a trap is
958     // likely a sentinel somewhere around `usize::max_value()`. In that case
959     // only conditionally emit this pre-flight check. That way most functions
960     // only have the one comparison, but are also guaranteed that adding
961     // `stack_size` to `stack_limit_arg` won't overflow.
962     //
963     // This does mean that code generators which use this stack check
964     // functionality need to ensure that values stored into the stack limit
965     // will never overflow if this threshold is added.
966     if stack_size >= 32 * 1024 {
967         let cflags = pos.ins().ifcmp_sp(stack_limit_arg);
968         pos.func.locations[cflags] = ir::ValueLoc::Reg(RU::rflags as RegUnit);
969         pos.ins().trapif(
970             IntCC::UnsignedGreaterThanOrEqual,
971             cflags,
972             ir::TrapCode::StackOverflow,
973         );
974     }
975 
976     // Add `stack_size` to `stack_limit_arg`, placing the result in %rax, and use it
977     // as the SP threshold.
978     let sp_threshold = pos.ins().iadd_imm(stack_limit_arg, stack_size);
979     pos.func.locations[sp_threshold] = ir::ValueLoc::Reg(RU::rax as RegUnit);
980 
981     // If the stack pointer is currently at or below the SP threshold, then once the
982     // current stack frame is opened the stack pointer will go past the limit.
983     let cflags = pos.ins().ifcmp_sp(sp_threshold);
984     pos.func.locations[cflags] = ir::ValueLoc::Reg(RU::rflags as RegUnit);
985     pos.ins().trapif(
986         IntCC::UnsignedGreaterThanOrEqual,
987         cflags,
988         ir::TrapCode::StackOverflow,
989     );
990 }
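// For illustration: with a 40 KiB frame the pre-flight comparison is emitted (40960 >= 32768),
// so the generated check is roughly "trap if limit >= rsp", followed by "rax = limit +
// stack_size" and "trap if rax >= rsp"; for smaller frames only the second comparison is
// emitted.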
991 
992 /// Find all `return` instructions and insert epilogues before them.
993 fn insert_common_epilogues(
994     pos: &mut EncCursor,
995     stack_size: i64,
996     reg_type: ir::types::Type,
997     csrs: &RegisterSet,
998     sp_arg_index: Option<usize>,
999 ) {
1000     while let Some(block) = pos.next_block() {
1001         pos.goto_last_inst(block);
1002         if let Some(inst) = pos.current_inst() {
1003             if pos.func.dfg[inst].opcode().is_return() {
1004                 insert_common_epilogue(inst, block, stack_size, pos, reg_type, csrs, sp_arg_index);
1005             }
1006         }
1007     }
1008 }
1009 
1010 /// Insert an epilogue given a specific `return` instruction.
1011 /// This is used by common calling conventions such as System V.
1012 fn insert_common_epilogue(
1013     inst: ir::Inst,
1014     block: ir::Block,
1015     stack_size: i64,
1016     pos: &mut EncCursor,
1017     reg_type: ir::types::Type,
1018     csrs: &RegisterSet,
1019     sp_arg_index: Option<usize>,
1020 ) {
1021     // Insert the pop of the frame pointer
1022     let fp_pop = pos.ins().x86_pop(reg_type);
1023     let fp_pop_inst = pos.prev_inst().unwrap();
1024     pos.func.locations[fp_pop] = ir::ValueLoc::Reg(RU::rbp as RegUnit);
1025     pos.func.dfg.append_inst_arg(inst, fp_pop);
1026 
1027     // Insert the CSR pops
1028     let mut first_csr_pop_inst = None;
1029     for reg in csrs.iter(GPR) {
1030         let csr_pop = pos.ins().x86_pop(reg_type);
1031         first_csr_pop_inst = pos.prev_inst();
1032         assert!(first_csr_pop_inst.is_some());
1033         pos.func.locations[csr_pop] = ir::ValueLoc::Reg(reg);
1034         pos.func.dfg.append_inst_arg(inst, csr_pop);
1035     }
1036 
1037     // Insert the adjustment of SP
1038     let mut sp_adjust_inst = None;
1039     if stack_size > 0 {
1040         pos.ins().adjust_sp_up_imm(Imm64::new(stack_size));
1041         sp_adjust_inst = pos.prev_inst();
1042         assert!(sp_adjust_inst.is_some());
1043     }
1044 
1045     let mut first_fpr_load = None;
1046     if let Some(index) = sp_arg_index {
1047         let sp = pos
1048             .func
1049             .dfg
1050             .block_params(pos.func.layout.entry_block().unwrap())[index];
1051 
1052         // Insert the FPR loads (unlike the GPRs, which are stack pops, these are in-order loads)
1053         for (i, reg) in csrs.iter(FPR).enumerate() {
1054             // Offset to where the register is saved relative to RSP, accounting for FPR save alignment
1055             let offset = ((i + 1) * types::F64X2.bytes() as usize) as i64
1056                 + (stack_size % types::F64X2.bytes() as i64);
1057 
1058             let value = pos.ins().load(
1059                 types::F64X2,
1060                 ir::MemFlags::trusted(),
1061                 sp,
1062                 (stack_size - offset) as i32,
1063             );
1064 
1065             first_fpr_load.get_or_insert(pos.current_inst().expect("current inst"));
1066 
1067             pos.func.locations[value] = ir::ValueLoc::Reg(reg);
1068             pos.func.dfg.append_inst_arg(inst, value);
1069         }
1070     } else {
1071         assert!(csrs.iter(FPR).len() == 0);
1072     }
1073 
1074     pos.func.epilogues_start.push((
1075         first_fpr_load
1076             .or(sp_adjust_inst)
1077             .or(first_csr_pop_inst)
1078             .unwrap_or(fp_pop_inst),
1079         block,
1080     ));
1081 }
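// Roughly, each epilogue inserted above corresponds to the machine sequence (in layout order):
//
//     movups <callee-saved FPRs>, [rsp + off]...  ; fastcall only
//     add  rsp, local_stack_size
//     pop  <callee-saved GPRs>...                 ; in reverse push order
//     pop  rbp
//     ret
//
// The restored values are also appended as arguments to the `return` instruction so the
// register allocator treats them as live out of the function.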
1082 
1083 #[cfg(feature = "unwind")]
1084 pub fn create_unwind_info(
1085     func: &ir::Function,
1086     isa: &dyn TargetIsa,
1087 ) -> CodegenResult<Option<crate::isa::unwind::UnwindInfo>> {
1088     use crate::isa::unwind::UnwindInfo;
1089     use crate::machinst::UnwindInfoKind;
1090 
1091     // Assumption: RBP is being used as the frame pointer for both calling conventions.
1092     // In the future, we should omit the frame pointer as an optimization, so this will change.
1093     Ok(match isa.unwind_info_kind() {
1094         UnwindInfoKind::SystemV => {
1095             super::unwind::systemv::create_unwind_info(func, isa)?.map(|u| UnwindInfo::SystemV(u))
1096         }
1097         UnwindInfoKind::Windows => {
1098             super::unwind::winx64::create_unwind_info(func, isa)?.map(|u| UnwindInfo::WindowsX64(u))
1099         }
1100         UnwindInfoKind::None => None,
1101     })
1102 }
1103