//! Lowering rules for AArch64.
//!
//! TODO: opportunities for better code generation:
//!
//! - Smarter use of addressing modes. Recognize a+SCALE*b patterns. Recognize
//!   pre/post-index opportunities.
//!
//! - Floating-point immediates (FIMM instruction).

use crate::ir::condcodes::{FloatCC, IntCC};
use crate::ir::types::*;
use crate::ir::Inst as IRInst;
use crate::ir::{Opcode, Type};
use crate::machinst::lower::*;
use crate::machinst::*;
use crate::CodegenResult;

use crate::isa::aarch64::inst::*;
use crate::isa::aarch64::AArch64Backend;

use super::lower_inst;

use crate::data_value::DataValue;
use regalloc::{Reg, Writable};
use smallvec::SmallVec;
use std::cmp;

//============================================================================
// Result enum types.
//
// Lowering of a given value results in one of these enums, depending on the
// modes in which we can accept the value.

/// A lowering result: register, register-shift.  An SSA value can always be
/// lowered into one of these options; the register form is the fallback.
#[derive(Clone, Debug)]
enum ResultRS {
    Reg(Reg),
    RegShift(Reg, ShiftOpAndAmt),
}

/// A lowering result: register, register-shift, register-extend.  An SSA value can always be
/// lowered into one of these options; the register form is the fallback.
#[derive(Clone, Debug)]
enum ResultRSE {
    Reg(Reg),
    RegShift(Reg, ShiftOpAndAmt),
    RegExtend(Reg, ExtendOp),
}

impl ResultRSE {
    fn from_rs(rs: ResultRS) -> ResultRSE {
        match rs {
            ResultRS::Reg(r) => ResultRSE::Reg(r),
            ResultRS::RegShift(r, s) => ResultRSE::RegShift(r, s),
        }
    }
}

/// A lowering result: register, register-shift, register-extend, or 12-bit immediate form.
/// An SSA value can always be lowered into one of these options; the register form is the
/// fallback.
#[derive(Clone, Debug)]
pub(crate) enum ResultRSEImm12 {
    Reg(Reg),
    RegShift(Reg, ShiftOpAndAmt),
    RegExtend(Reg, ExtendOp),
    Imm12(Imm12),
}

impl ResultRSEImm12 {
    fn from_rse(rse: ResultRSE) -> ResultRSEImm12 {
        match rse {
            ResultRSE::Reg(r) => ResultRSEImm12::Reg(r),
            ResultRSE::RegShift(r, s) => ResultRSEImm12::RegShift(r, s),
            ResultRSE::RegExtend(r, e) => ResultRSEImm12::RegExtend(r, e),
        }
    }
}

/// A lowering result: register, register-shift, or logical immediate form.
/// An SSA value can always be lowered into one of these options; the register form is the
/// fallback.
#[derive(Clone, Debug)]
pub(crate) enum ResultRSImmLogic {
    Reg(Reg),
    RegShift(Reg, ShiftOpAndAmt),
    ImmLogic(ImmLogic),
}

impl ResultRSImmLogic {
    fn from_rs(rse: ResultRS) -> ResultRSImmLogic {
        match rse {
            ResultRS::Reg(r) => ResultRSImmLogic::Reg(r),
            ResultRS::RegShift(r, s) => ResultRSImmLogic::RegShift(r, s),
        }
    }
}

/// A lowering result: register or immediate shift amount (arg to a shift op).
/// An SSA value can always be lowered into one of these options; the register form is the
/// fallback.
#[derive(Clone, Debug)]
pub(crate) enum ResultRegImmShift {
    Reg(Reg),
    ImmShift(ImmShift),
}

impl ResultRegImmShift {
    pub fn unwrap_reg(self) -> Reg {
        match self {
            ResultRegImmShift::Reg(r) => r,
            _ => panic!("Unwrapped ResultRegImmShift, expected reg, got: {:?}", self),
        }
    }
}

//============================================================================
// Lowering: convert instruction inputs to forms that we can use.

/// Lower an instruction input to a 64-bit constant, if possible.
pub(crate) fn input_to_const<C: LowerCtx<I = Inst>>(ctx: &mut C, input: InsnInput) -> Option<u64> {
    let input = ctx.get_input_as_source_or_const(input.insn, input.input);
    input.constant
}

/// Lower an instruction input to a constant register-shift amount, if possible.
pub(crate) fn input_to_shiftimm<C: LowerCtx<I = Inst>>(
    ctx: &mut C,
    input: InsnInput,
) -> Option<ShiftOpShiftImm> {
    input_to_const(ctx, input).and_then(ShiftOpShiftImm::maybe_from_shift)
}

pub(crate) fn const_param_to_u128<C: LowerCtx<I = Inst>>(
    ctx: &mut C,
    inst: IRInst,
) -> Option<u128> {
    match ctx.get_immediate(inst) {
        Some(DataValue::V128(bytes)) => Some(u128::from_le_bytes(bytes)),
        _ => None,
    }
}

/// How to handle narrow values loaded into registers; see note on `narrow_mode`
/// parameter to `put_input_in_*` below.
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
pub(crate) enum NarrowValueMode {
    None,
    /// Zero-extend to 32 bits if original is < 32 bits.
    ZeroExtend32,
    /// Sign-extend to 32 bits if original is < 32 bits.
    SignExtend32,
    /// Zero-extend to 64 bits if original is < 64 bits.
    ZeroExtend64,
    /// Sign-extend to 64 bits if original is < 64 bits.
    SignExtend64,
}

impl NarrowValueMode {
    fn is_32bit(&self) -> bool {
        match self {
            NarrowValueMode::None => false,
            NarrowValueMode::ZeroExtend32 | NarrowValueMode::SignExtend32 => true,
            NarrowValueMode::ZeroExtend64 | NarrowValueMode::SignExtend64 => false,
        }
    }

    fn is_signed(&self) -> bool {
        match self {
            NarrowValueMode::SignExtend32 | NarrowValueMode::SignExtend64 => true,
            NarrowValueMode::ZeroExtend32 | NarrowValueMode::ZeroExtend64 => false,
            NarrowValueMode::None => false,
        }
    }
}
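
// As an illustrative sketch (not a verbatim caller from this file): a lowering
// for a narrow right-shift needs the high bits of its operand defined before
// shifting, so it would request an extension, e.g.
//
//     let rn = put_input_in_reg(ctx, inputs[0], NarrowValueMode::ZeroExtend32);
//
// whereas an 8-bit bitwise AND can pass `NarrowValueMode::None`, because its
// low 8 result bits depend only on the low 8 bits of its inputs.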

/// Emits instruction(s) to generate the given constant value into newly-allocated
/// temporary registers, returning these registers.
fn generate_constant<C: LowerCtx<I = Inst>>(ctx: &mut C, ty: Type, c: u128) -> ValueRegs<Reg> {
    let from_bits = ty_bits(ty);
    let masked = if from_bits < 128 {
        c & ((1u128 << from_bits) - 1)
    } else {
        c
    };

    let cst_copy = ctx.alloc_tmp(ty);
    for inst in Inst::gen_constant(cst_copy, masked, ty, |ty| {
        ctx.alloc_tmp(ty).only_reg().unwrap()
    })
    .into_iter()
    {
        ctx.emit(inst);
    }
    non_writable_value_regs(cst_copy)
}

/// Extends a register according to `narrow_mode`.
/// If extended, the value is always extended to 64 bits, for simplicity.
fn extend_reg<C: LowerCtx<I = Inst>>(
    ctx: &mut C,
    ty: Type,
    in_reg: Reg,
    is_const: bool,
    narrow_mode: NarrowValueMode,
) -> Reg {
    let from_bits = ty_bits(ty) as u8;
    match (narrow_mode, from_bits) {
        (NarrowValueMode::None, _) => in_reg,
        (NarrowValueMode::ZeroExtend32, n) if n < 32 => {
            let tmp = ctx.alloc_tmp(I32).only_reg().unwrap();
            ctx.emit(Inst::Extend {
                rd: tmp,
                rn: in_reg,
                signed: false,
                from_bits,
                to_bits: 32,
            });
            tmp.to_reg()
        }
        (NarrowValueMode::SignExtend32, n) if n < 32 => {
            let tmp = ctx.alloc_tmp(I32).only_reg().unwrap();
            ctx.emit(Inst::Extend {
                rd: tmp,
                rn: in_reg,
                signed: true,
                from_bits,
                to_bits: 32,
            });
            tmp.to_reg()
        }
        (NarrowValueMode::ZeroExtend32, 32) | (NarrowValueMode::SignExtend32, 32) => in_reg,

        (NarrowValueMode::ZeroExtend64, n) if n < 64 => {
            if is_const {
                // Constants are zero-extended to full 64-bit width on load already.
                in_reg
            } else {
                let tmp = ctx.alloc_tmp(I32).only_reg().unwrap();
                ctx.emit(Inst::Extend {
                    rd: tmp,
                    rn: in_reg,
                    signed: false,
                    from_bits,
                    to_bits: 64,
                });
                tmp.to_reg()
            }
        }
        (NarrowValueMode::SignExtend64, n) if n < 64 => {
            let tmp = ctx.alloc_tmp(I32).only_reg().unwrap();
            ctx.emit(Inst::Extend {
                rd: tmp,
                rn: in_reg,
                signed: true,
                from_bits,
                to_bits: 64,
            });
            tmp.to_reg()
        }
        (_, 64) => in_reg,
        (_, 128) => in_reg,

        _ => panic!(
            "Unsupported input width: input ty {} bits {} mode {:?}",
            ty, from_bits, narrow_mode
        ),
    }
}

/// Lowers an instruction input to multiple registers.
fn lower_input_to_regs<C: LowerCtx<I = Inst>>(
    ctx: &mut C,
    input: InsnInput,
) -> (ValueRegs<Reg>, Type, bool) {
    log::trace!("lower_input_to_regs: input {:?}", input);
    let ty = ctx.input_ty(input.insn, input.input);
    let inputs = ctx.get_input_as_source_or_const(input.insn, input.input);
    let is_const = inputs.constant.is_some();

    let in_regs = if let Some(c) = inputs.constant {
        // Generate constants fresh at each use to minimize long-range register pressure.
        generate_constant(ctx, ty, c as u128)
    } else {
        ctx.put_input_in_regs(input.insn, input.input)
    };

    (in_regs, ty, is_const)
}

/// Lower an instruction input to a register.
///
/// The given register will be extended appropriately, according to
/// `narrow_mode` and the input's type. If extended, the value is
/// always extended to 64 bits, for simplicity.
pub(crate) fn put_input_in_reg<C: LowerCtx<I = Inst>>(
    ctx: &mut C,
    input: InsnInput,
    narrow_mode: NarrowValueMode,
) -> Reg {
    let (in_regs, ty, is_const) = lower_input_to_regs(ctx, input);
    let reg = in_regs
        .only_reg()
        .expect("Multi-register value not expected");

    extend_reg(ctx, ty, reg, is_const, narrow_mode)
}

/// Lower an instruction input to multiple registers.
pub(crate) fn put_input_in_regs<C: LowerCtx<I = Inst>>(
    ctx: &mut C,
    input: InsnInput,
) -> ValueRegs<Reg> {
    let (in_regs, _, _) = lower_input_to_regs(ctx, input);
    in_regs
}

/// Lower an instruction input to a reg or reg/shift, or reg/extend operand.
///
/// The `narrow_mode` flag indicates whether the consumer of this value needs
/// the high bits clear. For many operations, such as an add/sub/mul or any
/// bitwise logical operation, the low-bit results depend only on the low-bit
/// inputs, so e.g. we can do an 8-bit add on 32-bit registers where the 8-bit
/// value is stored in the low 8 bits of the register and the high 24 bits are
/// undefined. If the op truly needs the high N bits clear (such as for a
/// divide or a right-shift or a compare-to-zero), `narrow_mode` should be
/// set to `ZeroExtend` or `SignExtend` as appropriate, and the resulting
/// register will be provided the extended value.
fn put_input_in_rs<C: LowerCtx<I = Inst>>(
    ctx: &mut C,
    input: InsnInput,
    narrow_mode: NarrowValueMode,
) -> ResultRS {
    let inputs = ctx.get_input_as_source_or_const(input.insn, input.input);
    if let Some((insn, 0)) = inputs.inst {
        let op = ctx.data(insn).opcode();

        if op == Opcode::Ishl {
            let shiftee = InsnInput { insn, input: 0 };
            let shift_amt = InsnInput { insn, input: 1 };

            // Can we get the shift amount as an immediate?
            if let Some(shiftimm) = input_to_shiftimm(ctx, shift_amt) {
                let shiftee_bits = ty_bits(ctx.input_ty(insn, 0));
                if shiftee_bits <= std::u8::MAX as usize {
                    let shiftimm = shiftimm.mask(shiftee_bits as u8);
                    let reg = put_input_in_reg(ctx, shiftee, narrow_mode);
                    return ResultRS::RegShift(reg, ShiftOpAndAmt::new(ShiftOp::LSL, shiftimm));
                }
            }
        }
    }

    ResultRS::Reg(put_input_in_reg(ctx, input, narrow_mode))
}
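
// For example (a schematic sketch of the pattern this enables, not verbatim
// CLIF from a test): when an `iadd` consumes an `ishl` whose shift amount is
// a constant, the shift folds into the ALU operand, so something like
//
//     v2 = ishl v1, v_three    ; v_three = iconst 3
//     v3 = iadd v0, v2
//
// can lower to a single `add xd, x0, x1, LSL #3`, via
// `ResultRS::RegShift(reg, ShiftOpAndAmt::new(ShiftOp::LSL, shiftimm))`.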

/// Lower an instruction input to a reg or reg/shift, or reg/extend operand.
/// This does not actually codegen the source instruction; it just uses the
/// vreg into which the source instruction will generate its value.
///
/// See note on `put_input_in_rs` for a description of `narrow_mode`.
fn put_input_in_rse<C: LowerCtx<I = Inst>>(
    ctx: &mut C,
    input: InsnInput,
    narrow_mode: NarrowValueMode,
) -> ResultRSE {
    let inputs = ctx.get_input_as_source_or_const(input.insn, input.input);
    if let Some((insn, 0)) = inputs.inst {
        let op = ctx.data(insn).opcode();
        let out_ty = ctx.output_ty(insn, 0);
        let out_bits = ty_bits(out_ty);

        // Is this a zero-extend or sign-extend and can we handle that with a register-mode operator?
        if op == Opcode::Uextend || op == Opcode::Sextend {
            let sign_extend = op == Opcode::Sextend;
            let inner_ty = ctx.input_ty(insn, 0);
            let inner_bits = ty_bits(inner_ty);
            assert!(inner_bits < out_bits);
            if match (sign_extend, narrow_mode) {
                // A single zero-extend or sign-extend is equal to itself.
                (_, NarrowValueMode::None) => true,
                // Two zero-extends or two sign-extends in a row are equivalent to a single
                // zero-extend or sign-extend.
                (false, NarrowValueMode::ZeroExtend32) | (false, NarrowValueMode::ZeroExtend64) => {
                    true
                }
                (true, NarrowValueMode::SignExtend32) | (true, NarrowValueMode::SignExtend64) => {
                    true
                }
                // A zero-extend followed by a sign-extend (or vice versa) is not equivalent to
                // a single zero-extend or sign-extend.
                (false, NarrowValueMode::SignExtend32) | (false, NarrowValueMode::SignExtend64) => {
                    false
                }
                (true, NarrowValueMode::ZeroExtend32) | (true, NarrowValueMode::ZeroExtend64) => {
                    false
                }
            } {
                let extendop = match (sign_extend, inner_bits) {
                    (true, 8) => ExtendOp::SXTB,
                    (false, 8) => ExtendOp::UXTB,
                    (true, 16) => ExtendOp::SXTH,
                    (false, 16) => ExtendOp::UXTH,
                    (true, 32) => ExtendOp::SXTW,
                    (false, 32) => ExtendOp::UXTW,
                    _ => unreachable!(),
                };
                let reg =
                    put_input_in_reg(ctx, InsnInput { insn, input: 0 }, NarrowValueMode::None);
                return ResultRSE::RegExtend(reg, extendop);
            }
        }

        // If `out_ty` is smaller than the extension's target width and we need to zero- or
        // sign-extend, then get the result into a register and return an Extend-mode operand
        // on that register.
        if narrow_mode != NarrowValueMode::None
            && ((narrow_mode.is_32bit() && out_bits < 32)
                || (!narrow_mode.is_32bit() && out_bits < 64))
        {
            let reg = put_input_in_reg(ctx, input, NarrowValueMode::None);
            let extendop = match (narrow_mode, out_bits) {
                (NarrowValueMode::SignExtend32, 1) | (NarrowValueMode::SignExtend64, 1) => {
                    ExtendOp::SXTB
                }
                (NarrowValueMode::ZeroExtend32, 1) | (NarrowValueMode::ZeroExtend64, 1) => {
                    ExtendOp::UXTB
                }
                (NarrowValueMode::SignExtend32, 8) | (NarrowValueMode::SignExtend64, 8) => {
                    ExtendOp::SXTB
                }
                (NarrowValueMode::ZeroExtend32, 8) | (NarrowValueMode::ZeroExtend64, 8) => {
                    ExtendOp::UXTB
                }
                (NarrowValueMode::SignExtend32, 16) | (NarrowValueMode::SignExtend64, 16) => {
                    ExtendOp::SXTH
                }
                (NarrowValueMode::ZeroExtend32, 16) | (NarrowValueMode::ZeroExtend64, 16) => {
                    ExtendOp::UXTH
                }
                (NarrowValueMode::SignExtend64, 32) => ExtendOp::SXTW,
                (NarrowValueMode::ZeroExtend64, 32) => ExtendOp::UXTW,
                _ => unreachable!(),
            };
            return ResultRSE::RegExtend(reg, extendop);
        }
    }

    ResultRSE::from_rs(put_input_in_rs(ctx, input, narrow_mode))
}

pub(crate) fn put_input_in_rse_imm12<C: LowerCtx<I = Inst>>(
    ctx: &mut C,
    input: InsnInput,
    narrow_mode: NarrowValueMode,
) -> ResultRSEImm12 {
    if let Some(imm_value) = input_to_const(ctx, input) {
        if let Some(i) = Imm12::maybe_from_u64(imm_value) {
            let out_ty_bits = ty_bits(ctx.input_ty(input.insn, input.input));
            let is_negative = (i.bits as u64) & (1 << (cmp::max(out_ty_bits, 1) - 1)) != 0;

            // This condition can happen if we matched a value that overflows the output type of
            // its `iconst` when viewed as a signed value (e.g. `iconst.i8 200`).
            // When that happens we need to lower as a negative value, which we cannot do here.
            if !(narrow_mode.is_signed() && is_negative) {
                return ResultRSEImm12::Imm12(i);
            }
        }
    }

    ResultRSEImm12::from_rse(put_input_in_rse(ctx, input, narrow_mode))
}

/// Like `put_input_in_rse_imm12` above, except is allowed to negate the
/// argument (assuming a two's-complement representation with the given bit
/// width) if this allows use of a 12-bit immediate. Used to flip `add`s with
/// negative immediates to `sub`s (and vice-versa).
pub(crate) fn put_input_in_rse_imm12_maybe_negated<C: LowerCtx<I = Inst>>(
    ctx: &mut C,
    input: InsnInput,
    twos_complement_bits: usize,
    narrow_mode: NarrowValueMode,
) -> (ResultRSEImm12, bool) {
    assert!(twos_complement_bits <= 64);
    if let Some(imm_value) = input_to_const(ctx, input) {
        if let Some(i) = Imm12::maybe_from_u64(imm_value) {
            return (ResultRSEImm12::Imm12(i), false);
        }
        let sign_extended =
            ((imm_value as i64) << (64 - twos_complement_bits)) >> (64 - twos_complement_bits);
        let inverted = sign_extended.wrapping_neg();
        if let Some(i) = Imm12::maybe_from_u64(inverted as u64) {
            return (ResultRSEImm12::Imm12(i), true);
        }
    }

    (
        ResultRSEImm12::from_rse(put_input_in_rse(ctx, input, narrow_mode)),
        false,
    )
}
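
// A small worked example (illustrative only): for `iadd x, iconst.i64 -3`, the
// raw immediate 0xffff_ffff_ffff_fffd does not fit in an Imm12, but its
// negation 3 does, so this returns `(ResultRSEImm12::Imm12(3), true)` and the
// caller is expected to flip `Add64` to `Sub64`, emitting `sub xd, xn, #3`.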

pub(crate) fn put_input_in_rs_immlogic<C: LowerCtx<I = Inst>>(
    ctx: &mut C,
    input: InsnInput,
    narrow_mode: NarrowValueMode,
) -> ResultRSImmLogic {
    if let Some(imm_value) = input_to_const(ctx, input) {
        let ty = ctx.input_ty(input.insn, input.input);
        let ty = if ty_bits(ty) < 32 { I32 } else { ty };
        if let Some(i) = ImmLogic::maybe_from_u64(imm_value, ty) {
            return ResultRSImmLogic::ImmLogic(i);
        }
    }

    ResultRSImmLogic::from_rs(put_input_in_rs(ctx, input, narrow_mode))
}

pub(crate) fn put_input_in_reg_immshift<C: LowerCtx<I = Inst>>(
    ctx: &mut C,
    input: InsnInput,
    shift_width_bits: usize,
) -> ResultRegImmShift {
    if let Some(imm_value) = input_to_const(ctx, input) {
        let imm_value = imm_value & ((shift_width_bits - 1) as u64);
        if let Some(immshift) = ImmShift::maybe_from_u64(imm_value) {
            return ResultRegImmShift::ImmShift(immshift);
        }
    }

    ResultRegImmShift::Reg(put_input_in_reg(ctx, input, NarrowValueMode::None))
}

//============================================================================
// ALU instruction constructors.

pub(crate) fn alu_inst_imm12(op: ALUOp, rd: Writable<Reg>, rn: Reg, rm: ResultRSEImm12) -> Inst {
    match rm {
        ResultRSEImm12::Imm12(imm12) => Inst::AluRRImm12 {
            alu_op: op,
            rd,
            rn,
            imm12,
        },
        ResultRSEImm12::Reg(rm) => Inst::AluRRR {
            alu_op: op,
            rd,
            rn,
            rm,
        },
        ResultRSEImm12::RegShift(rm, shiftop) => Inst::AluRRRShift {
            alu_op: op,
            rd,
            rn,
            rm,
            shiftop,
        },
        ResultRSEImm12::RegExtend(rm, extendop) => Inst::AluRRRExtend {
            alu_op: op,
            rd,
            rn,
            rm,
            extendop,
        },
    }
}
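
// Minimal usage sketch (assuming a `LowerCtx` named `ctx`, a destination `rd`,
// and `InsnInput`s `a` and `b` taken from a 64-bit `iadd`; not copied from a
// real caller):
//
//     let rn = put_input_in_reg(ctx, a, NarrowValueMode::None);
//     let rm = put_input_in_rse_imm12(ctx, b, NarrowValueMode::None);
//     ctx.emit(alu_inst_imm12(ALUOp::Add64, rd, rn, rm));
//
// Depending on how `b` was produced, this becomes an ADD with an immediate,
// shifted-register, extended-register, or plain register operand.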

pub(crate) fn alu_inst_immlogic(
    op: ALUOp,
    rd: Writable<Reg>,
    rn: Reg,
    rm: ResultRSImmLogic,
) -> Inst {
    match rm {
        ResultRSImmLogic::ImmLogic(imml) => Inst::AluRRImmLogic {
            alu_op: op,
            rd,
            rn,
            imml,
        },
        ResultRSImmLogic::Reg(rm) => Inst::AluRRR {
            alu_op: op,
            rd,
            rn,
            rm,
        },
        ResultRSImmLogic::RegShift(rm, shiftop) => Inst::AluRRRShift {
            alu_op: op,
            rd,
            rn,
            rm,
            shiftop,
        },
    }
}

pub(crate) fn alu_inst_immshift(
    op: ALUOp,
    rd: Writable<Reg>,
    rn: Reg,
    rm: ResultRegImmShift,
) -> Inst {
    match rm {
        ResultRegImmShift::ImmShift(immshift) => Inst::AluRRImmShift {
            alu_op: op,
            rd,
            rn,
            immshift,
        },
        ResultRegImmShift::Reg(rm) => Inst::AluRRR {
            alu_op: op,
            rd,
            rn,
            rm,
        },
    }
}

//============================================================================
// Lowering: addressing mode support. Takes instruction directly, rather
// than an `InsnInput`, to do more introspection.

/// 32-bit addends that make up an address: an input, and an extension mode on that
/// input.
type AddressAddend32List = SmallVec<[(Reg, ExtendOp); 4]>;
/// 64-bit addends that make up an address: just an input.
type AddressAddend64List = SmallVec<[Reg; 4]>;

/// Collect all addends that feed into an address computation, with extend-modes
/// on each.  Note that a load/store may have multiple address components (and
/// the CLIF semantics are that these components are added to form the final
/// address), but sometimes the CLIF that we receive still has arguments that
/// refer to `iadd` instructions. We also want to handle uextend/sextend below
/// the add(s).
///
/// We match any 64-bit add (and descend into its inputs), and we match any
/// 32-to-64-bit sign or zero extension. The returned 32-bit addend list uses
/// `ExtendOp` values to indicate how to extend each input:
///
/// - ExtendOp::SXTW: the associated input is 32 bits wide; do a sign-extension.
/// - ExtendOp::UXTW: the associated input is 32 bits wide; do a zero-extension.
///
/// 64-bit inputs are returned in the separate 64-bit addend list and need no extend.
///
/// We do not descend further into the inputs of extensions (unless the input is a
/// constant), because supporting (e.g.) a 32-bit add that is later extended would
/// require additional masking of high-order bits, which is too complex. So, in
/// essence, we descend any number of adds from the roots, collecting all 64-bit
/// address addends; then possibly support extensions at these leaves.
fn collect_address_addends<C: LowerCtx<I = Inst>>(
    ctx: &mut C,
    roots: &[InsnInput],
) -> (AddressAddend64List, AddressAddend32List, i64) {
    let mut result32: AddressAddend32List = SmallVec::new();
    let mut result64: AddressAddend64List = SmallVec::new();
    let mut offset: i64 = 0;

    let mut workqueue: SmallVec<[InsnInput; 4]> = roots.iter().cloned().collect();

    while let Some(input) = workqueue.pop() {
        debug_assert!(ty_bits(ctx.input_ty(input.insn, input.input)) == 64);
        if let Some((op, insn)) = maybe_input_insn_multi(
            ctx,
            input,
            &[
                Opcode::Uextend,
                Opcode::Sextend,
                Opcode::Iadd,
                Opcode::Iconst,
            ],
        ) {
            match op {
                Opcode::Uextend | Opcode::Sextend if ty_bits(ctx.input_ty(insn, 0)) == 32 => {
                    let extendop = if op == Opcode::Uextend {
                        ExtendOp::UXTW
                    } else {
                        ExtendOp::SXTW
                    };
                    let extendee_input = InsnInput { insn, input: 0 };
                    // If the input is a zero-extension of a constant, add the value to the known
                    // offset.
                    // Only do this for zero-extension, as generating a sign-extended
                    // constant may be more instructions than using the 'SXTW' addressing mode.
                    if let (Some(insn), ExtendOp::UXTW) = (
                        maybe_input_insn(ctx, extendee_input, Opcode::Iconst),
                        extendop,
                    ) {
                        let value = (ctx.get_constant(insn).unwrap() & 0xFFFF_FFFF_u64) as i64;
                        offset += value;
                    } else {
                        let reg = put_input_in_reg(ctx, extendee_input, NarrowValueMode::None);
                        result32.push((reg, extendop));
                    }
                }
                Opcode::Uextend | Opcode::Sextend => {
                    let reg = put_input_in_reg(ctx, input, NarrowValueMode::None);
                    result64.push(reg);
                }
                Opcode::Iadd => {
                    for input in 0..ctx.num_inputs(insn) {
                        let addend = InsnInput { insn, input };
                        workqueue.push(addend);
                    }
                }
                Opcode::Iconst => {
                    let value: i64 = ctx.get_constant(insn).unwrap() as i64;
                    offset += value;
                }
                _ => panic!("Unexpected opcode from maybe_input_insn_multi"),
            }
        } else {
            let reg = put_input_in_reg(ctx, input, NarrowValueMode::ZeroExtend64);
            result64.push(reg);
        }
    }

    (result64, result32, offset)
}
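
// For example (schematic CLIF; value numbers are illustrative): given roots
// that resolve to
//
//     v2 = uextend.i64 v1        ; v1 is a 32-bit index
//     v3 = iadd v0, v2           ; v0 is a 64-bit base pointer
//     v4 = iadd v3, v5           ; v5 = iconst.i64 16
//
// this returns `addends64 = [reg(v0)]`, `addends32 = [(reg(v1), ExtendOp::UXTW)]`,
// and `offset = 16`, which the address-mode selection below then consumes.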

/// Lower the address of a pair load or store.
pub(crate) fn lower_pair_address<C: LowerCtx<I = Inst>>(
    ctx: &mut C,
    roots: &[InsnInput],
    offset: i32,
) -> PairAMode {
    // Collect addends through an arbitrary tree of 32-to-64-bit sign/zero
    // extends and addition ops. We update these as we consume address
    // components, so they represent the remaining addends not yet handled.
    let (mut addends64, mut addends32, args_offset) = collect_address_addends(ctx, roots);
    let offset = args_offset + (offset as i64);

    log::trace!(
        "lower_pair_address: addends64 {:?}, addends32 {:?}, offset {}",
        addends64,
        addends32,
        offset
    );

    // Pairs only support a register + signed-immediate format, so that is the
    // only case we have to handle here.

    let base_reg = if let Some(reg64) = addends64.pop() {
        reg64
    } else if let Some((reg32, extendop)) = addends32.pop() {
        let tmp = ctx.alloc_tmp(I64).only_reg().unwrap();
        let signed = match extendop {
            ExtendOp::SXTW => true,
            ExtendOp::UXTW => false,
            _ => unreachable!(),
        };
        ctx.emit(Inst::Extend {
            rd: tmp,
            rn: reg32,
            signed,
            from_bits: 32,
            to_bits: 64,
        });
        tmp.to_reg()
    } else {
        zero_reg()
    };

    let addr = ctx.alloc_tmp(I64).only_reg().unwrap();
    ctx.emit(Inst::gen_move(addr, base_reg, I64));

    // We have the base register; if there are any other addends, add them in.
    lower_add_addends(ctx, addr, addends64, addends32);

    // Figure out what offset we should emit.
    let imm7 = SImm7Scaled::maybe_from_i64(offset, I64).unwrap_or_else(|| {
        lower_add_immediate(ctx, addr, addr.to_reg(), offset);
        SImm7Scaled::maybe_from_i64(0, I64).unwrap()
    });

    PairAMode::SignedOffset(addr.to_reg(), imm7)
}

/// Lower the address of a load or store.
pub(crate) fn lower_address<C: LowerCtx<I = Inst>>(
    ctx: &mut C,
    elem_ty: Type,
    roots: &[InsnInput],
    offset: i32,
) -> AMode {
    // TODO: support base_reg + scale * index_reg. For this, we would need to pattern-match shl or
    // mul instructions (Load/StoreComplex don't include scale factors).

    // Collect addends through an arbitrary tree of 32-to-64-bit sign/zero
    // extends and addition ops. We update these as we consume address
    // components, so they represent the remaining addends not yet handled.
    let (mut addends64, mut addends32, args_offset) = collect_address_addends(ctx, roots);
    let mut offset = args_offset + (offset as i64);

    log::trace!(
        "lower_address: addends64 {:?}, addends32 {:?}, offset {}",
        addends64,
        addends32,
        offset
    );

    // First, decide what the `AMode` will be. Take one extendee and one 64-bit
    // reg, or two 64-bit regs, or a 64-bit reg and a 32-bit reg with extension,
    // or some other combination as appropriate.
    let memarg = if addends64.len() > 0 {
        if addends32.len() > 0 {
            let (reg32, extendop) = addends32.pop().unwrap();
            let reg64 = addends64.pop().unwrap();
            AMode::RegExtended(reg64, reg32, extendop)
        } else if offset > 0 && offset < 0x1000 {
            let reg64 = addends64.pop().unwrap();
            let off = offset;
            offset = 0;
            AMode::RegOffset(reg64, off, elem_ty)
        } else if addends64.len() >= 2 {
            let reg1 = addends64.pop().unwrap();
            let reg2 = addends64.pop().unwrap();
            AMode::RegReg(reg1, reg2)
        } else {
            let reg1 = addends64.pop().unwrap();
            AMode::reg(reg1)
        }
    } else
    /* addends64.len() == 0 */
    {
        if addends32.len() > 0 {
            let tmp = ctx.alloc_tmp(I64).only_reg().unwrap();
            let (reg1, extendop) = addends32.pop().unwrap();
            let signed = match extendop {
                ExtendOp::SXTW => true,
                ExtendOp::UXTW => false,
                _ => unreachable!(),
            };
            ctx.emit(Inst::Extend {
                rd: tmp,
                rn: reg1,
                signed,
                from_bits: 32,
                to_bits: 64,
            });
            if let Some((reg2, extendop)) = addends32.pop() {
                AMode::RegExtended(tmp.to_reg(), reg2, extendop)
            } else {
                AMode::reg(tmp.to_reg())
            }
        } else
        /* addends32.len() == 0 */
        {
            let off_reg = ctx.alloc_tmp(I64).only_reg().unwrap();
            lower_constant_u64(ctx, off_reg, offset as u64);
            offset = 0;
            AMode::reg(off_reg.to_reg())
        }
    };

    // At this point, if we have any remaining components, we need to allocate a
    // temp, replace one of the registers in the AMode with the temp, and emit
    // instructions to add together the remaining components. Return immediately
    // if this is *not* the case.
    if offset == 0 && addends32.len() == 0 && addends64.len() == 0 {
        return memarg;
    }

    // Allocate the temp and shoehorn it into the AMode.
    let addr = ctx.alloc_tmp(I64).only_reg().unwrap();
    let (reg, memarg) = match memarg {
        AMode::RegExtended(r1, r2, extendop) => {
            (r1, AMode::RegExtended(addr.to_reg(), r2, extendop))
        }
        AMode::RegOffset(r, off, ty) => (r, AMode::RegOffset(addr.to_reg(), off, ty)),
        AMode::RegReg(r1, r2) => (r2, AMode::RegReg(addr.to_reg(), r1)),
        AMode::UnsignedOffset(r, imm) => (r, AMode::UnsignedOffset(addr.to_reg(), imm)),
        _ => unreachable!(),
    };

    // If there is any offset, load that first into `addr`, and add the `reg`
    // that we kicked out of the `AMode`; otherwise, start with that reg.
    if offset != 0 {
        lower_add_immediate(ctx, addr, reg, offset)
    } else {
        ctx.emit(Inst::gen_move(addr, reg, I64));
    }

    // Now handle reg64 and reg32-extended components.
    lower_add_addends(ctx, addr, addends64, addends32);

    memarg
}

fn lower_add_addends<C: LowerCtx<I = Inst>>(
    ctx: &mut C,
    rd: Writable<Reg>,
    addends64: AddressAddend64List,
    addends32: AddressAddend32List,
) {
    for reg in addends64 {
        // If the register is the stack reg, we must move it to another reg
        // before adding it.
        let reg = if reg == stack_reg() {
            let tmp = ctx.alloc_tmp(I64).only_reg().unwrap();
            ctx.emit(Inst::gen_move(tmp, stack_reg(), I64));
            tmp.to_reg()
        } else {
            reg
        };
        ctx.emit(Inst::AluRRR {
            alu_op: ALUOp::Add64,
            rd,
            rn: rd.to_reg(),
            rm: reg,
        });
    }
    for (reg, extendop) in addends32 {
        assert!(reg != stack_reg());
        ctx.emit(Inst::AluRRRExtend {
            alu_op: ALUOp::Add64,
            rd,
            rn: rd.to_reg(),
            rm: reg,
            extendop,
        });
    }
}

/// Adds the signed immediate `imm` to `src`, placing the result in `dst` and
/// choosing the best instruction pattern for the immediate.
// TODO: This function is duplicated in ctx.gen_add_imm
fn lower_add_immediate<C: LowerCtx<I = Inst>>(ctx: &mut C, dst: Writable<Reg>, src: Reg, imm: i64) {
    // If we can fit `imm` or `-imm` in an imm12, use an add/sub-immediate;
    // otherwise, lower the constant first and then add.
    if let Some(imm12) = Imm12::maybe_from_u64(imm as u64) {
        ctx.emit(Inst::AluRRImm12 {
            alu_op: ALUOp::Add64,
            rd: dst,
            rn: src,
            imm12,
        });
    } else if let Some(imm12) = Imm12::maybe_from_u64(imm.wrapping_neg() as u64) {
        ctx.emit(Inst::AluRRImm12 {
            alu_op: ALUOp::Sub64,
            rd: dst,
            rn: src,
            imm12,
        });
    } else {
        lower_constant_u64(ctx, dst, imm as u64);
        ctx.emit(Inst::AluRRR {
            alu_op: ALUOp::Add64,
            rd: dst,
            rn: dst.to_reg(),
            rm: src,
        });
    }
}
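
// Illustratively, the three cases above correspond to (the exact
// constant-loading sequence depends on `Inst::load_constant`):
//
//     imm = 0xfff        ->  add dst, src, #0xfff
//     imm = -0x10        ->  sub dst, src, #0x10
//     imm = 0x12345678   ->  movz/movk dst, ... ; add dst, dst, src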

pub(crate) fn lower_constant_u64<C: LowerCtx<I = Inst>>(
    ctx: &mut C,
    rd: Writable<Reg>,
    value: u64,
) {
    for inst in Inst::load_constant(rd, value) {
        ctx.emit(inst);
    }
}

pub(crate) fn lower_constant_f32<C: LowerCtx<I = Inst>>(
    ctx: &mut C,
    rd: Writable<Reg>,
    value: f32,
) {
    let alloc_tmp = |ty| ctx.alloc_tmp(ty).only_reg().unwrap();

    for inst in Inst::load_fp_constant32(rd, value.to_bits(), alloc_tmp) {
        ctx.emit(inst);
    }
}

pub(crate) fn lower_constant_f64<C: LowerCtx<I = Inst>>(
    ctx: &mut C,
    rd: Writable<Reg>,
    value: f64,
) {
    let alloc_tmp = |ty| ctx.alloc_tmp(ty).only_reg().unwrap();

    for inst in Inst::load_fp_constant64(rd, value.to_bits(), alloc_tmp) {
        ctx.emit(inst);
    }
}

pub(crate) fn lower_constant_f128<C: LowerCtx<I = Inst>>(
    ctx: &mut C,
    rd: Writable<Reg>,
    value: u128,
) {
    if value == 0 {
        // Fast-track a common case. The general case (calling
        // `Inst::load_fp_constant128`) is potentially expensive.
        ctx.emit(Inst::VecDupImm {
            rd,
            imm: ASIMDMovModImm::zero(ScalarSize::Size8),
            invert: false,
            size: VectorSize::Size8x16,
        });
    } else {
        let alloc_tmp = |ty| ctx.alloc_tmp(ty).only_reg().unwrap();
        for inst in Inst::load_fp_constant128(rd, value, alloc_tmp) {
            ctx.emit(inst);
        }
    }
}

pub(crate) fn lower_splat_const<C: LowerCtx<I = Inst>>(
    ctx: &mut C,
    rd: Writable<Reg>,
    value: u64,
    size: VectorSize,
) {
    let (value, narrow_size) = match size.lane_size() {
        ScalarSize::Size8 => (value as u8 as u64, ScalarSize::Size128),
        ScalarSize::Size16 => (value as u16 as u64, ScalarSize::Size8),
        ScalarSize::Size32 => (value as u32 as u64, ScalarSize::Size16),
        ScalarSize::Size64 => (value, ScalarSize::Size32),
        _ => unreachable!(),
    };
    let (value, size) = match Inst::get_replicated_vector_pattern(value as u128, narrow_size) {
        Some((value, lane_size)) => (
            value,
            VectorSize::from_lane_size(lane_size, size.is_128bits()),
        ),
        None => (value, size),
    };
    let alloc_tmp = |ty| ctx.alloc_tmp(ty).only_reg().unwrap();

    for inst in Inst::load_replicated_vector_pattern(rd, value, size, alloc_tmp) {
        ctx.emit(inst);
    }
}

pub(crate) fn lower_condcode(cc: IntCC) -> Cond {
    match cc {
        IntCC::Equal => Cond::Eq,
        IntCC::NotEqual => Cond::Ne,
        IntCC::SignedGreaterThanOrEqual => Cond::Ge,
        IntCC::SignedGreaterThan => Cond::Gt,
        IntCC::SignedLessThanOrEqual => Cond::Le,
        IntCC::SignedLessThan => Cond::Lt,
        IntCC::UnsignedGreaterThanOrEqual => Cond::Hs,
        IntCC::UnsignedGreaterThan => Cond::Hi,
        IntCC::UnsignedLessThanOrEqual => Cond::Ls,
        IntCC::UnsignedLessThan => Cond::Lo,
        IntCC::Overflow => Cond::Vs,
        IntCC::NotOverflow => Cond::Vc,
    }
}

pub(crate) fn lower_fp_condcode(cc: FloatCC) -> Cond {
    // Refer to `codegen/shared/src/condcodes.rs` and to the `FCMP` AArch64 docs.
    // The FCMP instruction sets:
    //                 NZCV
    // - PSTATE.NZCV = 0011 on UN (unordered),
    //                 0110 on EQ,
    //                 1000 on LT,
    //                 0010 on GT.
    match cc {
        // EQ | LT | GT. Vc => V clear.
        FloatCC::Ordered => Cond::Vc,
        // UN. Vs => V set.
        FloatCC::Unordered => Cond::Vs,
        // EQ. Eq => Z set.
        FloatCC::Equal => Cond::Eq,
        // UN | LT | GT. Ne => Z clear.
        FloatCC::NotEqual => Cond::Ne,
        // LT | GT.
        FloatCC::OrderedNotEqual => unimplemented!(),
        //  UN | EQ
        FloatCC::UnorderedOrEqual => unimplemented!(),
        // LT. Mi => N set.
        FloatCC::LessThan => Cond::Mi,
        // LT | EQ. Ls => C clear or Z set.
        FloatCC::LessThanOrEqual => Cond::Ls,
        // GT. Gt => Z clear, N = V.
        FloatCC::GreaterThan => Cond::Gt,
        // GT | EQ. Ge => N = V.
        FloatCC::GreaterThanOrEqual => Cond::Ge,
        // UN | LT
        FloatCC::UnorderedOrLessThan => unimplemented!(),
        // UN | LT | EQ
        FloatCC::UnorderedOrLessThanOrEqual => unimplemented!(),
        // UN | GT
        FloatCC::UnorderedOrGreaterThan => unimplemented!(),
        // UN | GT | EQ
        FloatCC::UnorderedOrGreaterThanOrEqual => unimplemented!(),
    }
}
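
// As an informal sanity check of the mapping above: `fcmp s0, s1` with
// s0 < s1 sets NZCV = 1000, so `Mi` (N set) is taken for `FloatCC::LessThan`,
// while an unordered comparison sets NZCV = 0011 and `Mi` is *not* taken,
// matching LessThan's "ordered and less than" semantics. Likewise `Ls`
// (C clear or Z set) covers exactly LT and EQ but neither GT (0010) nor
// UN (0011).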

pub(crate) fn lower_vector_compare<C: LowerCtx<I = Inst>>(
    ctx: &mut C,
    rd: Writable<Reg>,
    mut rn: Reg,
    mut rm: Reg,
    ty: Type,
    cond: Cond,
) -> CodegenResult<()> {
    let is_float = match ty {
        F32X4 | F64X2 => true,
        _ => false,
    };
    let size = VectorSize::from_ty(ty);
    // 'Less than' operations are implemented by swapping
    // the order of operands and using the 'greater than'
    // instructions.
    // 'Not equal' is implemented with 'equal' and inverting
    // the result.
    let (alu_op, swap) = match (is_float, cond) {
        (false, Cond::Eq) => (VecALUOp::Cmeq, false),
        (false, Cond::Ne) => (VecALUOp::Cmeq, false),
        (false, Cond::Ge) => (VecALUOp::Cmge, false),
        (false, Cond::Gt) => (VecALUOp::Cmgt, false),
        (false, Cond::Le) => (VecALUOp::Cmge, true),
        (false, Cond::Lt) => (VecALUOp::Cmgt, true),
        (false, Cond::Hs) => (VecALUOp::Cmhs, false),
        (false, Cond::Hi) => (VecALUOp::Cmhi, false),
        (false, Cond::Ls) => (VecALUOp::Cmhs, true),
        (false, Cond::Lo) => (VecALUOp::Cmhi, true),
        (true, Cond::Eq) => (VecALUOp::Fcmeq, false),
        (true, Cond::Ne) => (VecALUOp::Fcmeq, false),
        (true, Cond::Mi) => (VecALUOp::Fcmgt, true),
        (true, Cond::Ls) => (VecALUOp::Fcmge, true),
        (true, Cond::Ge) => (VecALUOp::Fcmge, false),
        (true, Cond::Gt) => (VecALUOp::Fcmgt, false),
        _ => unreachable!(),
    };

    if swap {
        std::mem::swap(&mut rn, &mut rm);
    }

    ctx.emit(Inst::VecRRR {
        alu_op,
        rd,
        rn,
        rm,
        size,
    });

    if cond == Cond::Ne {
        ctx.emit(Inst::VecMisc {
            op: VecMisc2::Not,
            rd,
            rn: rd.to_reg(),
            size,
        });
    }

    Ok(())
}
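
// For example (informal): an integer "signed less than" vector compare is
// lowered by swapping the operands and emitting `cmgt` (x < y iff y > x);
// a floating-point "not equal" is lowered as `fcmeq` followed by the
// `VecMisc2::Not` inversion above.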

/// Determines whether this condcode interprets inputs as signed or unsigned.  See the
/// documentation for the `icmp` instruction in cranelift-codegen/meta/src/shared/instructions.rs
/// for further insights into this.
pub(crate) fn condcode_is_signed(cc: IntCC) -> bool {
    match cc {
        IntCC::Equal
        | IntCC::UnsignedGreaterThanOrEqual
        | IntCC::UnsignedGreaterThan
        | IntCC::UnsignedLessThanOrEqual
        | IntCC::UnsignedLessThan
        | IntCC::NotEqual => false,
        IntCC::SignedGreaterThanOrEqual
        | IntCC::SignedGreaterThan
        | IntCC::SignedLessThanOrEqual
        | IntCC::SignedLessThan
        | IntCC::Overflow
        | IntCC::NotOverflow => true,
    }
}

//=============================================================================
// Helpers for instruction lowering.

pub(crate) fn choose_32_64<T: Copy>(ty: Type, op32: T, op64: T) -> T {
    let bits = ty_bits(ty);
    if bits <= 32 {
        op32
    } else if bits == 64 {
        op64
    } else {
        panic!("choose_32_64 on > 64 bits!")
    }
}

/// Checks for an instance of `op` feeding the given input.
pub(crate) fn maybe_input_insn<C: LowerCtx<I = Inst>>(
    c: &mut C,
    input: InsnInput,
    op: Opcode,
) -> Option<IRInst> {
    let inputs = c.get_input_as_source_or_const(input.insn, input.input);
    log::trace!(
        "maybe_input_insn: input {:?} has options {:?}; looking for op {:?}",
        input,
        inputs,
        op
    );
    if let Some((src_inst, _)) = inputs.inst {
        let data = c.data(src_inst);
        log::trace!(" -> input inst {:?}", data);
        if data.opcode() == op {
            return Some(src_inst);
        }
    }
    None
}

/// Checks for an instance of any one of `ops` feeding the given input.
pub(crate) fn maybe_input_insn_multi<C: LowerCtx<I = Inst>>(
    c: &mut C,
    input: InsnInput,
    ops: &[Opcode],
) -> Option<(Opcode, IRInst)> {
    for &op in ops {
        if let Some(inst) = maybe_input_insn(c, input, op) {
            return Some((op, inst));
        }
    }
    None
}

/// Checks for an instance of `op` feeding the given input, possibly via a conversion `conv` (e.g.,
/// Bint or a bitcast).
///
/// FIXME cfallin 2020-03-30: this is really ugly. Factor out tree-matching stuff and make it
/// a bit more generic.
pub(crate) fn maybe_input_insn_via_conv<C: LowerCtx<I = Inst>>(
    c: &mut C,
    input: InsnInput,
    op: Opcode,
    conv: Opcode,
) -> Option<IRInst> {
    let inputs = c.get_input_as_source_or_const(input.insn, input.input);
    if let Some((src_inst, _)) = inputs.inst {
        let data = c.data(src_inst);
        if data.opcode() == op {
            return Some(src_inst);
        }
        if data.opcode() == conv {
            let inputs = c.get_input_as_source_or_const(src_inst, 0);
            if let Some((src_inst, _)) = inputs.inst {
                let data = c.data(src_inst);
                if data.opcode() == op {
                    return Some(src_inst);
                }
            }
        }
    }
    None
}

/// Pattern match an extending vector multiplication.
/// Returns a tuple of the opcode to use, the two input registers and whether
/// it's the 'high half' version of the instruction.
pub(crate) fn match_vec_long_mul<C: LowerCtx<I = Inst>>(
    c: &mut C,
    insn: IRInst,
    ext_op: Opcode,
) -> Option<(VecRRRLongOp, regalloc::Reg, regalloc::Reg, bool)> {
    let inputs = insn_inputs(c, insn);
    if let Some(lhs) = maybe_input_insn(c, inputs[0], ext_op) {
        if let Some(rhs) = maybe_input_insn(c, inputs[1], ext_op) {
            let lhs_input = insn_inputs(c, lhs)[0];
            let rhs_input = insn_inputs(c, rhs)[0];
            let rn = put_input_in_reg(c, lhs_input, NarrowValueMode::None);
            let rm = put_input_in_reg(c, rhs_input, NarrowValueMode::None);
            let lane_type = c.output_ty(insn, 0).lane_type();
            match (lane_type, ext_op) {
                (I16, Opcode::SwidenLow) => return Some((VecRRRLongOp::Smull8, rn, rm, false)),
                (I16, Opcode::SwidenHigh) => return Some((VecRRRLongOp::Smull8, rn, rm, true)),
                (I16, Opcode::UwidenLow) => return Some((VecRRRLongOp::Umull8, rn, rm, false)),
                (I16, Opcode::UwidenHigh) => return Some((VecRRRLongOp::Umull8, rn, rm, true)),
                (I32, Opcode::SwidenLow) => return Some((VecRRRLongOp::Smull16, rn, rm, false)),
                (I32, Opcode::SwidenHigh) => return Some((VecRRRLongOp::Smull16, rn, rm, true)),
                (I32, Opcode::UwidenLow) => return Some((VecRRRLongOp::Umull16, rn, rm, false)),
                (I32, Opcode::UwidenHigh) => return Some((VecRRRLongOp::Umull16, rn, rm, true)),
                (I64, Opcode::SwidenLow) => return Some((VecRRRLongOp::Smull32, rn, rm, false)),
                (I64, Opcode::SwidenHigh) => return Some((VecRRRLongOp::Smull32, rn, rm, true)),
                (I64, Opcode::UwidenLow) => return Some((VecRRRLongOp::Umull32, rn, rm, false)),
                (I64, Opcode::UwidenHigh) => return Some((VecRRRLongOp::Umull32, rn, rm, true)),
                _ => {}
            };
        }
    }
    None
}
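
// Example of the pattern this matches (schematic): for an I32X4 output,
//
//     v2 = swiden_low v0
//     v3 = swiden_low v1
//     v4 = imul v2, v3
//
// `match_vec_long_mul` called on the `imul` with `Opcode::SwidenLow` returns
// `Some((VecRRRLongOp::Smull16, reg(v0), reg(v1), false))`, i.e. a single
// `smull` on the low halves; the `swiden_high` forms select the high-half
// (`smull2`-style) variant instead.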
1291 
lower_i64x2_mul<C: LowerCtx<I = Inst>>(c: &mut C, insn: IRInst)1292 pub(crate) fn lower_i64x2_mul<C: LowerCtx<I = Inst>>(c: &mut C, insn: IRInst) {
1293     let inputs = insn_inputs(c, insn);
1294     let outputs = insn_outputs(c, insn);
1295     let rd = get_output_reg(c, outputs[0]).regs()[0];
1296     let rn = put_input_in_regs(c, inputs[0]).regs()[0];
1297     let rm = put_input_in_regs(c, inputs[1]).regs()[0];
1298 
1299     let tmp1 = c.alloc_tmp(I64X2).only_reg().unwrap();
1300     let tmp2 = c.alloc_tmp(I64X2).only_reg().unwrap();
1301 
1302     // This I64X2 multiplication is performed with several 32-bit
1303     // operations.
1304 
1305     // 64-bit numbers x and y, can be represented as:
1306     //   x = a + 2^32(b)
1307     //   y = c + 2^32(d)
1308 
1309     // A 64-bit multiplication is:
1310     //   x * y = ac + 2^32(ad + bc) + 2^64(bd)
1311     // note: `2^64(bd)` can be ignored, the value is too large to fit in
1312     // 64 bits.
1313 
1314     // This sequence implements a I64X2 multiply, where the registers
1315     // `rn` and `rm` are split up into 32-bit components:
1316     //   rn = |d|c|b|a|
1317     //   rm = |h|g|f|e|
1318     //
1319     //   rn * rm = |cg + 2^32(ch + dg)|ae + 2^32(af + be)|
1320     //
1321     //  The sequence is:
1322     //  rev64 rd.4s, rm.4s
1323     //  mul rd.4s, rd.4s, rn.4s
1324     //  xtn tmp1.2s, rn.2d
1325     //  addp rd.4s, rd.4s, rd.4s
1326     //  xtn tmp2.2s, rm.2d
1327     //  shll rd.2d, rd.2s, #32
1328     //  umlal rd.2d, tmp2.2s, tmp1.2s
1329 
1330     // Reverse the 32-bit elements in the 64-bit words.
1331     //   rd = |g|h|e|f|
1332     c.emit(Inst::VecMisc {
1333         op: VecMisc2::Rev64,
1334         rd,
1335         rn: rm,
1336         size: VectorSize::Size32x4,
1337     });
1338 
1339     // Calculate the high half components.
1340     //   rd = |dg|ch|be|af|
1341     //
1342     // Note that this 32-bit multiply of the high half
1343     // discards the bits that would overflow, same as
1344     // if 64-bit operations were used. Also the Shll
1345     // below would shift out the overflow bits anyway.
1346     c.emit(Inst::VecRRR {
1347         alu_op: VecALUOp::Mul,
1348         rd,
1349         rn: rd.to_reg(),
1350         rm: rn,
1351         size: VectorSize::Size32x4,
1352     });
1353 
1354     // Extract the low half components of rn.
1355     //   tmp1 = |c|a|
1356     c.emit(Inst::VecRRNarrow {
1357         op: VecRRNarrowOp::Xtn64,
1358         rd: tmp1,
1359         rn,
1360         high_half: false,
1361     });
1362 
1363     // Sum the respective high half components.
1364     //   rd = |dg+ch|be+af|dg+ch|be+af|
1365     c.emit(Inst::VecRRR {
1366         alu_op: VecALUOp::Addp,
1367         rd,
1368         rn: rd.to_reg(),
1369         rm: rd.to_reg(),
1370         size: VectorSize::Size32x4,
1371     });
1372 
1373     // Extract the low half components of rm.
1374     //   tmp2 = |g|e|
1375     c.emit(Inst::VecRRNarrow {
1376         op: VecRRNarrowOp::Xtn64,
1377         rd: tmp2,
1378         rn: rm,
1379         high_half: false,
1380     });
1381 
1382     // Shift the high-half components into the high half.
1383     //   rd = |dg+ch << 32|be+af << 32|
1384     c.emit(Inst::VecRRLong {
1385         op: VecRRLongOp::Shll32,
1386         rd,
1387         rn: rd.to_reg(),
1388         high_half: false,
1389     });
1390 
1391     // Multiply the low components together, and accumulate with the high
1392     // half.
1393     //   rd = |rd[1] + cg|rd[0] + ae|
1394     c.emit(Inst::VecRRRLong {
1395         alu_op: VecRRRLongOp::Umlal32,
1396         rd,
1397         rn: tmp2.to_reg(),
1398         rm: tmp1.to_reg(),
1399         high_half: false,
1400     });
1401 }
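// The per-lane math above can be sanity-checked with a scalar model. This is a
// minimal sketch for illustration only; the helper below is hypothetical and not
// part of the lowering. It splits each 64-bit lane into 32-bit halves and
// recombines them, which must agree with a plain wrapping 64-bit multiply.
#[cfg(test)]
mod i64x2_mul_scalar_model {
    /// Scalar model of the half-word decomposition used by `lower_i64x2_mul`.
    fn mul64_via_32bit_halves(x: u64, y: u64) -> u64 {
        let (a, b) = (x & 0xffff_ffff, x >> 32); // x = a + 2^32 * b
        let (e, f) = (y & 0xffff_ffff, y >> 32); // y = e + 2^32 * f
        // x * y mod 2^64 = a*e + 2^32 * (a*f + b*e); the 2^64 * (b*f) term wraps away.
        let cross = (a * f).wrapping_add(b * e);
        (cross << 32).wrapping_add(a * e)
    }

    #[test]
    fn matches_wrapping_mul() {
        for &(x, y) in &[
            (0u64, 0u64),
            (1, u64::MAX),
            (0xdead_beef_cafe_babe, 0x0123_4567_89ab_cdef),
            (u64::MAX, u64::MAX),
        ] {
            assert_eq!(mul64_via_32bit_halves(x, y), x.wrapping_mul(y));
        }
    }
}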
1402 
1403 /// Specifies what [lower_icmp] should do with the comparison result when lowering.
1404 #[derive(Debug, Clone, PartialEq)]
1405 pub(crate) enum IcmpOutput {
1406     /// Lowers the comparison into a cond code, without materializing the result into a register.
1407     /// The emitted cond code can be checked in the resulting [IcmpResult].
1408     CondCode,
1409     /// Materializes the result into a register. This may overwrite any previously set flags.
1410     Register(Writable<Reg>),
1411 }
1412 
1413 impl IcmpOutput {
1414     pub fn reg(&self) -> Option<Writable<Reg>> {
1415         match self {
1416             IcmpOutput::CondCode => None,
1417             IcmpOutput::Register(reg) => Some(*reg),
1418         }
1419     }
1420 }
1421 
1422 /// The output of an Icmp lowering.
1423 #[derive(Debug, Clone, PartialEq)]
1424 pub(crate) enum IcmpResult {
1425     /// The result was output as the given [Cond]. Callers may perform operations using this [Cond]
1426     /// and its inverse; other [Cond]s are not guaranteed to be correct.
1427     CondCode(Cond),
1428     /// The result was materialized into the output register.
1429     Register,
1430 }
1431 
1432 impl IcmpResult {
1433     pub fn unwrap_cond(&self) -> Cond {
1434         match self {
1435             IcmpResult::CondCode(c) => *c,
1436             _ => panic!("Unwrapped cond, but IcmpResult was {:?}", self),
1437         }
1438     }
1439 }
1440 
1441 /// Lower an icmp comparison.
1442 ///
1443 /// We can either lower into the status flags or materialize the result into a register.
1444 /// This is controlled by the `output` parameter.
1445 pub(crate) fn lower_icmp<C: LowerCtx<I = Inst>>(
1446     ctx: &mut C,
1447     insn: IRInst,
1448     condcode: IntCC,
1449     output: IcmpOutput,
1450 ) -> CodegenResult<IcmpResult> {
1451     log::trace!(
1452         "lower_icmp: insn {}, condcode: {}, output: {:?}",
1453         insn,
1454         condcode,
1455         output
1456     );
1457 
1458     let rd = output.reg().unwrap_or(writable_zero_reg());
1459     let inputs = insn_inputs(ctx, insn);
1460     let cond = lower_condcode(condcode);
1461     let is_signed = condcode_is_signed(condcode);
1462     let ty = ctx.input_ty(insn, 0);
1463     let bits = ty_bits(ty);
1464     let narrow_mode = match (bits <= 32, is_signed) {
1465         (true, true) => NarrowValueMode::SignExtend32,
1466         (true, false) => NarrowValueMode::ZeroExtend32,
1467         (false, true) => NarrowValueMode::SignExtend64,
1468         (false, false) => NarrowValueMode::ZeroExtend64,
1469     };
1470     let mut should_materialize = output.reg().is_some();
1471 
1472     let out_condcode = if ty == I128 {
1473         let lhs = put_input_in_regs(ctx, inputs[0]);
1474         let rhs = put_input_in_regs(ctx, inputs[1]);
1475 
1476         let tmp1 = ctx.alloc_tmp(I64).only_reg().unwrap();
1477         let tmp2 = ctx.alloc_tmp(I64).only_reg().unwrap();
1478 
1479         match condcode {
1480             IntCC::Equal | IntCC::NotEqual => {
1481                 // eor     tmp1, lhs_lo, rhs_lo
1482                 // eor     tmp2, lhs_hi, rhs_hi
1483                 // orr     tmp1, tmp1, tmp2
                     // cmp     tmp1, xzr
1484                 // cset    dst, {eq, ne}
1485 
1486                 ctx.emit(Inst::AluRRR {
1487                     alu_op: ALUOp::Eor64,
1488                     rd: tmp1,
1489                     rn: lhs.regs()[0],
1490                     rm: rhs.regs()[0],
1491                 });
1492                 ctx.emit(Inst::AluRRR {
1493                     alu_op: ALUOp::Eor64,
1494                     rd: tmp2,
1495                     rn: lhs.regs()[1],
1496                     rm: rhs.regs()[1],
1497                 });
1498                 ctx.emit(Inst::AluRRR {
1499                     alu_op: ALUOp::Orr64,
1500                     rd: tmp1,
1501                     rn: tmp1.to_reg(),
1502                     rm: tmp2.to_reg(),
1503                 });
                     ctx.emit(Inst::AluRRR {
                         alu_op: ALUOp::SubS64,
                         rd: writable_zero_reg(),
                         rn: tmp1.to_reg(),
                         rm: zero_reg(),
                     });
1504             }
1505             IntCC::Overflow | IntCC::NotOverflow => {
1506                 // We can do a 128-bit add, discarding the results,
1507                 // and check the overflow flags at the end.
1508                 //
1509                 // adds    xzr, lhs_lo, rhs_lo
1510                 // adcs    xzr, lhs_hi, rhs_hi
1511                 // cset    dst, {vs, vc}
1512 
1513                 ctx.emit(Inst::AluRRR {
1514                     alu_op: ALUOp::AddS64,
1515                     rd: writable_zero_reg(),
1516                     rn: lhs.regs()[0],
1517                     rm: rhs.regs()[0],
1518                 });
1519                 ctx.emit(Inst::AluRRR {
1520                     alu_op: ALUOp::AdcS64,
1521                     rd: writable_zero_reg(),
1522                     rn: lhs.regs()[1],
1523                     rm: rhs.regs()[1],
1524                 });
1525             }
1526             _ => {
1527                 // cmp     lhs_lo, rhs_lo
1528                 // cset    tmp1, unsigned_cond
1529                 // cmp     lhs_hi, rhs_hi
1530                 // cset    tmp2, cond
1531                 // csel    dst, tmp1, tmp2, eq
1532 
1533                 let rd = output.reg().unwrap_or(tmp1);
1534                 let unsigned_cond = lower_condcode(condcode.unsigned());
1535 
1536                 ctx.emit(Inst::AluRRR {
1537                     alu_op: ALUOp::SubS64,
1538                     rd: writable_zero_reg(),
1539                     rn: lhs.regs()[0],
1540                     rm: rhs.regs()[0],
1541                 });
1542                 materialize_bool_result(ctx, insn, tmp1, unsigned_cond);
1543                 ctx.emit(Inst::AluRRR {
1544                     alu_op: ALUOp::SubS64,
1545                     rd: writable_zero_reg(),
1546                     rn: lhs.regs()[1],
1547                     rm: rhs.regs()[1],
1548                 });
1549                 materialize_bool_result(ctx, insn, tmp2, cond);
1550                 ctx.emit(Inst::CSel {
1551                     cond: Cond::Eq,
1552                     rd,
1553                     rn: tmp1.to_reg(),
1554                     rm: tmp2.to_reg(),
1555                 });
1556 
1557                 if output == IcmpOutput::CondCode {
1558                     // We only need to guarantee that the flags for `cond` are correct, so we can
1559                     // compare rd against 0 or 1.
1560 
1561                     // If the condition includes equality ("-or-equal"), we compare against 1 instead of 0.
1562                     if condcode.without_equal() != condcode {
1563                         lower_constant_u64(ctx, tmp2, 1);
1564                     }
1565 
1566                     let xzr = zero_reg();
1567                     let rd = rd.to_reg();
1568                     let tmp2 = tmp2.to_reg();
1569                     let (rn, rm) = match condcode {
1570                         IntCC::SignedGreaterThanOrEqual => (rd, tmp2),
1571                         IntCC::UnsignedGreaterThanOrEqual => (rd, tmp2),
1572                         IntCC::SignedLessThanOrEqual => (tmp2, rd),
1573                         IntCC::UnsignedLessThanOrEqual => (tmp2, rd),
1574                         IntCC::SignedGreaterThan => (rd, xzr),
1575                         IntCC::UnsignedGreaterThan => (rd, xzr),
1576                         IntCC::SignedLessThan => (xzr, rd),
1577                         IntCC::UnsignedLessThan => (xzr, rd),
1578                         _ => unreachable!(),
1579                     };
1580 
1581                     ctx.emit(Inst::AluRRR {
1582                         alu_op: ALUOp::SubS64,
1583                         rd: writable_zero_reg(),
1584                         rn,
1585                         rm,
1586                     });
1587                 }
1588 
1589                 // Prevent a second materialize_bool_result from being emitted at the end of the function.
1590                 should_materialize = false;
1591             }
1592         }
1593         cond
1594     } else if ty.is_vector() {
1595         assert_ne!(output, IcmpOutput::CondCode);
1596         should_materialize = false;
1597 
1598         let rn = put_input_in_reg(ctx, inputs[0], narrow_mode);
1599         let rm = put_input_in_reg(ctx, inputs[1], narrow_mode);
1600         lower_vector_compare(ctx, rd, rn, rm, ty, cond)?;
1601         cond
1602     } else {
1603         let rn = put_input_in_reg(ctx, inputs[0], narrow_mode);
1604         let rm = put_input_in_rse_imm12(ctx, inputs[1], narrow_mode);
1605 
1606         let is_overflow = condcode == IntCC::Overflow || condcode == IntCC::NotOverflow;
1607         let is_small_type = ty == I8 || ty == I16;
1608         let (cond, rn, rm) = if is_overflow && is_small_type {
1609             // Overflow checks for non-native types require additional instructions beyond
1610             // just the extend op.
1611             //
1612             // TODO: Codegen improvements: Merge the second sxt{h,b} into the following sub instruction.
1613             //
1614             // sxt{h,b}  w0, w0
1615             // sxt{h,b}  w1, w1
1616             // sub       w0, w0, w1
1617             // cmp       w0, w0, sxt{h,b}
1618             //
1619             // The result of this comparison is either the EQ or NE condition code, so we need to
1620             // signal that to the caller.
1621 
1622             let extend_op = if ty == I8 {
1623                 ExtendOp::SXTB
1624             } else {
1625                 ExtendOp::SXTH
1626             };
1627             let tmp1 = ctx.alloc_tmp(I32).only_reg().unwrap();
1628             ctx.emit(alu_inst_imm12(ALUOp::Sub32, tmp1, rn, rm));
1629 
1630             let out_cond = match condcode {
1631                 IntCC::Overflow => Cond::Ne,
1632                 IntCC::NotOverflow => Cond::Eq,
1633                 _ => unreachable!(),
1634             };
1635             (
1636                 out_cond,
1637                 tmp1.to_reg(),
1638                 ResultRSEImm12::RegExtend(tmp1.to_reg(), extend_op),
1639             )
1640         } else {
1641             (cond, rn, rm)
1642         };
1643 
1644         let alu_op = choose_32_64(ty, ALUOp::SubS32, ALUOp::SubS64);
1645         ctx.emit(alu_inst_imm12(alu_op, writable_zero_reg(), rn, rm));
1646         cond
1647     };
1648 
1649     // Most of the comparisons above produce flags by default; if the caller requested the result
1650     // in a register, we materialize those flags into one here. Some branches above already produce
1651     // the result in a register, so we skip those.
1652     if should_materialize {
1653         materialize_bool_result(ctx, insn, rd, cond);
1654     }
1655 
1656     Ok(match output {
1657         // We currently never emit a different register than what was asked for
1658         IcmpOutput::Register(_) => IcmpResult::Register,
1659         IcmpOutput::CondCode => IcmpResult::CondCode(out_condcode),
1660     })
1661 }
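// The general (non-equality, non-overflow) I128 path above implements the usual
// rule: when the high halves are equal, the unsigned comparison of the low halves
// decides, otherwise the comparison of the high halves decides. A minimal scalar
// sketch of that rule for signed less-than, for illustration only (the helper is
// hypothetical and not used by the lowering):
#[cfg(test)]
mod i128_icmp_scalar_model {
    /// Signed less-than on i128 values given as (lo, hi) half pairs.
    fn slt_i128(x: (u64, i64), y: (u64, i64)) -> bool {
        if x.1 == y.1 {
            // High halves equal: the low halves decide, compared as unsigned.
            x.0 < y.0
        } else {
            // Otherwise the signed comparison of the high halves decides.
            x.1 < y.1
        }
    }

    #[test]
    fn matches_native_i128_compare() {
        let split = |v: i128| (v as u64, (v >> 64) as i64);
        let vals: &[i128] = &[0, 1, -1, i128::MIN, i128::MAX, 1 << 64, -(1 << 64)];
        for &x in vals {
            for &y in vals {
                assert_eq!(slt_i128(split(x), split(y)), x < y);
            }
        }
    }
}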
1662 
1663 pub(crate) fn lower_fcmp_or_ffcmp_to_flags<C: LowerCtx<I = Inst>>(ctx: &mut C, insn: IRInst) {
1664     let ty = ctx.input_ty(insn, 0);
1665     let bits = ty_bits(ty);
1666     let inputs = [InsnInput { insn, input: 0 }, InsnInput { insn, input: 1 }];
1667     let rn = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None);
1668     let rm = put_input_in_reg(ctx, inputs[1], NarrowValueMode::None);
1669     match bits {
1670         32 => {
1671             ctx.emit(Inst::FpuCmp32 { rn, rm });
1672         }
1673         64 => {
1674             ctx.emit(Inst::FpuCmp64 { rn, rm });
1675         }
1676         _ => panic!("Unknown float size"),
1677     }
1678 }
1679 
1680 /// Materialize a boolean value into a register from the flags
1681 /// (e.g. set by a comparison).
1682 /// Produces the representation expected for bool operations: 0 / 1 for `b1`, and 0 / -1 (all ones) for wider bools.
1683 pub(crate) fn materialize_bool_result<C: LowerCtx<I = Inst>>(
1684     ctx: &mut C,
1685     insn: IRInst,
1686     rd: Writable<Reg>,
1687     cond: Cond,
1688 ) {
1689     // A boolean is 0 / -1; if output width is > 1 use `csetm`,
1690     // otherwise use `cset`.
1691     if ty_bits(ctx.output_ty(insn, 0)) > 1 {
1692         ctx.emit(Inst::CSetm { rd, cond });
1693     } else {
1694         ctx.emit(Inst::CSet { rd, cond });
1695     }
1696 }
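// A scalar sketch of the two boolean representations chosen above, for
// illustration only (the helper is hypothetical): `b1` results are 0 or 1
// (`cset`), wider boolean types are 0 or all-ones (`csetm`).
#[cfg(test)]
mod bool_result_model {
    /// Model of the register value produced for a bool of `ty_bits` width.
    fn bool_bits(value: bool, ty_bits: u32) -> u64 {
        if ty_bits > 1 {
            // csetm: 0 or -1 (all ones).
            if value { u64::MAX } else { 0 }
        } else {
            // cset: 0 or 1.
            value as u64
        }
    }

    #[test]
    fn representations() {
        assert_eq!(bool_bits(true, 1), 1);
        assert_eq!(bool_bits(true, 8), u64::MAX);
        assert_eq!(bool_bits(false, 32), 0);
    }
}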
1697 
1698 pub(crate) fn lower_shift_amt<C: LowerCtx<I = Inst>>(
1699     ctx: &mut C,
1700     amt_input: InsnInput,
1701     dst_ty: Type,
1702     tmp_reg: Writable<Reg>,
1703 ) -> ResultRegImmShift {
1704     let amt_ty = ctx.input_ty(amt_input.insn, amt_input.input);
1705 
1706     match (dst_ty, amt_ty) {
1707         // When shifting by amounts larger than the size of the type, the CLIF shift
1708         // instructions implement a "wrapping" behaviour, such that an i8 << 8 is
1709         // equivalent to i8 << 0.
1710         //
1711         // On i32 and i64 types this matches what the AArch64 shift instructions do, but on
1712         // smaller types (i16, i8) we need to do this manually, so we wrap the shift amount
1713         // with an AND instruction.
1714         (I16 | I8, _) => {
1715             // We can ignore the top half of the shift-amount register if its type is I128.
1716             let amt_reg = put_input_in_regs(ctx, amt_input).regs()[0];
1717             let mask = (ty_bits(dst_ty) - 1) as u64;
1718             ctx.emit(Inst::AluRRImmLogic {
1719                 alu_op: ALUOp::And32,
1720                 rd: tmp_reg,
1721                 rn: amt_reg,
1722                 imml: ImmLogic::maybe_from_u64(mask, I32).unwrap(),
1723             });
1724             ResultRegImmShift::Reg(tmp_reg.to_reg())
1725         }
1726         // TODO: We can use immlogic for i128 types here
1727         (I128, _) | (_, I128) => {
1728             // For I128 shifts we always put the shift amount in a register; no immediate (immlogic) form is used.
1729             ResultRegImmShift::Reg(put_input_in_regs(ctx, amt_input).regs()[0])
1730         }
1731         _ => put_input_in_reg_immshift(ctx, amt_input, ty_bits(dst_ty)),
1732     }
1733 }
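// The i8/i16 masking above relies on CLIF's wrapping shift semantics: the
// effective shift amount is `amt mod ty_bits`, which for power-of-two widths
// equals `amt & (ty_bits - 1)`. A minimal sketch of that equivalence, for
// illustration only (the helper below is hypothetical):
#[cfg(test)]
mod shift_amt_model {
    /// Effective shift amount for a `width`-bit shift, as produced by the AND above.
    fn effective_amt(amt: u64, width: u64) -> u64 {
        debug_assert!(width.is_power_of_two());
        amt & (width - 1)
    }

    #[test]
    fn masking_matches_modulo() {
        for &width in &[8u64, 16, 32, 64] {
            for amt in 0..130u64 {
                assert_eq!(effective_amt(amt, width), amt % width);
            }
        }
    }
}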
1734 
1735 /// This is target-word-size dependent, and it excludes booleans and reftypes.
1736 pub(crate) fn is_valid_atomic_transaction_ty(ty: Type) -> bool {
1737     match ty {
1738         I8 | I16 | I32 | I64 => true,
1739         _ => false,
1740     }
1741 }
1742 
1743 pub(crate) fn emit_atomic_load<C: LowerCtx<I = Inst>>(
1744     ctx: &mut C,
1745     rt: Writable<Reg>,
1746     insn: IRInst,
1747 ) {
1748     assert!(ctx.data(insn).opcode() == Opcode::AtomicLoad);
1749     let inputs = insn_inputs(ctx, insn);
1750     let rn = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None);
1751     let access_ty = ctx.output_ty(insn, 0);
1752     assert!(is_valid_atomic_transaction_ty(access_ty));
1753     // We're ignoring the result type of the load because the LoadAcquire will
1754     // explicitly zero-extend to the nearest word, and also zero the high half
1755     // of an X register.
1756     ctx.emit(Inst::LoadAcquire { access_ty, rt, rn });
1757 }
1758 
1759 fn load_op_to_ty(op: Opcode) -> Option<Type> {
1760     match op {
1761         Opcode::Sload8 | Opcode::Uload8 | Opcode::Sload8Complex | Opcode::Uload8Complex => Some(I8),
1762         Opcode::Sload16 | Opcode::Uload16 | Opcode::Sload16Complex | Opcode::Uload16Complex => {
1763             Some(I16)
1764         }
1765         Opcode::Sload32 | Opcode::Uload32 | Opcode::Sload32Complex | Opcode::Uload32Complex => {
1766             Some(I32)
1767         }
1768         Opcode::Load | Opcode::LoadComplex => None,
1769         Opcode::Sload8x8 | Opcode::Uload8x8 | Opcode::Sload8x8Complex | Opcode::Uload8x8Complex => {
1770             Some(I8X8)
1771         }
1772         Opcode::Sload16x4
1773         | Opcode::Uload16x4
1774         | Opcode::Sload16x4Complex
1775         | Opcode::Uload16x4Complex => Some(I16X4),
1776         Opcode::Sload32x2
1777         | Opcode::Uload32x2
1778         | Opcode::Sload32x2Complex
1779         | Opcode::Uload32x2Complex => Some(I32X2),
1780         _ => None,
1781     }
1782 }
1783 
1784 /// Helper to lower a load instruction; this is used in several places, because
1785 /// a load can sometimes be merged into another operation.
1786 pub(crate) fn lower_load<
1787     C: LowerCtx<I = Inst>,
1788     F: FnMut(&mut C, ValueRegs<Writable<Reg>>, Type, AMode),
1789 >(
1790     ctx: &mut C,
1791     ir_inst: IRInst,
1792     inputs: &[InsnInput],
1793     output: InsnOutput,
1794     mut f: F,
1795 ) {
1796     let op = ctx.data(ir_inst).opcode();
1797 
1798     let elem_ty = load_op_to_ty(op).unwrap_or_else(|| ctx.output_ty(ir_inst, 0));
1799 
1800     let off = ctx.data(ir_inst).load_store_offset().unwrap();
1801     let mem = lower_address(ctx, elem_ty, &inputs[..], off);
1802     let rd = get_output_reg(ctx, output);
1803 
1804     f(ctx, rd, elem_ty, mem);
1805 }
1806 
1807 pub(crate) fn emit_shl_i128<C: LowerCtx<I = Inst>>(
1808     ctx: &mut C,
1809     src: ValueRegs<Reg>,
1810     dst: ValueRegs<Writable<Reg>>,
1811     amt: Reg,
1812 ) {
1813     let src_lo = src.regs()[0];
1814     let src_hi = src.regs()[1];
1815     let dst_lo = dst.regs()[0];
1816     let dst_hi = dst.regs()[1];
1817 
1818     //     mvn     inv_amt, amt
1819     //     lsr     tmp1, src_lo, #1
1820     //     lsl     tmp2, src_hi, amt
1821     //     lsr     tmp1, tmp1, inv_amt
1822     //     lsl     tmp3, src_lo, amt
1823     //     tst     amt, #0x40
1824     //     orr     tmp2, tmp2, tmp1
1825     //     csel    dst_hi, tmp3, tmp2, ne
1826     //     csel    dst_lo, xzr, tmp3, ne
1827 
1828     let xzr = writable_zero_reg();
1829     let inv_amt = ctx.alloc_tmp(I64).only_reg().unwrap();
1830     let tmp1 = ctx.alloc_tmp(I64).only_reg().unwrap();
1831     let tmp2 = ctx.alloc_tmp(I64).only_reg().unwrap();
1832     let tmp3 = ctx.alloc_tmp(I64).only_reg().unwrap();
1833 
1834     ctx.emit(Inst::AluRRR {
1835         alu_op: ALUOp::OrrNot32,
1836         rd: inv_amt,
1837         rn: xzr.to_reg(),
1838         rm: amt,
1839     });
1840 
1841     ctx.emit(Inst::AluRRImmShift {
1842         alu_op: ALUOp::Lsr64,
1843         rd: tmp1,
1844         rn: src_lo,
1845         immshift: ImmShift::maybe_from_u64(1).unwrap(),
1846     });
1847 
1848     ctx.emit(Inst::AluRRR {
1849         alu_op: ALUOp::Lsl64,
1850         rd: tmp2,
1851         rn: src_hi,
1852         rm: amt,
1853     });
1854 
1855     ctx.emit(Inst::AluRRR {
1856         alu_op: ALUOp::Lsr64,
1857         rd: tmp1,
1858         rn: tmp1.to_reg(),
1859         rm: inv_amt.to_reg(),
1860     });
1861 
1862     ctx.emit(Inst::AluRRR {
1863         alu_op: ALUOp::Lsl64,
1864         rd: tmp3,
1865         rn: src_lo,
1866         rm: amt,
1867     });
1868 
1869     ctx.emit(Inst::AluRRImmLogic {
1870         alu_op: ALUOp::AndS64,
1871         rd: xzr,
1872         rn: amt,
1873         imml: ImmLogic::maybe_from_u64(64, I64).unwrap(),
1874     });
1875 
1876     ctx.emit(Inst::AluRRR {
1877         alu_op: ALUOp::Orr64,
1878         rd: tmp2,
1879         rn: tmp2.to_reg(),
1880         rm: tmp1.to_reg(),
1881     });
1882 
1883     ctx.emit(Inst::CSel {
1884         cond: Cond::Ne,
1885         rd: dst_hi,
1886         rn: tmp3.to_reg(),
1887         rm: tmp2.to_reg(),
1888     });
1889 
1890     ctx.emit(Inst::CSel {
1891         cond: Cond::Ne,
1892         rd: dst_lo,
1893         rn: xzr.to_reg(),
1894         rm: tmp3.to_reg(),
1895     });
1896 }
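// A scalar model of the shift-left sequence above, for illustration only (the
// helper is hypothetical): for amounts below 64 the low word feeds bits into the
// high word, while for amounts of 64 or more the (shifted) low word becomes the
// high word and the low word becomes zero. The amount is reduced modulo 128,
// matching the wrapping semantics described in `lower_shift_amt`.
#[cfg(test)]
mod shl_i128_model {
    fn shl_i128_via_halves(lo: u64, hi: u64, amt: u32) -> (u64, u64) {
        let amt = amt % 128;
        if amt == 0 {
            (lo, hi)
        } else if amt < 64 {
            (lo << amt, (hi << amt) | (lo >> (64 - amt)))
        } else {
            (0, lo << (amt - 64))
        }
    }

    #[test]
    fn matches_native_u128_shift() {
        let x: u128 = 0x0123_4567_89ab_cdef_fedc_ba98_7654_3210;
        for amt in 0..128u32 {
            let (lo, hi) = shl_i128_via_halves(x as u64, (x >> 64) as u64, amt);
            assert_eq!(((hi as u128) << 64) | lo as u128, x << amt);
        }
    }
}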
1897 
1898 pub(crate) fn emit_shr_i128<C: LowerCtx<I = Inst>>(
1899     ctx: &mut C,
1900     src: ValueRegs<Reg>,
1901     dst: ValueRegs<Writable<Reg>>,
1902     amt: Reg,
1903     is_signed: bool,
1904 ) {
1905     let src_lo = src.regs()[0];
1906     let src_hi = src.regs()[1];
1907     let dst_lo = dst.regs()[0];
1908     let dst_hi = dst.regs()[1];
1909 
1910     //     mvn       inv_amt, amt
1911     //     lsl       tmp1, src_hi, #1
1912     //     lsr       tmp2, src_lo, amt
1913     //     lsl       tmp1, tmp1, inv_amt
1914     //     lsr/asr   tmp3, src_hi, amt
1915     //     tst       amt, #0x40
1916     //     orr       tmp2, tmp2, tmp1
1917     //
1918     //     if signed:
1919     //         asr     tmp4, src_hi, #63
1920     //         csel    dst_hi, tmp4, tmp3, ne
1921     //     else:
1922     //         csel    dst_hi, xzr, tmp3, ne
1923     //
1924     //     csel      dst_lo, tmp3, tmp2, ne
1925 
1926     let xzr = writable_zero_reg();
1927     let inv_amt = ctx.alloc_tmp(I64).only_reg().unwrap();
1928     let tmp1 = ctx.alloc_tmp(I64).only_reg().unwrap();
1929     let tmp2 = ctx.alloc_tmp(I64).only_reg().unwrap();
1930     let tmp3 = ctx.alloc_tmp(I64).only_reg().unwrap();
1931     let tmp4 = ctx.alloc_tmp(I64).only_reg().unwrap();
1932 
1933     let shift_op = if is_signed {
1934         ALUOp::Asr64
1935     } else {
1936         ALUOp::Lsr64
1937     };
1938 
1939     ctx.emit(Inst::AluRRR {
1940         alu_op: ALUOp::OrrNot32,
1941         rd: inv_amt,
1942         rn: xzr.to_reg(),
1943         rm: amt,
1944     });
1945 
1946     ctx.emit(Inst::AluRRImmShift {
1947         alu_op: ALUOp::Lsl64,
1948         rd: tmp1,
1949         rn: src_hi,
1950         immshift: ImmShift::maybe_from_u64(1).unwrap(),
1951     });
1952 
1953     ctx.emit(Inst::AluRRR {
1954         alu_op: ALUOp::Lsr64,
1955         rd: tmp2,
1956         rn: src_lo,
1957         rm: amt,
1958     });
1959 
1960     ctx.emit(Inst::AluRRR {
1961         alu_op: ALUOp::Lsl64,
1962         rd: tmp1,
1963         rn: tmp1.to_reg(),
1964         rm: inv_amt.to_reg(),
1965     });
1966 
1967     ctx.emit(Inst::AluRRR {
1968         alu_op: shift_op,
1969         rd: tmp3,
1970         rn: src_hi,
1971         rm: amt,
1972     });
1973 
1974     ctx.emit(Inst::AluRRImmLogic {
1975         alu_op: ALUOp::AndS64,
1976         rd: xzr,
1977         rn: amt,
1978         imml: ImmLogic::maybe_from_u64(64, I64).unwrap(),
1979     });
1980 
1981     if is_signed {
1982         ctx.emit(Inst::AluRRImmShift {
1983             alu_op: ALUOp::Asr64,
1984             rd: tmp4,
1985             rn: src_hi,
1986             immshift: ImmShift::maybe_from_u64(63).unwrap(),
1987         });
1988     }
1989 
1990     ctx.emit(Inst::AluRRR {
1991         alu_op: ALUOp::Orr64,
1992         rd: tmp2,
1993         rn: tmp2.to_reg(),
1994         rm: tmp1.to_reg(),
1995     });
1996 
1997     ctx.emit(Inst::CSel {
1998         cond: Cond::Ne,
1999         rd: dst_hi,
2000         rn: if is_signed { tmp4 } else { xzr }.to_reg(),
2001         rm: tmp3.to_reg(),
2002     });
2003 
2004     ctx.emit(Inst::CSel {
2005         cond: Cond::Ne,
2006         rd: dst_lo,
2007         rn: tmp3.to_reg(),
2008         rm: tmp2.to_reg(),
2009     });
2010 }
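// A scalar model of the right-shift sequence above, for illustration only (the
// helper is hypothetical): for amounts below 64 the high word feeds bits into
// the low word, while for amounts of 64 or more the shifted high word becomes
// the low word and the new high word is zero (logical) or a sign fill
// (arithmetic).
#[cfg(test)]
mod shr_i128_model {
    fn shr_i128_via_halves(lo: u64, hi: u64, amt: u32, is_signed: bool) -> (u64, u64) {
        let amt = amt % 128;
        let shift_hi = |s: u32| {
            if is_signed {
                ((hi as i64) >> s) as u64
            } else {
                hi >> s
            }
        };
        let sign_fill = if is_signed { ((hi as i64) >> 63) as u64 } else { 0 };
        if amt == 0 {
            (lo, hi)
        } else if amt < 64 {
            ((lo >> amt) | (hi << (64 - amt)), shift_hi(amt))
        } else {
            (shift_hi(amt - 64), sign_fill)
        }
    }

    #[test]
    fn matches_native_128_bit_shifts() {
        let x: u128 = 0xfedc_ba98_7654_3210_0123_4567_89ab_cdef;
        for amt in 0..128u32 {
            // Unsigned (ushr-style) shift.
            let (lo, hi) = shr_i128_via_halves(x as u64, (x >> 64) as u64, amt, false);
            assert_eq!(((hi as u128) << 64) | lo as u128, x >> amt);
            // Signed (sshr-style) shift.
            let (lo, hi) = shr_i128_via_halves(x as u64, (x >> 64) as u64, amt, true);
            assert_eq!(((hi as u128) << 64) | lo as u128, ((x as i128) >> amt) as u128);
        }
    }
}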
2011 
2012 pub(crate) fn emit_clz_i128<C: LowerCtx<I = Inst>>(
2013     ctx: &mut C,
2014     src: ValueRegs<Reg>,
2015     dst: ValueRegs<Writable<Reg>>,
2016 ) {
2017     let src_lo = src.regs()[0];
2018     let src_hi = src.regs()[1];
2019     let dst_lo = dst.regs()[0];
2020     let dst_hi = dst.regs()[1];
2021 
2022     // clz dst_hi, src_hi
2023     // clz dst_lo, src_lo
2024     // lsr tmp, dst_hi, #6
2025     // madd dst_lo, dst_lo, tmp, dst_hi
2026     // mov  dst_hi, 0
2027 
2028     let tmp = ctx.alloc_tmp(I64).only_reg().unwrap();
2029 
2030     ctx.emit(Inst::BitRR {
2031         rd: dst_hi,
2032         rn: src_hi,
2033         op: BitOp::Clz64,
2034     });
2035     ctx.emit(Inst::BitRR {
2036         rd: dst_lo,
2037         rn: src_lo,
2038         op: BitOp::Clz64,
2039     });
2040     ctx.emit(Inst::AluRRImmShift {
2041         alu_op: ALUOp::Lsr64,
2042         rd: tmp,
2043         rn: dst_hi.to_reg(),
2044         immshift: ImmShift::maybe_from_u64(6).unwrap(),
2045     });
2046     ctx.emit(Inst::AluRRRR {
2047         alu_op: ALUOp3::MAdd64,
2048         rd: dst_lo,
2049         rn: dst_lo.to_reg(),
2050         rm: tmp.to_reg(),
2051         ra: dst_hi.to_reg(),
2052     });
2053     lower_constant_u64(ctx, dst_hi, 0);
2054 }
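// A scalar model of the count-leading-zeros composition above, for illustration
// only (the helper is hypothetical): the result is clz(hi), plus clz(lo) exactly
// when the high word is zero, which is what the `lsr #6` / `madd` pair computes
// (the clz of a 64-bit value has bit 6 set only when it is 64, i.e. the value is 0).
#[cfg(test)]
mod clz_i128_model {
    fn clz_i128_via_halves(lo: u64, hi: u64) -> u64 {
        let clz_hi = hi.leading_zeros() as u64;
        let clz_lo = lo.leading_zeros() as u64;
        // (clz_hi >> 6) is 1 iff clz_hi == 64, i.e. iff hi == 0.
        clz_lo * (clz_hi >> 6) + clz_hi
    }

    #[test]
    fn matches_native_leading_zeros() {
        for &x in &[0u128, 1, u128::MAX, 1 << 64, (1 << 64) - 1, 1 << 127] {
            let (lo, hi) = (x as u64, (x >> 64) as u64);
            assert_eq!(clz_i128_via_halves(lo, hi), x.leading_zeros() as u64);
        }
    }
}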
2055 
2056 //=============================================================================
2057 // Lowering-backend trait implementation.
2058 
2059 impl LowerBackend for AArch64Backend {
2060     type MInst = Inst;
2061 
2062     fn lower<C: LowerCtx<I = Inst>>(&self, ctx: &mut C, ir_inst: IRInst) -> CodegenResult<()> {
2063         lower_inst::lower_insn_to_regs(ctx, ir_inst, &self.flags, &self.isa_flags)
2064     }
2065 
2066     fn lower_branch_group<C: LowerCtx<I = Inst>>(
2067         &self,
2068         ctx: &mut C,
2069         branches: &[IRInst],
2070         targets: &[MachLabel],
2071     ) -> CodegenResult<()> {
2072         lower_inst::lower_branch(ctx, branches, targets)
2073     }
2074 
2075     fn maybe_pinned_reg(&self) -> Option<Reg> {
2076         Some(xreg(PINNED_REG))
2077     }
2078 }
2079