//! Lowering rules for AArch64.
//!
//! TODO: opportunities for better code generation:
//!
//! - Smarter use of addressing modes. Recognize a+SCALE*b patterns. Recognize
//!   pre/post-index opportunities.
//!
//! - Floating-point immediates (FIMM instruction).

use crate::ir::condcodes::{FloatCC, IntCC};
use crate::ir::types::*;
use crate::ir::Inst as IRInst;
use crate::ir::{Opcode, Type};
use crate::machinst::lower::*;
use crate::machinst::*;
use crate::CodegenResult;

use crate::isa::aarch64::inst::*;
use crate::isa::aarch64::AArch64Backend;

use super::lower_inst;

use crate::data_value::DataValue;
use regalloc::{Reg, Writable};
use smallvec::SmallVec;
use std::cmp;

//============================================================================
// Result enum types.
//
// Lowering of a given value results in one of these enums, depending on the
// modes in which we can accept the value.

/// A lowering result: register, register-shift. An SSA value can always be
/// lowered into one of these options; the register form is the fallback.
#[derive(Clone, Debug)]
enum ResultRS {
    Reg(Reg),
    RegShift(Reg, ShiftOpAndAmt),
}

/// A lowering result: register, register-shift, register-extend. An SSA value can always be
/// lowered into one of these options; the register form is the fallback.
#[derive(Clone, Debug)]
enum ResultRSE {
    Reg(Reg),
    RegShift(Reg, ShiftOpAndAmt),
    RegExtend(Reg, ExtendOp),
}

impl ResultRSE {
    fn from_rs(rs: ResultRS) -> ResultRSE {
        match rs {
            ResultRS::Reg(r) => ResultRSE::Reg(r),
            ResultRS::RegShift(r, s) => ResultRSE::RegShift(r, s),
        }
    }
}

/// A lowering result: register, register-shift, register-extend, or 12-bit immediate form.
/// An SSA value can always be lowered into one of these options; the register form is the
/// fallback.
#[derive(Clone, Debug)]
pub(crate) enum ResultRSEImm12 {
    Reg(Reg),
    RegShift(Reg, ShiftOpAndAmt),
    RegExtend(Reg, ExtendOp),
    Imm12(Imm12),
}

impl ResultRSEImm12 {
    fn from_rse(rse: ResultRSE) -> ResultRSEImm12 {
        match rse {
            ResultRSE::Reg(r) => ResultRSEImm12::Reg(r),
            ResultRSE::RegShift(r, s) => ResultRSEImm12::RegShift(r, s),
            ResultRSE::RegExtend(r, e) => ResultRSEImm12::RegExtend(r, e),
        }
    }
}

/// A lowering result: register, register-shift, or logical immediate form.
/// An SSA value can always be lowered into one of these options; the register form is the
/// fallback.
#[derive(Clone, Debug)]
pub(crate) enum ResultRSImmLogic {
    Reg(Reg),
    RegShift(Reg, ShiftOpAndAmt),
    ImmLogic(ImmLogic),
}

impl ResultRSImmLogic {
    fn from_rs(rs: ResultRS) -> ResultRSImmLogic {
        match rs {
            ResultRS::Reg(r) => ResultRSImmLogic::Reg(r),
            ResultRS::RegShift(r, s) => ResultRSImmLogic::RegShift(r, s),
        }
    }
}

/// A lowering result: register or immediate shift amount (arg to a shift op).
/// An SSA value can always be lowered into one of these options; the register form is the
/// fallback.
#[derive(Clone, Debug)]
pub(crate) enum ResultRegImmShift {
    Reg(Reg),
    ImmShift(ImmShift),
}

impl ResultRegImmShift {
    pub fn unwrap_reg(self) -> Reg {
        match self {
            ResultRegImmShift::Reg(r) => r,
            _ => panic!("Unwrapped ResultRegImmShift, expected reg, got: {:?}", self),
        }
    }
}

//============================================================================
// Lowering: convert instruction inputs to forms that we can use.

/// Lower an instruction input to a 64-bit constant, if possible.
pub(crate) fn input_to_const<C: LowerCtx<I = Inst>>(ctx: &mut C, input: InsnInput) -> Option<u64> {
    let input = ctx.get_input_as_source_or_const(input.insn, input.input);
    input.constant
}

/// Lower an instruction input to a constant register-shift amount, if possible.
pub(crate) fn input_to_shiftimm<C: LowerCtx<I = Inst>>(
    ctx: &mut C,
    input: InsnInput,
) -> Option<ShiftOpShiftImm> {
    input_to_const(ctx, input).and_then(ShiftOpShiftImm::maybe_from_shift)
}

/// Extract a `u128` from an instruction's 128-bit immediate (e.g. `vconst`), if present.
pub(crate) fn const_param_to_u128<C: LowerCtx<I = Inst>>(
    ctx: &mut C,
    inst: IRInst,
) -> Option<u128> {
    match ctx.get_immediate(inst) {
        Some(DataValue::V128(bytes)) => Some(u128::from_le_bytes(bytes)),
        _ => None,
    }
}

/// How to handle narrow values loaded into registers; see note on `narrow_mode`
/// parameter to `put_input_in_*` below.
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
pub(crate) enum NarrowValueMode {
    None,
    /// Zero-extend to 32 bits if original is < 32 bits.
    ZeroExtend32,
    /// Sign-extend to 32 bits if original is < 32 bits.
    SignExtend32,
    /// Zero-extend to 64 bits if original is < 64 bits.
    ZeroExtend64,
    /// Sign-extend to 64 bits if original is < 64 bits.
    SignExtend64,
}

impl NarrowValueMode {
    fn is_32bit(&self) -> bool {
        match self {
            NarrowValueMode::None => false,
            NarrowValueMode::ZeroExtend32 | NarrowValueMode::SignExtend32 => true,
            NarrowValueMode::ZeroExtend64 | NarrowValueMode::SignExtend64 => false,
        }
    }

    fn is_signed(&self) -> bool {
        match self {
            NarrowValueMode::SignExtend32 | NarrowValueMode::SignExtend64 => true,
            NarrowValueMode::ZeroExtend32 | NarrowValueMode::ZeroExtend64 => false,
            NarrowValueMode::None => false,
        }
    }
}

/// Emits instruction(s) to generate the given constant value into newly-allocated
/// temporary registers, returning these registers.
fn generate_constant<C: LowerCtx<I = Inst>>(ctx: &mut C, ty: Type, c: u128) -> ValueRegs<Reg> {
    let from_bits = ty_bits(ty);
    let masked = if from_bits < 128 {
        c & ((1u128 << from_bits) - 1)
    } else {
        c
    };

    let cst_copy = ctx.alloc_tmp(ty);
    for inst in Inst::gen_constant(cst_copy, masked, ty, |ty| {
        ctx.alloc_tmp(ty).only_reg().unwrap()
    })
    .into_iter()
    {
        ctx.emit(inst);
    }
    non_writable_value_regs(cst_copy)
}

/// Extends a register according to `narrow_mode`.
/// If extended, the value is always extended to 64 bits, for simplicity.
fn extend_reg<C: LowerCtx<I = Inst>>(
    ctx: &mut C,
    ty: Type,
    in_reg: Reg,
    is_const: bool,
    narrow_mode: NarrowValueMode,
) -> Reg {
    let from_bits = ty_bits(ty) as u8;
    match (narrow_mode, from_bits) {
        (NarrowValueMode::None, _) => in_reg,
        (NarrowValueMode::ZeroExtend32, n) if n < 32 => {
            let tmp = ctx.alloc_tmp(I32).only_reg().unwrap();
            ctx.emit(Inst::Extend {
                rd: tmp,
                rn: in_reg,
                signed: false,
                from_bits,
                to_bits: 32,
            });
            tmp.to_reg()
        }
        (NarrowValueMode::SignExtend32, n) if n < 32 => {
            let tmp = ctx.alloc_tmp(I32).only_reg().unwrap();
            ctx.emit(Inst::Extend {
                rd: tmp,
                rn: in_reg,
                signed: true,
                from_bits,
                to_bits: 32,
            });
            tmp.to_reg()
        }
        (NarrowValueMode::ZeroExtend32, 32) | (NarrowValueMode::SignExtend32, 32) => in_reg,

        (NarrowValueMode::ZeroExtend64, n) if n < 64 => {
            if is_const {
                // Constants are zero-extended to full 64-bit width on load already.
                in_reg
            } else {
                let tmp = ctx.alloc_tmp(I32).only_reg().unwrap();
                ctx.emit(Inst::Extend {
                    rd: tmp,
                    rn: in_reg,
                    signed: false,
                    from_bits,
                    to_bits: 64,
                });
                tmp.to_reg()
            }
        }
        (NarrowValueMode::SignExtend64, n) if n < 64 => {
            let tmp = ctx.alloc_tmp(I32).only_reg().unwrap();
            ctx.emit(Inst::Extend {
                rd: tmp,
                rn: in_reg,
                signed: true,
                from_bits,
                to_bits: 64,
            });
            tmp.to_reg()
        }
        (_, 64) => in_reg,
        (_, 128) => in_reg,

        _ => panic!(
            "Unsupported input width: input ty {} bits {} mode {:?}",
            ty, from_bits, narrow_mode
        ),
    }
}

/// Lowers an instruction input to multiple regs.
fn lower_input_to_regs<C: LowerCtx<I = Inst>>(
    ctx: &mut C,
    input: InsnInput,
) -> (ValueRegs<Reg>, Type, bool) {
    log::trace!("lower_input_to_regs: input {:?}", input);
    let ty = ctx.input_ty(input.insn, input.input);
    let inputs = ctx.get_input_as_source_or_const(input.insn, input.input);
    let is_const = inputs.constant.is_some();

    let in_regs = if let Some(c) = inputs.constant {
        // Generate constants fresh at each use to minimize long-range register pressure.
        generate_constant(ctx, ty, c as u128)
    } else {
        ctx.put_input_in_regs(input.insn, input.input)
    };

    (in_regs, ty, is_const)
}

/// Lower an instruction input to a register.
///
/// The given register will be extended appropriately, according to
/// `narrow_mode` and the input's type. If extended, the value is
/// always extended to 64 bits, for simplicity.
pub(crate) fn put_input_in_reg<C: LowerCtx<I = Inst>>(
    ctx: &mut C,
    input: InsnInput,
    narrow_mode: NarrowValueMode,
) -> Reg {
    let (in_regs, ty, is_const) = lower_input_to_regs(ctx, input);
    let reg = in_regs
        .only_reg()
        .expect("Multi-register value not expected");

    extend_reg(ctx, ty, reg, is_const, narrow_mode)
}

/// Lower an instruction input to multiple regs.
pub(crate) fn put_input_in_regs<C: LowerCtx<I = Inst>>(
    ctx: &mut C,
    input: InsnInput,
) -> ValueRegs<Reg> {
    let (in_regs, _, _) = lower_input_to_regs(ctx, input);
    in_regs
}

/// Lower an instruction input to a reg or reg/shift, or reg/extend operand.
///
/// The `narrow_mode` flag indicates whether the consumer of this value needs
/// the high bits clear. For many operations, such as an add/sub/mul or any
/// bitwise logical operation, the low-bit results depend only on the low-bit
/// inputs, so e.g. we can do an 8-bit add on 32-bit registers where the 8-bit
/// value is stored in the low 8 bits of the register and the high 24 bits are
/// undefined. If the op truly needs the high N bits clear (such as for a
/// divide or a right-shift or a compare-to-zero), `narrow_mode` should be
/// set to `ZeroExtend` or `SignExtend` as appropriate, and the resulting
/// register will be provided the extended value.
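///
/// As an illustrative example (a sketch only; register numbers are
/// arbitrary), this function can fold a constant-amount `ishl` feeding its
/// input into a shifted-register operand of the consumer:
///
/// ```text
/// v1 = ishl v0, 3
/// v2 = iadd v9, v1
/// ;; can become a single instruction:
/// add x2, x9, x0, LSL #3
/// ```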
fn put_input_in_rs<C: LowerCtx<I = Inst>>(
    ctx: &mut C,
    input: InsnInput,
    narrow_mode: NarrowValueMode,
) -> ResultRS {
    let inputs = ctx.get_input_as_source_or_const(input.insn, input.input);
    if let Some((insn, 0)) = inputs.inst {
        let op = ctx.data(insn).opcode();

        if op == Opcode::Ishl {
            let shiftee = InsnInput { insn, input: 0 };
            let shift_amt = InsnInput { insn, input: 1 };

            // Can we get the shift amount as an immediate?
            if let Some(shiftimm) = input_to_shiftimm(ctx, shift_amt) {
                let shiftee_bits = ty_bits(ctx.input_ty(insn, 0));
                if shiftee_bits <= std::u8::MAX as usize {
                    let shiftimm = shiftimm.mask(shiftee_bits as u8);
                    let reg = put_input_in_reg(ctx, shiftee, narrow_mode);
                    return ResultRS::RegShift(reg, ShiftOpAndAmt::new(ShiftOp::LSL, shiftimm));
                }
            }
        }
    }

    ResultRS::Reg(put_input_in_reg(ctx, input, narrow_mode))
}

/// Lower an instruction input to a reg or reg/shift, or reg/extend operand.
/// This does not actually codegen the source instruction; it just uses the
/// vreg into which the source instruction will generate its value.
///
/// See note on `put_input_in_rs` for a description of `narrow_mode`.
fn put_input_in_rse<C: LowerCtx<I = Inst>>(
    ctx: &mut C,
    input: InsnInput,
    narrow_mode: NarrowValueMode,
) -> ResultRSE {
    let inputs = ctx.get_input_as_source_or_const(input.insn, input.input);
    if let Some((insn, 0)) = inputs.inst {
        let op = ctx.data(insn).opcode();
        let out_ty = ctx.output_ty(insn, 0);
        let out_bits = ty_bits(out_ty);

        // Is this a zero-extend or sign-extend and can we handle that with a register-mode operator?
        if op == Opcode::Uextend || op == Opcode::Sextend {
            let sign_extend = op == Opcode::Sextend;
            let inner_ty = ctx.input_ty(insn, 0);
            let inner_bits = ty_bits(inner_ty);
            assert!(inner_bits < out_bits);
            if match (sign_extend, narrow_mode) {
                // No extension is requested, so the explicit extend can simply
                // become the operand's extend op.
                (_, NarrowValueMode::None) => true,
                // Two zero-extends or two sign-extends in a row are equivalent
                // to a single zero-extend or sign-extend.
                (false, NarrowValueMode::ZeroExtend32) | (false, NarrowValueMode::ZeroExtend64) => {
                    true
                }
                (true, NarrowValueMode::SignExtend32) | (true, NarrowValueMode::SignExtend64) => {
                    true
                }
                // A zero-extend followed by a sign-extend (or vice versa) is
                // not equivalent to a single zero-extend or sign-extend.
                (false, NarrowValueMode::SignExtend32) | (false, NarrowValueMode::SignExtend64) => {
                    false
                }
                (true, NarrowValueMode::ZeroExtend32) | (true, NarrowValueMode::ZeroExtend64) => {
                    false
                }
            } {
                let extendop = match (sign_extend, inner_bits) {
                    (true, 8) => ExtendOp::SXTB,
                    (false, 8) => ExtendOp::UXTB,
                    (true, 16) => ExtendOp::SXTH,
                    (false, 16) => ExtendOp::UXTH,
                    (true, 32) => ExtendOp::SXTW,
                    (false, 32) => ExtendOp::UXTW,
                    _ => unreachable!(),
                };
                let reg =
                    put_input_in_reg(ctx, InsnInput { insn, input: 0 }, NarrowValueMode::None);
                return ResultRSE::RegExtend(reg, extendop);
            }
        }

        // If `out_ty` is narrower than the extension width requested by
        // `narrow_mode`, get the result into a register and return an
        // Extend-mode operand on that register.
        if narrow_mode != NarrowValueMode::None
            && ((narrow_mode.is_32bit() && out_bits < 32)
                || (!narrow_mode.is_32bit() && out_bits < 64))
        {
            let reg = put_input_in_reg(ctx, input, NarrowValueMode::None);
            let extendop = match (narrow_mode, out_bits) {
                (NarrowValueMode::SignExtend32, 1) | (NarrowValueMode::SignExtend64, 1) => {
                    ExtendOp::SXTB
                }
                (NarrowValueMode::ZeroExtend32, 1) | (NarrowValueMode::ZeroExtend64, 1) => {
                    ExtendOp::UXTB
                }
                (NarrowValueMode::SignExtend32, 8) | (NarrowValueMode::SignExtend64, 8) => {
                    ExtendOp::SXTB
                }
                (NarrowValueMode::ZeroExtend32, 8) | (NarrowValueMode::ZeroExtend64, 8) => {
                    ExtendOp::UXTB
                }
                (NarrowValueMode::SignExtend32, 16) | (NarrowValueMode::SignExtend64, 16) => {
                    ExtendOp::SXTH
                }
                (NarrowValueMode::ZeroExtend32, 16) | (NarrowValueMode::ZeroExtend64, 16) => {
                    ExtendOp::UXTH
                }
                (NarrowValueMode::SignExtend64, 32) => ExtendOp::SXTW,
                (NarrowValueMode::ZeroExtend64, 32) => ExtendOp::UXTW,
                _ => unreachable!(),
            };
            return ResultRSE::RegExtend(reg, extendop);
        }
    }

    ResultRSE::from_rs(put_input_in_rs(ctx, input, narrow_mode))
}

pub(crate) fn put_input_in_rse_imm12<C: LowerCtx<I = Inst>>(
    ctx: &mut C,
    input: InsnInput,
    narrow_mode: NarrowValueMode,
) -> ResultRSEImm12 {
    if let Some(imm_value) = input_to_const(ctx, input) {
        if let Some(i) = Imm12::maybe_from_u64(imm_value) {
            let out_ty_bits = ty_bits(ctx.input_ty(input.insn, input.input));
            let is_negative = (i.bits as u64) & (1 << (cmp::max(out_ty_bits, 1) - 1)) != 0;

            // This condition can happen if we matched a value that overflows the output type of
            // its `iconst` when viewed as a signed value (e.g. `iconst.i8 200`).
            // When that happens we need to lower as a negative value, which we cannot do here.
            if !(narrow_mode.is_signed() && is_negative) {
                return ResultRSEImm12::Imm12(i);
            }
        }
    }

    ResultRSEImm12::from_rse(put_input_in_rse(ctx, input, narrow_mode))
}

/// Like `put_input_in_rse_imm12` above, except it is allowed to negate the
/// argument (assuming a two's-complement representation with the given bit
/// width) if this allows use of a 12-bit immediate. Used to flip `add`s with
/// negative immediates to `sub`s (and vice-versa).
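///
/// For example (illustrative): `-7` does not fit in an unsigned 12-bit
/// immediate field, but its negation does, so an `iadd` of `-7` can be
/// emitted as `sub x1, x0, #7`. The `bool` in the returned tuple is `true`
/// when such a negation happened.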
pub(crate) fn put_input_in_rse_imm12_maybe_negated<C: LowerCtx<I = Inst>>(
    ctx: &mut C,
    input: InsnInput,
    twos_complement_bits: usize,
    narrow_mode: NarrowValueMode,
) -> (ResultRSEImm12, bool) {
    assert!(twos_complement_bits <= 64);
    if let Some(imm_value) = input_to_const(ctx, input) {
        if let Some(i) = Imm12::maybe_from_u64(imm_value) {
            return (ResultRSEImm12::Imm12(i), false);
        }
        let sign_extended =
            ((imm_value as i64) << (64 - twos_complement_bits)) >> (64 - twos_complement_bits);
        let inverted = sign_extended.wrapping_neg();
        if let Some(i) = Imm12::maybe_from_u64(inverted as u64) {
            return (ResultRSEImm12::Imm12(i), true);
        }
    }

    (
        ResultRSEImm12::from_rse(put_input_in_rse(ctx, input, narrow_mode)),
        false,
    )
}

pub(crate) fn put_input_in_rs_immlogic<C: LowerCtx<I = Inst>>(
    ctx: &mut C,
    input: InsnInput,
    narrow_mode: NarrowValueMode,
) -> ResultRSImmLogic {
    if let Some(imm_value) = input_to_const(ctx, input) {
        let ty = ctx.input_ty(input.insn, input.input);
        let ty = if ty_bits(ty) < 32 { I32 } else { ty };
        if let Some(i) = ImmLogic::maybe_from_u64(imm_value, ty) {
            return ResultRSImmLogic::ImmLogic(i);
        }
    }

    ResultRSImmLogic::from_rs(put_input_in_rs(ctx, input, narrow_mode))
}

pub(crate) fn put_input_in_reg_immshift<C: LowerCtx<I = Inst>>(
    ctx: &mut C,
    input: InsnInput,
    shift_width_bits: usize,
) -> ResultRegImmShift {
    if let Some(imm_value) = input_to_const(ctx, input) {
        let imm_value = imm_value & ((shift_width_bits - 1) as u64);
        if let Some(immshift) = ImmShift::maybe_from_u64(imm_value) {
            return ResultRegImmShift::ImmShift(immshift);
        }
    }

    ResultRegImmShift::Reg(put_input_in_reg(ctx, input, NarrowValueMode::None))
}

//============================================================================
// ALU instruction constructors.

pub(crate) fn alu_inst_imm12(op: ALUOp, rd: Writable<Reg>, rn: Reg, rm: ResultRSEImm12) -> Inst {
    match rm {
        ResultRSEImm12::Imm12(imm12) => Inst::AluRRImm12 {
            alu_op: op,
            rd,
            rn,
            imm12,
        },
        ResultRSEImm12::Reg(rm) => Inst::AluRRR {
            alu_op: op,
            rd,
            rn,
            rm,
        },
        ResultRSEImm12::RegShift(rm, shiftop) => Inst::AluRRRShift {
            alu_op: op,
            rd,
            rn,
            rm,
            shiftop,
        },
        ResultRSEImm12::RegExtend(rm, extendop) => Inst::AluRRRExtend {
            alu_op: op,
            rd,
            rn,
            rm,
            extendop,
        },
    }
}

pub(crate) fn alu_inst_immlogic(
    op: ALUOp,
    rd: Writable<Reg>,
    rn: Reg,
    rm: ResultRSImmLogic,
) -> Inst {
    match rm {
        ResultRSImmLogic::ImmLogic(imml) => Inst::AluRRImmLogic {
            alu_op: op,
            rd,
            rn,
            imml,
        },
        ResultRSImmLogic::Reg(rm) => Inst::AluRRR {
            alu_op: op,
            rd,
            rn,
            rm,
        },
        ResultRSImmLogic::RegShift(rm, shiftop) => Inst::AluRRRShift {
            alu_op: op,
            rd,
            rn,
            rm,
            shiftop,
        },
    }
}

pub(crate) fn alu_inst_immshift(
    op: ALUOp,
    rd: Writable<Reg>,
    rn: Reg,
    rm: ResultRegImmShift,
) -> Inst {
    match rm {
        ResultRegImmShift::ImmShift(immshift) => Inst::AluRRImmShift {
            alu_op: op,
            rd,
            rn,
            immshift,
        },
        ResultRegImmShift::Reg(rm) => Inst::AluRRR {
            alu_op: op,
            rd,
            rn,
            rm,
        },
    }
}
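
// Illustrative usage of the constructors above (a sketch only; `ctx`,
// `inputs`, and `outputs` stand for the usual locals in the per-opcode
// lowering code):
//
//     let rd = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
//     let rn = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None);
//     let rm = put_input_in_rse_imm12(ctx, inputs[1], NarrowValueMode::None);
//     ctx.emit(alu_inst_imm12(ALUOp::Add64, rd, rn, rm));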

//============================================================================
// Lowering: addressing mode support. Takes instruction directly, rather
// than an `InsnInput`, to do more introspection.

/// 32-bit addends that make up an address: an input, and an extension mode on that
/// input.
type AddressAddend32List = SmallVec<[(Reg, ExtendOp); 4]>;
/// 64-bit addends that make up an address: just an input.
type AddressAddend64List = SmallVec<[Reg; 4]>;

/// Collect all addends that feed into an address computation, with extend-modes
/// on each. Note that a load/store may have multiple address components (and
/// the CLIF semantics are that these components are added to form the final
/// address), but sometimes the CLIF that we receive still has arguments that
/// refer to `iadd` instructions. We also want to handle uextend/sextend below
/// the add(s).
///
/// We match any 64-bit add (and descend into its inputs), and we match any
/// 32-to-64-bit sign or zero extension. The addends are returned in two lists:
///
/// - The 64-bit list: each associated input is 64 bits wide and needs no
///   extension.
/// - The 32-bit list: each associated input is 32 bits wide and is paired with
///   the `ExtendOp` (`SXTW` or `UXTW`) describing how to extend it.
///
/// We do not descend further into the inputs of extensions (unless the extended
/// value is a constant), because supporting (e.g.) a 32-bit add that is later
/// extended would require additional masking of high-order bits, which is too
/// complex. So, in essence, we descend any number of adds from the roots,
/// collecting all 64-bit address addends; then possibly support extensions at
/// these leaves.
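///
/// For example (an illustrative sketch): for roots computing
/// `iadd(iadd(x, y), uextend.i64 z)` where `z` is 32 bits wide, this returns
/// roughly `([x, y], [(z, ExtendOp::UXTW)], 0)`, and the caller then picks an
/// addressing mode that folds as many of these addends as possible.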
fn collect_address_addends<C: LowerCtx<I = Inst>>(
    ctx: &mut C,
    roots: &[InsnInput],
) -> (AddressAddend64List, AddressAddend32List, i64) {
    let mut result32: AddressAddend32List = SmallVec::new();
    let mut result64: AddressAddend64List = SmallVec::new();
    let mut offset: i64 = 0;

    let mut workqueue: SmallVec<[InsnInput; 4]> = roots.iter().cloned().collect();

    while let Some(input) = workqueue.pop() {
        debug_assert!(ty_bits(ctx.input_ty(input.insn, input.input)) == 64);
        if let Some((op, insn)) = maybe_input_insn_multi(
            ctx,
            input,
            &[
                Opcode::Uextend,
                Opcode::Sextend,
                Opcode::Iadd,
                Opcode::Iconst,
            ],
        ) {
            match op {
                Opcode::Uextend | Opcode::Sextend if ty_bits(ctx.input_ty(insn, 0)) == 32 => {
                    let extendop = if op == Opcode::Uextend {
                        ExtendOp::UXTW
                    } else {
                        ExtendOp::SXTW
                    };
                    let extendee_input = InsnInput { insn, input: 0 };
                    // If the input is a zero-extension of a constant, add the value to the known
                    // offset.
                    // Only do this for zero-extension, as generating a sign-extended
                    // constant may take more instructions than using the 'SXTW' addressing mode.
                    if let (Some(insn), ExtendOp::UXTW) = (
                        maybe_input_insn(ctx, extendee_input, Opcode::Iconst),
                        extendop,
                    ) {
                        let value = (ctx.get_constant(insn).unwrap() & 0xFFFF_FFFF_u64) as i64;
                        offset += value;
                    } else {
                        let reg = put_input_in_reg(ctx, extendee_input, NarrowValueMode::None);
                        result32.push((reg, extendop));
                    }
                }
                Opcode::Uextend | Opcode::Sextend => {
                    let reg = put_input_in_reg(ctx, input, NarrowValueMode::None);
                    result64.push(reg);
                }
                Opcode::Iadd => {
                    for input in 0..ctx.num_inputs(insn) {
                        let addend = InsnInput { insn, input };
                        workqueue.push(addend);
                    }
                }
                Opcode::Iconst => {
                    let value: i64 = ctx.get_constant(insn).unwrap() as i64;
                    offset += value;
                }
                _ => panic!("Unexpected opcode from maybe_input_insn_multi"),
            }
        } else {
            let reg = put_input_in_reg(ctx, input, NarrowValueMode::ZeroExtend64);
            result64.push(reg);
        }
    }

    (result64, result32, offset)
}

/// Lower the address of a pair load or store.
pub(crate) fn lower_pair_address<C: LowerCtx<I = Inst>>(
    ctx: &mut C,
    roots: &[InsnInput],
    offset: i32,
) -> PairAMode {
    // Collect addends through an arbitrary tree of 32-to-64-bit sign/zero
    // extends and addition ops. We update these as we consume address
    // components, so they represent the remaining addends not yet handled.
    let (mut addends64, mut addends32, args_offset) = collect_address_addends(ctx, roots);
    let offset = args_offset + (offset as i64);

    log::trace!(
        "lower_pair_address: addends64 {:?}, addends32 {:?}, offset {}",
        addends64,
        addends32,
        offset
    );

    // Pairs basically only have reg + imm formats, so those are the only ones
    // we have to worry about.
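    //
    // For reference (I64 pairs; an illustrative sketch of the encoding
    // constraint): the signed, scaled 7-bit immediate covers offsets in
    // [-512, 504] in steps of 8, e.g.:
    //
    //     stp x0, x1, [x2, #504]    // representable
    //     stp x0, x1, [x2, #4]      // not a multiple of 8: falls back to
    //                               // materializing the address below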

    let base_reg = if let Some(reg64) = addends64.pop() {
        reg64
    } else if let Some((reg32, extendop)) = addends32.pop() {
        let tmp = ctx.alloc_tmp(I64).only_reg().unwrap();
        let signed = match extendop {
            ExtendOp::SXTW => true,
            ExtendOp::UXTW => false,
            _ => unreachable!(),
        };
        ctx.emit(Inst::Extend {
            rd: tmp,
            rn: reg32,
            signed,
            from_bits: 32,
            to_bits: 64,
        });
        tmp.to_reg()
    } else {
        zero_reg()
    };

    let addr = ctx.alloc_tmp(I64).only_reg().unwrap();
    ctx.emit(Inst::gen_move(addr, base_reg, I64));

    // We have the base register; if there are any other addends, add them in.
    lower_add_addends(ctx, addr, addends64, addends32);

    // Figure out what offset we should emit.
    let imm7 = SImm7Scaled::maybe_from_i64(offset, I64).unwrap_or_else(|| {
        lower_add_immediate(ctx, addr, addr.to_reg(), offset);
        SImm7Scaled::maybe_from_i64(0, I64).unwrap()
    });

    PairAMode::SignedOffset(addr.to_reg(), imm7)
}

/// Lower the address of a load or store.
pub(crate) fn lower_address<C: LowerCtx<I = Inst>>(
    ctx: &mut C,
    elem_ty: Type,
    roots: &[InsnInput],
    offset: i32,
) -> AMode {
    // TODO: support base_reg + scale * index_reg. For this, we would need to pattern-match shl or
    // mul instructions (Load/StoreComplex don't include scale factors).

    // Collect addends through an arbitrary tree of 32-to-64-bit sign/zero
    // extends and addition ops. We update these as we consume address
    // components, so they represent the remaining addends not yet handled.
    let (mut addends64, mut addends32, args_offset) = collect_address_addends(ctx, roots);
    let mut offset = args_offset + (offset as i64);

    log::trace!(
        "lower_address: addends64 {:?}, addends32 {:?}, offset {}",
        addends64,
        addends32,
        offset
    );

    // First, decide what the `AMode` will be. Take one extendee and one 64-bit
    // reg, or two 64-bit regs, or a 64-bit reg and a 32-bit reg with extension,
    // or some other combination as appropriate.
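    //
    // Illustrative examples of the choices below (register numbers are
    // arbitrary):
    //
    //     one 64-bit reg + one 32-bit reg  -> ldr .., [x0, w1, UXTW]
    //     one 64-bit reg + offset 16       -> ldr .., [x0, #16]
    //     two 64-bit regs                  -> ldr .., [x0, x1]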
    let memarg = if addends64.len() > 0 {
        if addends32.len() > 0 {
            let (reg32, extendop) = addends32.pop().unwrap();
            let reg64 = addends64.pop().unwrap();
            AMode::RegExtended(reg64, reg32, extendop)
        } else if offset > 0 && offset < 0x1000 {
            let reg64 = addends64.pop().unwrap();
            let off = offset;
            offset = 0;
            AMode::RegOffset(reg64, off, elem_ty)
        } else if addends64.len() >= 2 {
            let reg1 = addends64.pop().unwrap();
            let reg2 = addends64.pop().unwrap();
            AMode::RegReg(reg1, reg2)
        } else {
            let reg1 = addends64.pop().unwrap();
            AMode::reg(reg1)
        }
    } else
    /* addends64.len() == 0 */
    {
        if addends32.len() > 0 {
            let tmp = ctx.alloc_tmp(I64).only_reg().unwrap();
            let (reg1, extendop) = addends32.pop().unwrap();
            let signed = match extendop {
                ExtendOp::SXTW => true,
                ExtendOp::UXTW => false,
                _ => unreachable!(),
            };
            ctx.emit(Inst::Extend {
                rd: tmp,
                rn: reg1,
                signed,
                from_bits: 32,
                to_bits: 64,
            });
            if let Some((reg2, extendop)) = addends32.pop() {
                AMode::RegExtended(tmp.to_reg(), reg2, extendop)
            } else {
                AMode::reg(tmp.to_reg())
            }
        } else
        /* addends32.len() == 0 */
        {
            let off_reg = ctx.alloc_tmp(I64).only_reg().unwrap();
            lower_constant_u64(ctx, off_reg, offset as u64);
            offset = 0;
            AMode::reg(off_reg.to_reg())
        }
    };

    // At this point, if we have any remaining components, we need to allocate a
    // temp, replace one of the registers in the AMode with the temp, and emit
    // instructions to add together the remaining components. Return immediately
    // if this is *not* the case.
    if offset == 0 && addends32.len() == 0 && addends64.len() == 0 {
        return memarg;
    }

    // Allocate the temp and shoehorn it into the AMode.
    let addr = ctx.alloc_tmp(I64).only_reg().unwrap();
    let (reg, memarg) = match memarg {
        AMode::RegExtended(r1, r2, extendop) => {
            (r1, AMode::RegExtended(addr.to_reg(), r2, extendop))
        }
        AMode::RegOffset(r, off, ty) => (r, AMode::RegOffset(addr.to_reg(), off, ty)),
        AMode::RegReg(r1, r2) => (r2, AMode::RegReg(addr.to_reg(), r1)),
        AMode::UnsignedOffset(r, imm) => (r, AMode::UnsignedOffset(addr.to_reg(), imm)),
        _ => unreachable!(),
    };

    // If there is any offset, load that first into `addr`, and add the `reg`
    // that we kicked out of the `AMode`; otherwise, start with that reg.
    if offset != 0 {
        lower_add_immediate(ctx, addr, reg, offset)
    } else {
        ctx.emit(Inst::gen_move(addr, reg, I64));
    }

    // Now handle reg64 and reg32-extended components.
    lower_add_addends(ctx, addr, addends64, addends32);

    memarg
}

fn lower_add_addends<C: LowerCtx<I = Inst>>(
    ctx: &mut C,
    rd: Writable<Reg>,
    addends64: AddressAddend64List,
    addends32: AddressAddend32List,
) {
    for reg in addends64 {
        // If the register is the stack reg, we must move it to another reg
        // before adding it.
        let reg = if reg == stack_reg() {
            let tmp = ctx.alloc_tmp(I64).only_reg().unwrap();
            ctx.emit(Inst::gen_move(tmp, stack_reg(), I64));
            tmp.to_reg()
        } else {
            reg
        };
        ctx.emit(Inst::AluRRR {
            alu_op: ALUOp::Add64,
            rd,
            rn: rd.to_reg(),
            rm: reg,
        });
    }
    for (reg, extendop) in addends32 {
        assert!(reg != stack_reg());
        ctx.emit(Inst::AluRRRExtend {
            alu_op: ALUOp::Add64,
            rd,
            rn: rd.to_reg(),
            rm: reg,
            extendop,
        });
    }
}

/// Adds the signed immediate `imm` to `src`, placing the result in `dst` and
/// choosing the best instruction pattern for the immediate.
// TODO: This function is duplicated in ctx.gen_add_imm
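// Illustrative lowerings (a sketch; the third case materializes the
// constant first):
//
//     imm = 16      -> add dst, src, #16
//     imm = -16     -> sub dst, src, #16
//     imm = 0x12345 -> movz/movk sequence into dst, then add dst, dst, src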
fn lower_add_immediate<C: LowerCtx<I = Inst>>(ctx: &mut C, dst: Writable<Reg>, src: Reg, imm: i64) {
    // If we can fit `imm` or `-imm` in an imm12, use an add/sub-immediate;
    // otherwise, lower the constant first, then add.
    if let Some(imm12) = Imm12::maybe_from_u64(imm as u64) {
        ctx.emit(Inst::AluRRImm12 {
            alu_op: ALUOp::Add64,
            rd: dst,
            rn: src,
            imm12,
        });
    } else if let Some(imm12) = Imm12::maybe_from_u64(imm.wrapping_neg() as u64) {
        ctx.emit(Inst::AluRRImm12 {
            alu_op: ALUOp::Sub64,
            rd: dst,
            rn: src,
            imm12,
        });
    } else {
        lower_constant_u64(ctx, dst, imm as u64);
        ctx.emit(Inst::AluRRR {
            alu_op: ALUOp::Add64,
            rd: dst,
            rn: dst.to_reg(),
            rm: src,
        });
    }
}

pub(crate) fn lower_constant_u64<C: LowerCtx<I = Inst>>(
    ctx: &mut C,
    rd: Writable<Reg>,
    value: u64,
) {
    for inst in Inst::load_constant(rd, value) {
        ctx.emit(inst);
    }
}

pub(crate) fn lower_constant_f32<C: LowerCtx<I = Inst>>(
    ctx: &mut C,
    rd: Writable<Reg>,
    value: f32,
) {
    let alloc_tmp = |ty| ctx.alloc_tmp(ty).only_reg().unwrap();

    for inst in Inst::load_fp_constant32(rd, value.to_bits(), alloc_tmp) {
        ctx.emit(inst);
    }
}

pub(crate) fn lower_constant_f64<C: LowerCtx<I = Inst>>(
    ctx: &mut C,
    rd: Writable<Reg>,
    value: f64,
) {
    let alloc_tmp = |ty| ctx.alloc_tmp(ty).only_reg().unwrap();

    for inst in Inst::load_fp_constant64(rd, value.to_bits(), alloc_tmp) {
        ctx.emit(inst);
    }
}

pub(crate) fn lower_constant_f128<C: LowerCtx<I = Inst>>(
    ctx: &mut C,
    rd: Writable<Reg>,
    value: u128,
) {
    if value == 0 {
        // Fast-track a common case. The general case, namely calling
        // `Inst::load_fp_constant128`, is potentially expensive.
        ctx.emit(Inst::VecDupImm {
            rd,
            imm: ASIMDMovModImm::zero(ScalarSize::Size8),
            invert: false,
            size: VectorSize::Size8x16,
        });
    } else {
        let alloc_tmp = |ty| ctx.alloc_tmp(ty).only_reg().unwrap();
        for inst in Inst::load_fp_constant128(rd, value, alloc_tmp) {
            ctx.emit(inst);
        }
    }
}

pub(crate) fn lower_splat_const<C: LowerCtx<I = Inst>>(
    ctx: &mut C,
    rd: Writable<Reg>,
    value: u64,
    size: VectorSize,
) {
    let (value, narrow_size) = match size.lane_size() {
        ScalarSize::Size8 => (value as u8 as u64, ScalarSize::Size128),
        ScalarSize::Size16 => (value as u16 as u64, ScalarSize::Size8),
        ScalarSize::Size32 => (value as u32 as u64, ScalarSize::Size16),
        ScalarSize::Size64 => (value, ScalarSize::Size32),
        _ => unreachable!(),
    };
    let (value, size) = match Inst::get_replicated_vector_pattern(value as u128, narrow_size) {
        Some((value, lane_size)) => (
            value,
            VectorSize::from_lane_size(lane_size, size.is_128bits()),
        ),
        None => (value, size),
    };
    let alloc_tmp = |ty| ctx.alloc_tmp(ty).only_reg().unwrap();

    for inst in Inst::load_replicated_vector_pattern(rd, value, size, alloc_tmp) {
        ctx.emit(inst);
    }
}

pub(crate) fn lower_condcode(cc: IntCC) -> Cond {
    match cc {
        IntCC::Equal => Cond::Eq,
        IntCC::NotEqual => Cond::Ne,
        IntCC::SignedGreaterThanOrEqual => Cond::Ge,
        IntCC::SignedGreaterThan => Cond::Gt,
        IntCC::SignedLessThanOrEqual => Cond::Le,
        IntCC::SignedLessThan => Cond::Lt,
        IntCC::UnsignedGreaterThanOrEqual => Cond::Hs,
        IntCC::UnsignedGreaterThan => Cond::Hi,
        IntCC::UnsignedLessThanOrEqual => Cond::Ls,
        IntCC::UnsignedLessThan => Cond::Lo,
        IntCC::Overflow => Cond::Vs,
        IntCC::NotOverflow => Cond::Vc,
    }
}

pub(crate) fn lower_fp_condcode(cc: FloatCC) -> Cond {
    // Refer to `codegen/shared/src/condcodes.rs` and to the `FCMP` AArch64 docs.
    // The FCMP instruction sets:
    //                 NZCV
    // - PSTATE.NZCV = 0011 on UN (unordered),
    //                 0110 on EQ,
    //                 1000 on LT,
    //                 0010 on GT.
    match cc {
        // EQ | LT | GT. Vc => V clear.
        FloatCC::Ordered => Cond::Vc,
        // UN. Vs => V set.
        FloatCC::Unordered => Cond::Vs,
        // EQ. Eq => Z set.
        FloatCC::Equal => Cond::Eq,
        // UN | LT | GT. Ne => Z clear.
        FloatCC::NotEqual => Cond::Ne,
        // LT | GT.
        FloatCC::OrderedNotEqual => unimplemented!(),
        // UN | EQ
        FloatCC::UnorderedOrEqual => unimplemented!(),
        // LT. Mi => N set.
        FloatCC::LessThan => Cond::Mi,
        // LT | EQ. Ls => C clear or Z set.
        FloatCC::LessThanOrEqual => Cond::Ls,
        // GT. Gt => Z clear, N = V.
        FloatCC::GreaterThan => Cond::Gt,
        // GT | EQ. Ge => N = V.
        FloatCC::GreaterThanOrEqual => Cond::Ge,
        // UN | LT
        FloatCC::UnorderedOrLessThan => unimplemented!(),
        // UN | LT | EQ
        FloatCC::UnorderedOrLessThanOrEqual => unimplemented!(),
        // UN | GT
        FloatCC::UnorderedOrGreaterThan => unimplemented!(),
        // UN | GT | EQ
        FloatCC::UnorderedOrGreaterThanOrEqual => unimplemented!(),
    }
}

pub(crate) fn lower_vector_compare<C: LowerCtx<I = Inst>>(
    ctx: &mut C,
    rd: Writable<Reg>,
    mut rn: Reg,
    mut rm: Reg,
    ty: Type,
    cond: Cond,
) -> CodegenResult<()> {
    let is_float = match ty {
        F32X4 | F64X2 => true,
        _ => false,
    };
    let size = VectorSize::from_ty(ty);
    // 'Less than' operations are implemented by swapping the order of
    // operands and using the 'greater than' instructions.
    // 'Not equal' is implemented with 'equal' and inverting the result.
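    //
    // For example (illustrative, 32x4 lanes): `a < b` swaps the operands and
    // uses 'greater than':
    //
    //     cmgt v0.4s, v_b.4s, v_a.4s
    //
    // while `a != b` uses 'equal' plus an inversion:
    //
    //     cmeq v0.4s, v_a.4s, v_b.4s
    //     mvn  v0.16b, v0.16b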
    let (alu_op, swap) = match (is_float, cond) {
        (false, Cond::Eq) => (VecALUOp::Cmeq, false),
        (false, Cond::Ne) => (VecALUOp::Cmeq, false),
        (false, Cond::Ge) => (VecALUOp::Cmge, false),
        (false, Cond::Gt) => (VecALUOp::Cmgt, false),
        (false, Cond::Le) => (VecALUOp::Cmge, true),
        (false, Cond::Lt) => (VecALUOp::Cmgt, true),
        (false, Cond::Hs) => (VecALUOp::Cmhs, false),
        (false, Cond::Hi) => (VecALUOp::Cmhi, false),
        (false, Cond::Ls) => (VecALUOp::Cmhs, true),
        (false, Cond::Lo) => (VecALUOp::Cmhi, true),
        (true, Cond::Eq) => (VecALUOp::Fcmeq, false),
        (true, Cond::Ne) => (VecALUOp::Fcmeq, false),
        (true, Cond::Mi) => (VecALUOp::Fcmgt, true),
        (true, Cond::Ls) => (VecALUOp::Fcmge, true),
        (true, Cond::Ge) => (VecALUOp::Fcmge, false),
        (true, Cond::Gt) => (VecALUOp::Fcmgt, false),
        _ => unreachable!(),
    };

    if swap {
        std::mem::swap(&mut rn, &mut rm);
    }

    ctx.emit(Inst::VecRRR {
        alu_op,
        rd,
        rn,
        rm,
        size,
    });

    if cond == Cond::Ne {
        ctx.emit(Inst::VecMisc {
            op: VecMisc2::Not,
            rd,
            rn: rd.to_reg(),
            size,
        });
    }

    Ok(())
}

/// Determines whether this condcode interprets inputs as signed or unsigned. See the
/// documentation for the `icmp` instruction in cranelift-codegen/meta/src/shared/instructions.rs
/// for further insights into this.
pub(crate) fn condcode_is_signed(cc: IntCC) -> bool {
    match cc {
        IntCC::Equal
        | IntCC::UnsignedGreaterThanOrEqual
        | IntCC::UnsignedGreaterThan
        | IntCC::UnsignedLessThanOrEqual
        | IntCC::UnsignedLessThan
        | IntCC::NotEqual => false,
        IntCC::SignedGreaterThanOrEqual
        | IntCC::SignedGreaterThan
        | IntCC::SignedLessThanOrEqual
        | IntCC::SignedLessThan
        | IntCC::Overflow
        | IntCC::NotOverflow => true,
    }
}

//=============================================================================
// Helpers for instruction lowering.

pub(crate) fn choose_32_64<T: Copy>(ty: Type, op32: T, op64: T) -> T {
    let bits = ty_bits(ty);
    if bits <= 32 {
        op32
    } else if bits == 64 {
        op64
    } else {
        panic!("choose_32_64 on > 64 bits!")
    }
}

/// Checks for an instance of `op` feeding the given input.
pub(crate) fn maybe_input_insn<C: LowerCtx<I = Inst>>(
    c: &mut C,
    input: InsnInput,
    op: Opcode,
) -> Option<IRInst> {
    let inputs = c.get_input_as_source_or_const(input.insn, input.input);
    log::trace!(
        "maybe_input_insn: input {:?} has options {:?}; looking for op {:?}",
        input,
        inputs,
        op
    );
    if let Some((src_inst, _)) = inputs.inst {
        let data = c.data(src_inst);
        log::trace!(" -> input inst {:?}", data);
        if data.opcode() == op {
            return Some(src_inst);
        }
    }
    None
}

/// Checks for an instance of any one of `ops` feeding the given input.
pub(crate) fn maybe_input_insn_multi<C: LowerCtx<I = Inst>>(
    c: &mut C,
    input: InsnInput,
    ops: &[Opcode],
) -> Option<(Opcode, IRInst)> {
    for &op in ops {
        if let Some(inst) = maybe_input_insn(c, input, op) {
            return Some((op, inst));
        }
    }
    None
}

/// Checks for an instance of `op` feeding the given input, possibly via a conversion `conv` (e.g.,
/// Bint or a bitcast).
///
/// FIXME cfallin 2020-03-30: this is really ugly. Factor out tree-matching stuff and make it
/// a bit more generic.
pub(crate) fn maybe_input_insn_via_conv<C: LowerCtx<I = Inst>>(
    c: &mut C,
    input: InsnInput,
    op: Opcode,
    conv: Opcode,
) -> Option<IRInst> {
    let inputs = c.get_input_as_source_or_const(input.insn, input.input);
    if let Some((src_inst, _)) = inputs.inst {
        let data = c.data(src_inst);
        if data.opcode() == op {
            return Some(src_inst);
        }
        if data.opcode() == conv {
            let inputs = c.get_input_as_source_or_const(src_inst, 0);
            if let Some((src_inst, _)) = inputs.inst {
                let data = c.data(src_inst);
                if data.opcode() == op {
                    return Some(src_inst);
                }
            }
        }
    }
    None
}

/// Pattern-match an extending vector multiplication.
/// Returns a tuple of the opcode to use, the two input registers, and whether
/// it's the 'high half' version of the instruction.
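///
/// For example (illustrative CLIF): `imul(swiden_low(x), swiden_low(y))`
/// producing an `i32x4` matches as `(VecRRRLongOp::Smull16, x, y, false)`,
/// which can then be emitted as `smull v_dst.4s, v_x.4h, v_y.4h`.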
pub(crate) fn match_vec_long_mul<C: LowerCtx<I = Inst>>(
    c: &mut C,
    insn: IRInst,
    ext_op: Opcode,
) -> Option<(VecRRRLongOp, regalloc::Reg, regalloc::Reg, bool)> {
    let inputs = insn_inputs(c, insn);
    if let Some(lhs) = maybe_input_insn(c, inputs[0], ext_op) {
        if let Some(rhs) = maybe_input_insn(c, inputs[1], ext_op) {
            let lhs_input = insn_inputs(c, lhs)[0];
            let rhs_input = insn_inputs(c, rhs)[0];
            let rn = put_input_in_reg(c, lhs_input, NarrowValueMode::None);
            let rm = put_input_in_reg(c, rhs_input, NarrowValueMode::None);
            let lane_type = c.output_ty(insn, 0).lane_type();
            match (lane_type, ext_op) {
                (I16, Opcode::SwidenLow) => return Some((VecRRRLongOp::Smull8, rn, rm, false)),
                (I16, Opcode::SwidenHigh) => return Some((VecRRRLongOp::Smull8, rn, rm, true)),
                (I16, Opcode::UwidenLow) => return Some((VecRRRLongOp::Umull8, rn, rm, false)),
                (I16, Opcode::UwidenHigh) => return Some((VecRRRLongOp::Umull8, rn, rm, true)),
                (I32, Opcode::SwidenLow) => return Some((VecRRRLongOp::Smull16, rn, rm, false)),
                (I32, Opcode::SwidenHigh) => return Some((VecRRRLongOp::Smull16, rn, rm, true)),
                (I32, Opcode::UwidenLow) => return Some((VecRRRLongOp::Umull16, rn, rm, false)),
                (I32, Opcode::UwidenHigh) => return Some((VecRRRLongOp::Umull16, rn, rm, true)),
                (I64, Opcode::SwidenLow) => return Some((VecRRRLongOp::Smull32, rn, rm, false)),
                (I64, Opcode::SwidenHigh) => return Some((VecRRRLongOp::Smull32, rn, rm, true)),
                (I64, Opcode::UwidenLow) => return Some((VecRRRLongOp::Umull32, rn, rm, false)),
                (I64, Opcode::UwidenHigh) => return Some((VecRRRLongOp::Umull32, rn, rm, true)),
                _ => {}
            };
        }
    }
    None
}

pub(crate) fn lower_i64x2_mul<C: LowerCtx<I = Inst>>(c: &mut C, insn: IRInst) {
    let inputs = insn_inputs(c, insn);
    let outputs = insn_outputs(c, insn);
    let rd = get_output_reg(c, outputs[0]).regs()[0];
    let rn = put_input_in_regs(c, inputs[0]).regs()[0];
    let rm = put_input_in_regs(c, inputs[1]).regs()[0];

    let tmp1 = c.alloc_tmp(I64X2).only_reg().unwrap();
    let tmp2 = c.alloc_tmp(I64X2).only_reg().unwrap();

    // This I64X2 multiplication is performed with several 32-bit
    // operations.

    // 64-bit numbers x and y can be represented as:
    //   x = a + 2^32(b)
    //   y = c + 2^32(d)

    // A 64-bit multiplication is:
    //   x * y = ac + 2^32(ad + bc) + 2^64(bd)
    // Note: the `2^64(bd)` term can be dropped: it lies entirely outside the
    // low 64 bits of the result.

    // This sequence implements an I64X2 multiply, where the registers
    // `rn` and `rm` are split up into 32-bit components:
    //   rn = |d|c|b|a|
    //   rm = |h|g|f|e|
    //
    //   rn * rm = |cg + 2^32(ch + dg)|ae + 2^32(af + be)|
    //
    // The sequence is:
    //   rev64 rd.4s, rm.4s
    //   mul rd.4s, rd.4s, rn.4s
    //   xtn tmp1.2s, rn.2d
    //   addp rd.4s, rd.4s, rd.4s
    //   xtn tmp2.2s, rm.2d
    //   shll rd.2d, rd.2s, #32
    //   umlal rd.2d, tmp2.2s, tmp1.2s

    // Reverse the 32-bit elements in the 64-bit words.
    //   rd = |g|h|e|f|
    c.emit(Inst::VecMisc {
        op: VecMisc2::Rev64,
        rd,
        rn: rm,
        size: VectorSize::Size32x4,
    });

    // Calculate the high-half components.
    //   rd = |dg|ch|be|af|
    //
    // Note that this 32-bit multiply of the high half
    // discards the bits that would overflow, same as
    // if 64-bit operations were used. Also the Shll
    // below would shift out the overflow bits anyway.
    c.emit(Inst::VecRRR {
        alu_op: VecALUOp::Mul,
        rd,
        rn: rd.to_reg(),
        rm: rn,
        size: VectorSize::Size32x4,
    });

    // Extract the low-half components of rn.
    //   tmp1 = |c|a|
    c.emit(Inst::VecRRNarrow {
        op: VecRRNarrowOp::Xtn64,
        rd: tmp1,
        rn,
        high_half: false,
    });

    // Sum the respective high-half components.
    //   rd = |dg+ch|be+af||dg+ch|be+af|
    c.emit(Inst::VecRRR {
        alu_op: VecALUOp::Addp,
        rd,
        rn: rd.to_reg(),
        rm: rd.to_reg(),
        size: VectorSize::Size32x4,
    });

    // Extract the low-half components of rm.
    //   tmp2 = |g|e|
    c.emit(Inst::VecRRNarrow {
        op: VecRRNarrowOp::Xtn64,
        rd: tmp2,
        rn: rm,
        high_half: false,
    });

    // Shift the high-half components into the high half.
    //   rd = |dg+ch << 32|be+af << 32|
    c.emit(Inst::VecRRLong {
        op: VecRRLongOp::Shll32,
        rd,
        rn: rd.to_reg(),
        high_half: false,
    });

    // Multiply the low components together, and accumulate with the high
    // half.
    //   rd = |rd[1] + cg|rd[0] + ae|
    c.emit(Inst::VecRRRLong {
        alu_op: VecRRRLongOp::Umlal32,
        rd,
        rn: tmp2.to_reg(),
        rm: tmp1.to_reg(),
        high_half: false,
    });
}

/// Specifies what [lower_icmp] should do when lowering.
#[derive(Debug, Clone, PartialEq)]
pub(crate) enum IcmpOutput {
    /// Lowers the comparison into a cond code, discarding the results. The cond code emitted can
    /// be checked in the resulting [IcmpResult].
    CondCode,
    /// Materializes the results into a register. This may overwrite any flags previously set.
    Register(Writable<Reg>),
}

impl IcmpOutput {
    pub fn reg(&self) -> Option<Writable<Reg>> {
        match self {
            IcmpOutput::CondCode => None,
            IcmpOutput::Register(reg) => Some(*reg),
        }
    }
}

/// The output of an Icmp lowering.
#[derive(Debug, Clone, PartialEq)]
pub(crate) enum IcmpResult {
    /// The result was output into the given [Cond]. Callers may perform operations using this
    /// [Cond] and its inverse; other [Cond]s are not guaranteed to be correct.
    CondCode(Cond),
    /// The result was materialized into the output register.
    Register,
}

impl IcmpResult {
    pub fn unwrap_cond(&self) -> Cond {
        match self {
            IcmpResult::CondCode(c) => *c,
            _ => panic!("Unwrapped cond, but IcmpResult was {:?}", self),
        }
    }
}

/// Lower an icmp comparison.
///
/// We can lower into the status flags, or materialize the result into a
/// register; this is controlled by the `output` parameter.
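///
/// A sketch of caller usage (illustrative; the real call sites live in the
/// per-opcode lowering code):
///
/// ```text
/// // Branch on the flags without materializing a bool:
/// let cond = lower_icmp(ctx, insn, condcode, IcmpOutput::CondCode)?.unwrap_cond();
/// // ...emit a conditional branch or csel using `cond`...
/// ```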
lower_icmp<C: LowerCtx<I = Inst>>( ctx: &mut C, insn: IRInst, condcode: IntCC, output: IcmpOutput, ) -> CodegenResult<IcmpResult>1445 pub(crate) fn lower_icmp<C: LowerCtx<I = Inst>>(
1446 ctx: &mut C,
1447 insn: IRInst,
1448 condcode: IntCC,
1449 output: IcmpOutput,
1450 ) -> CodegenResult<IcmpResult> {
1451 log::trace!(
1452 "lower_icmp: insn {}, condcode: {}, output: {:?}",
1453 insn,
1454 condcode,
1455 output
1456 );
1457
1458 let rd = output.reg().unwrap_or(writable_zero_reg());
1459 let inputs = insn_inputs(ctx, insn);
1460 let cond = lower_condcode(condcode);
1461 let is_signed = condcode_is_signed(condcode);
1462 let ty = ctx.input_ty(insn, 0);
1463 let bits = ty_bits(ty);
1464 let narrow_mode = match (bits <= 32, is_signed) {
1465 (true, true) => NarrowValueMode::SignExtend32,
1466 (true, false) => NarrowValueMode::ZeroExtend32,
1467 (false, true) => NarrowValueMode::SignExtend64,
1468 (false, false) => NarrowValueMode::ZeroExtend64,
1469 };
1470 let mut should_materialize = output.reg().is_some();
1471
1472 let out_condcode = if ty == I128 {
1473 let lhs = put_input_in_regs(ctx, inputs[0]);
1474 let rhs = put_input_in_regs(ctx, inputs[1]);
1475
1476 let tmp1 = ctx.alloc_tmp(I64).only_reg().unwrap();
1477 let tmp2 = ctx.alloc_tmp(I64).only_reg().unwrap();
1478
1479 match condcode {
1480 IntCC::Equal | IntCC::NotEqual => {
1481 // eor tmp1, lhs_lo, rhs_lo
1482 // eor tmp2, lhs_hi, rhs_hi
1483 // adds xzr, tmp1, tmp2
1484 // cset dst, {eq, ne}
1485
1486 ctx.emit(Inst::AluRRR {
1487 alu_op: ALUOp::Eor64,
1488 rd: tmp1,
1489 rn: lhs.regs()[0],
1490 rm: rhs.regs()[0],
1491 });
1492 ctx.emit(Inst::AluRRR {
1493 alu_op: ALUOp::Eor64,
1494 rd: tmp2,
1495 rn: lhs.regs()[1],
1496 rm: rhs.regs()[1],
1497 });
1498 ctx.emit(Inst::AluRRR {
1499 alu_op: ALUOp::AddS64,
1500 rd: writable_zero_reg(),
1501 rn: tmp1.to_reg(),
1502 rm: tmp2.to_reg(),
1503 });
1504 }
1505 IntCC::Overflow | IntCC::NotOverflow => {
1506 // We can do an 128bit add while throwing away the results
1507 // and check the overflow flags at the end.
1508 //
1509 // adds xzr, lhs_lo, rhs_lo
1510 // adcs xzr, lhs_hi, rhs_hi
1511 // cset dst, {vs, vc}
1512
1513 ctx.emit(Inst::AluRRR {
1514 alu_op: ALUOp::AddS64,
1515 rd: writable_zero_reg(),
1516 rn: lhs.regs()[0],
1517 rm: rhs.regs()[0],
1518 });
1519 ctx.emit(Inst::AluRRR {
1520 alu_op: ALUOp::AdcS64,
1521 rd: writable_zero_reg(),
1522 rn: lhs.regs()[1],
1523 rm: rhs.regs()[1],
1524 });
1525 }
1526 _ => {
1527 // cmp lhs_lo, rhs_lo
1528 // cset tmp1, unsigned_cond
1529 // cmp lhs_hi, rhs_hi
1530 // cset tmp2, cond
1531 // csel dst, tmp1, tmp2, eq
1532
1533 let rd = output.reg().unwrap_or(tmp1);
1534 let unsigned_cond = lower_condcode(condcode.unsigned());
1535
1536 ctx.emit(Inst::AluRRR {
1537 alu_op: ALUOp::SubS64,
1538 rd: writable_zero_reg(),
1539 rn: lhs.regs()[0],
1540 rm: rhs.regs()[0],
1541 });
1542 materialize_bool_result(ctx, insn, tmp1, unsigned_cond);
1543 ctx.emit(Inst::AluRRR {
1544 alu_op: ALUOp::SubS64,
1545 rd: writable_zero_reg(),
1546 rn: lhs.regs()[1],
1547 rm: rhs.regs()[1],
1548 });
1549 materialize_bool_result(ctx, insn, tmp2, cond);
1550 ctx.emit(Inst::CSel {
1551 cond: Cond::Eq,
1552 rd,
1553 rn: tmp1.to_reg(),
1554 rm: tmp2.to_reg(),
1555 });
1556
1557 if output == IcmpOutput::CondCode {
1558 // We only need to guarantee that the flags for `cond` are correct, so we can
1559 // compare rd with 0 or 1
1560
1561 // If we are doing compare or equal, we want to compare with 1 instead of zero
1562 if condcode.without_equal() != condcode {
1563 lower_constant_u64(ctx, tmp2, 1);
1564 }
1565
1566 let xzr = zero_reg();
1567 let rd = rd.to_reg();
1568 let tmp2 = tmp2.to_reg();
1569 let (rn, rm) = match condcode {
1570 IntCC::SignedGreaterThanOrEqual => (rd, tmp2),
1571 IntCC::UnsignedGreaterThanOrEqual => (rd, tmp2),
1572 IntCC::SignedLessThanOrEqual => (tmp2, rd),
1573 IntCC::UnsignedLessThanOrEqual => (tmp2, rd),
1574 IntCC::SignedGreaterThan => (rd, xzr),
1575 IntCC::UnsignedGreaterThan => (rd, xzr),
1576 IntCC::SignedLessThan => (xzr, rd),
1577 IntCC::UnsignedLessThan => (xzr, rd),
1578 _ => unreachable!(),
1579 };
1580
1581 ctx.emit(Inst::AluRRR {
1582 alu_op: ALUOp::SubS64,
1583 rd: writable_zero_reg(),
1584 rn,
1585 rm,
1586 });
1587 }
1588
1589 // Prevent a second materialize_bool_result to be emitted at the end of the function
1590 should_materialize = false;
1591 }
1592 }
1593 cond
1594 } else if ty.is_vector() {
1595 assert_ne!(output, IcmpOutput::CondCode);
1596 should_materialize = false;
1597
1598 let rn = put_input_in_reg(ctx, inputs[0], narrow_mode);
1599 let rm = put_input_in_reg(ctx, inputs[1], narrow_mode);
1600 lower_vector_compare(ctx, rd, rn, rm, ty, cond)?;
1601 cond
1602 } else {
1603 let rn = put_input_in_reg(ctx, inputs[0], narrow_mode);
1604 let rm = put_input_in_rse_imm12(ctx, inputs[1], narrow_mode);
1605
1606 let is_overflow = condcode == IntCC::Overflow || condcode == IntCC::NotOverflow;
1607 let is_small_type = ty == I8 || ty == I16;
1608 let (cond, rn, rm) = if is_overflow && is_small_type {
1609 // Overflow checks for non native types require additional instructions, other than
1610 // just the extend op.
1611 //
1612 // TODO: Codegen improvements: Merge the second sxt{h,b} into the following sub instruction.
1613 //
1614 // sxt{h,b} w0, w0
1615 // sxt{h,b} w1, w1
1616 // sub w0, w0, w1
1617 // cmp w0, w0, sxt{h,b}
1618 //
1619 // The result of this comparison is either the EQ or NE condition code, so we need to
1620 // signal that to the caller
1621
1622 let extend_op = if ty == I8 {
1623 ExtendOp::SXTB
1624 } else {
1625 ExtendOp::SXTH
1626 };
1627 let tmp1 = ctx.alloc_tmp(I32).only_reg().unwrap();
1628 ctx.emit(alu_inst_imm12(ALUOp::Sub32, tmp1, rn, rm));
1629
1630 let out_cond = match condcode {
1631 IntCC::Overflow => Cond::Ne,
1632 IntCC::NotOverflow => Cond::Eq,
1633 _ => unreachable!(),
1634 };
1635 (
1636 out_cond,
1637 tmp1.to_reg(),
1638 ResultRSEImm12::RegExtend(tmp1.to_reg(), extend_op),
1639 )
1640 } else {
1641 (cond, rn, rm)
1642 };
1643
1644 let alu_op = choose_32_64(ty, ALUOp::SubS32, ALUOp::SubS64);
1645 ctx.emit(alu_inst_imm12(alu_op, writable_zero_reg(), rn, rm));
1646 cond
1647 };
1648
    // Most of the comparisons above produce flags by default; if the caller
    // requested the result in a register, we materialize those flags into a
    // register. Some branches above already produce the result in a register,
    // so we skip those.
    if should_materialize {
        materialize_bool_result(ctx, insn, rd, cond);
    }

    Ok(match output {
        // We currently never emit the result in a register other than the one
        // that was asked for.
        IcmpOutput::Register(_) => IcmpResult::Register,
        IcmpOutput::CondCode => IcmpResult::CondCode(out_condcode),
    })
}

pub(crate) fn lower_fcmp_or_ffcmp_to_flags<C: LowerCtx<I = Inst>>(ctx: &mut C, insn: IRInst) {
    let ty = ctx.input_ty(insn, 0);
    let bits = ty_bits(ty);
    let inputs = [InsnInput { insn, input: 0 }, InsnInput { insn, input: 1 }];
    let rn = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None);
    let rm = put_input_in_reg(ctx, inputs[1], NarrowValueMode::None);
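    // `fcmp` sets the NZCV flags directly; callers then branch or select on a
    // float condition derived from those flags.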
    match bits {
        32 => {
            ctx.emit(Inst::FpuCmp32 { rn, rm });
        }
        64 => {
            ctx.emit(Inst::FpuCmp64 { rn, rm });
        }
        _ => panic!("Unknown float size"),
    }
}

/// Materialize a boolean value into a register from the flags
/// (e.g. set by a comparison).
/// Produces a 0 / -1 (all-ones) result, as expected for bool operations.
pub(crate) fn materialize_bool_result<C: LowerCtx<I = Inst>>(
    ctx: &mut C,
    insn: IRInst,
    rd: Writable<Reg>,
    cond: Cond,
) {
    // A boolean is 0 / -1; if output width is > 1 use `csetm`,
    // otherwise use `cset`.
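    // For example, with flags set by `subs xzr, x0, x1` and `cond == Cond::Lt`,
    // `csetm rd, lt` yields all-ones (-1) when x0 < x1 and 0 otherwise, while
    // `cset rd, lt` yields 1 or 0 for a single-bit boolean.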
    if ty_bits(ctx.output_ty(insn, 0)) > 1 {
        ctx.emit(Inst::CSetm { rd, cond });
    } else {
        ctx.emit(Inst::CSet { rd, cond });
    }
}

pub(crate) fn lower_shift_amt<C: LowerCtx<I = Inst>>(
    ctx: &mut C,
    amt_input: InsnInput,
    dst_ty: Type,
    tmp_reg: Writable<Reg>,
) -> ResultRegImmShift {
    let amt_ty = ctx.input_ty(amt_input.insn, amt_input.input);

    match (dst_ty, amt_ty) {
        // When shifting by amounts larger than the size of the type, the CLIF
        // shift instructions implement a "wrapping" behaviour, such that an
        // i8 << 8 is equivalent to i8 << 0.
        //
        // On i32 and i64 types this matches what the AArch64 spec does, but
        // on smaller types (i16, i8) we need to do it manually, so we wrap
        // the shift amount with an AND instruction.
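        // For example, shifting an i8 by 9 must behave like shifting by
        // 9 & 0x7 == 1, which is what the mask below implements.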
        (I16 | I8, _) => {
            // We can ignore the top half of the shift amount register if the
            // amount type is I128.
            let amt_reg = put_input_in_regs(ctx, amt_input).regs()[0];
            let mask = (ty_bits(dst_ty) - 1) as u64;
            ctx.emit(Inst::AluRRImmLogic {
                alu_op: ALUOp::And32,
                rd: tmp_reg,
                rn: amt_reg,
                imml: ImmLogic::maybe_from_u64(mask, I32).unwrap(),
            });
            ResultRegImmShift::Reg(tmp_reg.to_reg())
        }
        // TODO: We can use immlogic for i128 types here
        (I128, _) | (_, I128) => {
            // I128 shifts always take the shift amount in a register.
            ResultRegImmShift::Reg(put_input_in_regs(ctx, amt_input).regs()[0])
        }
        _ => put_input_in_reg_immshift(ctx, amt_input, ty_bits(dst_ty)),
    }
}

/// This is target-word-size dependent, and it excludes booleans and reftypes.
pub(crate) fn is_valid_atomic_transaction_ty(ty: Type) -> bool {
    match ty {
        I8 | I16 | I32 | I64 => true,
        _ => false,
    }
}

pub(crate) fn emit_atomic_load<C: LowerCtx<I = Inst>>(
    ctx: &mut C,
    rt: Writable<Reg>,
    insn: IRInst,
) {
    assert!(ctx.data(insn).opcode() == Opcode::AtomicLoad);
    let inputs = insn_inputs(ctx, insn);
    let rn = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None);
    let access_ty = ctx.output_ty(insn, 0);
    assert!(is_valid_atomic_transaction_ty(access_ty));
    // We're ignoring the result type of the load because the LoadAcquire will
    // explicitly zero-extend to the nearest word, and also zero the high half
    // of an X register.
    ctx.emit(Inst::LoadAcquire { access_ty, rt, rn });
}

fn load_op_to_ty(op: Opcode) -> Option<Type> {
    match op {
        Opcode::Sload8 | Opcode::Uload8 | Opcode::Sload8Complex | Opcode::Uload8Complex => Some(I8),
        Opcode::Sload16 | Opcode::Uload16 | Opcode::Sload16Complex | Opcode::Uload16Complex => {
            Some(I16)
        }
        Opcode::Sload32 | Opcode::Uload32 | Opcode::Sload32Complex | Opcode::Uload32Complex => {
            Some(I32)
        }
        Opcode::Load | Opcode::LoadComplex => None,
        Opcode::Sload8x8 | Opcode::Uload8x8 | Opcode::Sload8x8Complex | Opcode::Uload8x8Complex => {
            Some(I8X8)
        }
        Opcode::Sload16x4
        | Opcode::Uload16x4
        | Opcode::Sload16x4Complex
        | Opcode::Uload16x4Complex => Some(I16X4),
        Opcode::Sload32x2
        | Opcode::Uload32x2
        | Opcode::Sload32x2Complex
        | Opcode::Uload32x2Complex => Some(I32X2),
        _ => None,
    }
}

/// Helper to lower a load instruction; this is used in several places, because
/// a load can sometimes be merged into another operation.
pub(crate) fn lower_load<
    C: LowerCtx<I = Inst>,
    F: FnMut(&mut C, ValueRegs<Writable<Reg>>, Type, AMode),
>(
    ctx: &mut C,
    ir_inst: IRInst,
    inputs: &[InsnInput],
    output: InsnOutput,
    mut f: F,
) {
    let op = ctx.data(ir_inst).opcode();

    let elem_ty = load_op_to_ty(op).unwrap_or_else(|| ctx.output_ty(ir_inst, 0));

    let off = ctx.data(ir_inst).load_store_offset().unwrap();
    let mem = lower_address(ctx, elem_ty, &inputs[..], off);
    let rd = get_output_reg(ctx, output);

    f(ctx, rd, elem_ty, mem);
}

pub(crate) fn emit_shl_i128<C: LowerCtx<I = Inst>>(
    ctx: &mut C,
    src: ValueRegs<Reg>,
    dst: ValueRegs<Writable<Reg>>,
    amt: Reg,
) {
    let src_lo = src.regs()[0];
    let src_hi = src.regs()[1];
    let dst_lo = dst.regs()[0];
    let dst_hi = dst.regs()[1];

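    // The sequence below implements a funnel shift: for amt in 0..=63,
    // dst_hi = (src_hi << amt) | (src_lo >> (64 - amt)), where the right
    // shift is computed as (src_lo >> 1) >> (63 - (amt & 63)) so that it
    // stays well-defined when amt == 0. If bit 6 of amt is set (amt >= 64),
    // the csels instead select dst_hi = src_lo << (amt & 63) and dst_lo = 0.
    //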
    // mvn inv_amt, amt
    // lsr tmp1, src_lo, #1
    // lsl tmp2, src_hi, amt
    // lsr tmp1, tmp1, inv_amt
    // lsl tmp3, src_lo, amt
    // tst amt, #0x40
    // orr tmp2, tmp2, tmp1
    // csel dst_hi, tmp3, tmp2, ne
    // csel dst_lo, xzr, tmp3, ne

    let xzr = writable_zero_reg();
    let inv_amt = ctx.alloc_tmp(I64).only_reg().unwrap();
    let tmp1 = ctx.alloc_tmp(I64).only_reg().unwrap();
    let tmp2 = ctx.alloc_tmp(I64).only_reg().unwrap();
    let tmp3 = ctx.alloc_tmp(I64).only_reg().unwrap();

    ctx.emit(Inst::AluRRR {
        alu_op: ALUOp::OrrNot32,
        rd: inv_amt,
        rn: xzr.to_reg(),
        rm: amt,
    });

    ctx.emit(Inst::AluRRImmShift {
        alu_op: ALUOp::Lsr64,
        rd: tmp1,
        rn: src_lo,
        immshift: ImmShift::maybe_from_u64(1).unwrap(),
    });

    ctx.emit(Inst::AluRRR {
        alu_op: ALUOp::Lsl64,
        rd: tmp2,
        rn: src_hi,
        rm: amt,
    });

    ctx.emit(Inst::AluRRR {
        alu_op: ALUOp::Lsr64,
        rd: tmp1,
        rn: tmp1.to_reg(),
        rm: inv_amt.to_reg(),
    });

    ctx.emit(Inst::AluRRR {
        alu_op: ALUOp::Lsl64,
        rd: tmp3,
        rn: src_lo,
        rm: amt,
    });

    ctx.emit(Inst::AluRRImmLogic {
        alu_op: ALUOp::AndS64,
        rd: xzr,
        rn: amt,
        imml: ImmLogic::maybe_from_u64(64, I64).unwrap(),
    });

    ctx.emit(Inst::AluRRR {
        alu_op: ALUOp::Orr64,
        rd: tmp2,
        rn: tmp2.to_reg(),
        rm: tmp1.to_reg(),
    });

    ctx.emit(Inst::CSel {
        cond: Cond::Ne,
        rd: dst_hi,
        rn: tmp3.to_reg(),
        rm: tmp2.to_reg(),
    });

    ctx.emit(Inst::CSel {
        cond: Cond::Ne,
        rd: dst_lo,
        rn: xzr.to_reg(),
        rm: tmp3.to_reg(),
    });
}

pub(crate) fn emit_shr_i128<C: LowerCtx<I = Inst>>(
    ctx: &mut C,
    src: ValueRegs<Reg>,
    dst: ValueRegs<Writable<Reg>>,
    amt: Reg,
    is_signed: bool,
) {
    let src_lo = src.regs()[0];
    let src_hi = src.regs()[1];
    let dst_lo = dst.regs()[0];
    let dst_hi = dst.regs()[1];

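    // The sequence below is the mirror-image funnel shift: for amt in 0..=63,
    // dst_lo = (src_lo >> amt) | (src_hi << (64 - amt)), where the left shift
    // is computed as (src_hi << 1) << (63 - (amt & 63)) so that it stays
    // well-defined when amt == 0. If bit 6 of amt is set (amt >= 64), the
    // csels select dst_lo = src_hi >> (amt & 63) (arithmetic if signed), and
    // dst_hi becomes 0 (unsigned) or the sign-fill of src_hi (signed).
    //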
    // Note: the emitted instructions below shift src_hi into the low half and
    // src_lo right, the mirror image of emit_shl_i128:
    //
    // mvn inv_amt, amt
    // lsl tmp1, src_hi, #1
    // lsr tmp2, src_lo, amt
    // lsl tmp1, tmp1, inv_amt
    // lsr/asr tmp3, src_hi, amt
    // tst amt, #0x40
    // orr tmp2, tmp2, tmp1
    //
    // if signed:
    //     asr tmp4, src_hi, #63
    //     csel dst_hi, tmp4, tmp3, ne
    // else:
    //     csel dst_hi, xzr, tmp3, ne
    //
    // csel dst_lo, tmp3, tmp2, ne

    let xzr = writable_zero_reg();
    let inv_amt = ctx.alloc_tmp(I64).only_reg().unwrap();
    let tmp1 = ctx.alloc_tmp(I64).only_reg().unwrap();
    let tmp2 = ctx.alloc_tmp(I64).only_reg().unwrap();
    let tmp3 = ctx.alloc_tmp(I64).only_reg().unwrap();
    let tmp4 = ctx.alloc_tmp(I64).only_reg().unwrap();

    let shift_op = if is_signed {
        ALUOp::Asr64
    } else {
        ALUOp::Lsr64
    };

    ctx.emit(Inst::AluRRR {
        alu_op: ALUOp::OrrNot32,
        rd: inv_amt,
        rn: xzr.to_reg(),
        rm: amt,
    });

    ctx.emit(Inst::AluRRImmShift {
        alu_op: ALUOp::Lsl64,
        rd: tmp1,
        rn: src_hi,
        immshift: ImmShift::maybe_from_u64(1).unwrap(),
    });

    ctx.emit(Inst::AluRRR {
        alu_op: ALUOp::Lsr64,
        rd: tmp2,
        rn: src_lo,
        rm: amt,
    });

    ctx.emit(Inst::AluRRR {
        alu_op: ALUOp::Lsl64,
        rd: tmp1,
        rn: tmp1.to_reg(),
        rm: inv_amt.to_reg(),
    });

    ctx.emit(Inst::AluRRR {
        alu_op: shift_op,
        rd: tmp3,
        rn: src_hi,
        rm: amt,
    });

    ctx.emit(Inst::AluRRImmLogic {
        alu_op: ALUOp::AndS64,
        rd: xzr,
        rn: amt,
        imml: ImmLogic::maybe_from_u64(64, I64).unwrap(),
    });

    if is_signed {
        ctx.emit(Inst::AluRRImmShift {
            alu_op: ALUOp::Asr64,
            rd: tmp4,
            rn: src_hi,
            immshift: ImmShift::maybe_from_u64(63).unwrap(),
        });
    }

    ctx.emit(Inst::AluRRR {
        alu_op: ALUOp::Orr64,
        rd: tmp2,
        rn: tmp2.to_reg(),
        rm: tmp1.to_reg(),
    });

    ctx.emit(Inst::CSel {
        cond: Cond::Ne,
        rd: dst_hi,
        rn: if is_signed { tmp4 } else { xzr }.to_reg(),
        rm: tmp3.to_reg(),
    });

    ctx.emit(Inst::CSel {
        cond: Cond::Ne,
        rd: dst_lo,
        rn: tmp3.to_reg(),
        rm: tmp2.to_reg(),
    });
}

pub(crate) fn emit_clz_i128<C: LowerCtx<I = Inst>>(
    ctx: &mut C,
    src: ValueRegs<Reg>,
    dst: ValueRegs<Writable<Reg>>,
) {
    let src_lo = src.regs()[0];
    let src_hi = src.regs()[1];
    let dst_lo = dst.regs()[0];
    let dst_hi = dst.regs()[1];

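    // clz(src_hi) is 64 exactly when the high half is all zeroes, so
    // tmp = clz(src_hi) >> 6 is 1 in that case and 0 otherwise. The madd
    // then yields clz(src_hi) when the high half is nonzero, and
    // 64 + clz(src_lo) when it is zero:
    //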
    // clz dst_hi, src_hi
    // clz dst_lo, src_lo
    // lsr tmp, dst_hi, #6
    // madd dst_lo, dst_lo, tmp, dst_hi
    // mov dst_hi, 0

    let tmp = ctx.alloc_tmp(I64).only_reg().unwrap();

    ctx.emit(Inst::BitRR {
        rd: dst_hi,
        rn: src_hi,
        op: BitOp::Clz64,
    });
    ctx.emit(Inst::BitRR {
        rd: dst_lo,
        rn: src_lo,
        op: BitOp::Clz64,
    });
    ctx.emit(Inst::AluRRImmShift {
        alu_op: ALUOp::Lsr64,
        rd: tmp,
        rn: dst_hi.to_reg(),
        immshift: ImmShift::maybe_from_u64(6).unwrap(),
    });
    ctx.emit(Inst::AluRRRR {
        alu_op: ALUOp3::MAdd64,
        rd: dst_lo,
        rn: dst_lo.to_reg(),
        rm: tmp.to_reg(),
        ra: dst_hi.to_reg(),
    });
    lower_constant_u64(ctx, dst_hi, 0);
}

//=============================================================================
// Lowering-backend trait implementation.

impl LowerBackend for AArch64Backend {
    type MInst = Inst;

    fn lower<C: LowerCtx<I = Inst>>(&self, ctx: &mut C, ir_inst: IRInst) -> CodegenResult<()> {
        lower_inst::lower_insn_to_regs(ctx, ir_inst, &self.flags, &self.isa_flags)
    }

    fn lower_branch_group<C: LowerCtx<I = Inst>>(
        &self,
        ctx: &mut C,
        branches: &[IRInst],
        targets: &[MachLabel],
    ) -> CodegenResult<()> {
        lower_inst::lower_branch(ctx, branches, targets)
    }

    fn maybe_pinned_reg(&self) -> Option<Reg> {
        Some(xreg(PINNED_REG))
    }
}