//! Lower a single Cranelift instruction into vcode.

use crate::binemit::CodeOffset;
use crate::ir::condcodes::FloatCC;
use crate::ir::types::*;
use crate::ir::Inst as IRInst;
use crate::ir::{InstructionData, Opcode, TrapCode};
use crate::isa::aarch64::settings as aarch64_settings;
use crate::machinst::lower::*;
use crate::machinst::*;
use crate::settings::{Flags, TlsModel};
use crate::{CodegenError, CodegenResult};

use crate::isa::aarch64::abi::*;
use crate::isa::aarch64::inst::*;

use regalloc::Writable;

use alloc::boxed::Box;
use alloc::vec::Vec;
use core::convert::TryFrom;

use super::lower::*;

/// Actually codegen an instruction's results into registers.
pub(crate) fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
    ctx: &mut C,
    insn: IRInst,
    flags: &Flags,
    isa_flags: &aarch64_settings::Flags,
) -> CodegenResult<()> {
    let op = ctx.data(insn).opcode();
    let inputs = insn_inputs(ctx, insn);
    let outputs = insn_outputs(ctx, insn);
    let ty = if !outputs.is_empty() {
        Some(ctx.output_ty(insn, 0))
    } else {
        None
    };

    match op {
        Opcode::Iconst | Opcode::Bconst | Opcode::Null => {
            let value = ctx.get_constant(insn).unwrap();
            // Sign-extend the constant if necessary.
            let value = match ty.unwrap() {
                I8 => (((value as i64) << 56) >> 56) as u64,
                I16 => (((value as i64) << 48) >> 48) as u64,
                I32 => (((value as i64) << 32) >> 32) as u64,
                I64 | R64 => value,
                ty if ty.is_bool() => value,
                ty => unreachable!("Unknown type for const: {}", ty),
            };
            let rd = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
            lower_constant_u64(ctx, rd, value);
        }
        Opcode::F32const => {
            let value = f32::from_bits(ctx.get_constant(insn).unwrap() as u32);
            let rd = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
            lower_constant_f32(ctx, rd, value);
        }
        Opcode::F64const => {
            let value = f64::from_bits(ctx.get_constant(insn).unwrap());
            let rd = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
            lower_constant_f64(ctx, rd, value);
        }
        Opcode::Iadd => {
            match ty.unwrap() {
                ty if ty.is_vector() => {
                    let rd = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
                    let rm = put_input_in_reg(ctx, inputs[1], NarrowValueMode::None);
                    let rn = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None);
                    ctx.emit(Inst::VecRRR {
                        rd,
                        rn,
                        rm,
                        alu_op: VecALUOp::Add,
                        size: VectorSize::from_ty(ty),
                    });
                }
                I128 => {
                    let lhs = put_input_in_regs(ctx, inputs[0]);
                    let rhs = put_input_in_regs(ctx, inputs[1]);
                    let dst = get_output_reg(ctx, outputs[0]);
                    assert_eq!(lhs.len(), 2);
                    assert_eq!(rhs.len(), 2);
                    assert_eq!(dst.len(), 2);

                    // adds    x0, x0, x2
                    // adc     x1, x1, x3

                    ctx.emit(Inst::AluRRR {
                        alu_op: ALUOp::AddS64,
                        rd: dst.regs()[0],
                        rn: lhs.regs()[0],
                        rm: rhs.regs()[0],
                    });
                    ctx.emit(Inst::AluRRR {
                        alu_op: ALUOp::Adc64,
                        rd: dst.regs()[1],
                        rn: lhs.regs()[1],
                        rm: rhs.regs()[1],
                    });
                }
                ty => {
                    let rd = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
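                    // Look for a multiply feeding either input, remembering
                    // which of our inputs holds the addend.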
                    let mul_insn = if let Some(mul_insn) =
                        maybe_input_insn(ctx, inputs[1], Opcode::Imul)
                    {
                        Some((mul_insn, 0))
                    } else if let Some(mul_insn) = maybe_input_insn(ctx, inputs[0], Opcode::Imul) {
                        Some((mul_insn, 1))
                    } else {
                        None
                    };
                    // If possible combine mul + add into madd.
                    if let Some((insn, addend_idx)) = mul_insn {
                        let alu_op = choose_32_64(ty, ALUOp3::MAdd32, ALUOp3::MAdd64);
                        let rn_input = InsnInput { insn, input: 0 };
                        let rm_input = InsnInput { insn, input: 1 };

                        let rn = put_input_in_reg(ctx, rn_input, NarrowValueMode::None);
                        let rm = put_input_in_reg(ctx, rm_input, NarrowValueMode::None);
                        let ra = put_input_in_reg(ctx, inputs[addend_idx], NarrowValueMode::None);

                        ctx.emit(Inst::AluRRRR {
                            alu_op,
                            rd,
                            rn,
                            rm,
                            ra,
                        });
                    } else {
                        let rn = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None);
                        let (rm, negated) = put_input_in_rse_imm12_maybe_negated(
                            ctx,
                            inputs[1],
                            ty_bits(ty),
                            NarrowValueMode::None,
                        );
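                        // If the immediate couldn't be encoded directly but its
                        // negation could, `negated` is set and we emit the
                        // opposite operation to compensate.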
                        let alu_op = if !negated {
                            choose_32_64(ty, ALUOp::Add32, ALUOp::Add64)
                        } else {
                            choose_32_64(ty, ALUOp::Sub32, ALUOp::Sub64)
                        };
                        ctx.emit(alu_inst_imm12(alu_op, rd, rn, rm));
                    }
                }
            }
        }
        Opcode::Isub => {
            let ty = ty.unwrap();
            if ty == I128 {
                let lhs = put_input_in_regs(ctx, inputs[0]);
                let rhs = put_input_in_regs(ctx, inputs[1]);
                let dst = get_output_reg(ctx, outputs[0]);
                assert_eq!(lhs.len(), 2);
                assert_eq!(rhs.len(), 2);
                assert_eq!(dst.len(), 2);

                // subs    x0, x0, x2
                // sbc     x1, x1, x3

                ctx.emit(Inst::AluRRR {
                    alu_op: ALUOp::SubS64,
                    rd: dst.regs()[0],
                    rn: lhs.regs()[0],
                    rm: rhs.regs()[0],
                });
                ctx.emit(Inst::AluRRR {
                    alu_op: ALUOp::Sbc64,
                    rd: dst.regs()[1],
                    rn: lhs.regs()[1],
                    rm: rhs.regs()[1],
                });
            } else {
                let rd = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
                let rn = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None);
                if !ty.is_vector() {
                    let (rm, negated) = put_input_in_rse_imm12_maybe_negated(
                        ctx,
                        inputs[1],
                        ty_bits(ty),
                        NarrowValueMode::None,
                    );
                    let alu_op = if !negated {
                        choose_32_64(ty, ALUOp::Sub32, ALUOp::Sub64)
                    } else {
                        choose_32_64(ty, ALUOp::Add32, ALUOp::Add64)
                    };
                    ctx.emit(alu_inst_imm12(alu_op, rd, rn, rm));
                } else {
                    let rm = put_input_in_reg(ctx, inputs[1], NarrowValueMode::None);
                    ctx.emit(Inst::VecRRR {
                        rd,
                        rn,
                        rm,
                        alu_op: VecALUOp::Sub,
                        size: VectorSize::from_ty(ty),
                    });
                }
            }
        }
        Opcode::UaddSat | Opcode::SaddSat | Opcode::UsubSat | Opcode::SsubSat => {
            let ty = ty.unwrap();
            assert!(ty.is_vector());
            let rd = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
            let rn = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None);
            let rm = put_input_in_reg(ctx, inputs[1], NarrowValueMode::None);

            let alu_op = match op {
                Opcode::UaddSat => VecALUOp::Uqadd,
                Opcode::SaddSat => VecALUOp::Sqadd,
                Opcode::UsubSat => VecALUOp::Uqsub,
                Opcode::SsubSat => VecALUOp::Sqsub,
                _ => unreachable!(),
            };

            ctx.emit(Inst::VecRRR {
                rd,
                rn,
                rm,
                alu_op,
                size: VectorSize::from_ty(ty),
            });
        }

        Opcode::Ineg => {
            let rd = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
            let ty = ty.unwrap();
            if !ty.is_vector() {
                let rn = zero_reg();
                let rm = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None);
                let alu_op = choose_32_64(ty, ALUOp::Sub32, ALUOp::Sub64);
                ctx.emit(Inst::AluRRR { alu_op, rd, rn, rm });
            } else {
                let rn = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None);
                ctx.emit(Inst::VecMisc {
                    op: VecMisc2::Neg,
                    rd,
                    rn,
                    size: VectorSize::from_ty(ty),
                });
            }
        }

        Opcode::Imul => {
            let ty = ty.unwrap();
            if ty == I128 {
                let lhs = put_input_in_regs(ctx, inputs[0]);
                let rhs = put_input_in_regs(ctx, inputs[1]);
                let dst = get_output_reg(ctx, outputs[0]);
                assert_eq!(lhs.len(), 2);
                assert_eq!(rhs.len(), 2);
                assert_eq!(dst.len(), 2);

                // 128-bit mul formula:
                //   dst_lo = lhs_lo * rhs_lo
                //   dst_hi = umulhi(lhs_lo, rhs_lo) + (lhs_lo * rhs_hi) + (lhs_hi * rhs_lo)
                //
                // We can convert the above formula into the following:
                // umulh   dst_hi, lhs_lo, rhs_lo
                // madd    dst_hi, lhs_lo, rhs_hi, dst_hi
                // madd    dst_hi, lhs_hi, rhs_lo, dst_hi
                // mul     dst_lo, lhs_lo, rhs_lo

                ctx.emit(Inst::AluRRR {
                    alu_op: ALUOp::UMulH,
                    rd: dst.regs()[1],
                    rn: lhs.regs()[0],
                    rm: rhs.regs()[0],
                });
                ctx.emit(Inst::AluRRRR {
                    alu_op: ALUOp3::MAdd64,
                    rd: dst.regs()[1],
                    rn: lhs.regs()[0],
                    rm: rhs.regs()[1],
                    ra: dst.regs()[1].to_reg(),
                });
                ctx.emit(Inst::AluRRRR {
                    alu_op: ALUOp3::MAdd64,
                    rd: dst.regs()[1],
                    rn: lhs.regs()[1],
                    rm: rhs.regs()[0],
                    ra: dst.regs()[1].to_reg(),
                });
                ctx.emit(Inst::AluRRRR {
                    alu_op: ALUOp3::MAdd64,
                    rd: dst.regs()[0],
                    rn: lhs.regs()[0],
                    rm: rhs.regs()[0],
                    ra: zero_reg(),
                });
            } else if ty.is_vector() {
                for ext_op in &[
                    Opcode::SwidenLow,
                    Opcode::SwidenHigh,
                    Opcode::UwidenLow,
                    Opcode::UwidenHigh,
                ] {
                    if let Some((alu_op, rn, rm, high_half)) =
                        match_vec_long_mul(ctx, insn, *ext_op)
                    {
                        let rd = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
                        ctx.emit(Inst::VecRRRLong {
                            alu_op,
                            rd,
                            rn,
                            rm,
                            high_half,
                        });
                        return Ok(());
                    }
                }
                if ty == I64X2 {
                    lower_i64x2_mul(ctx, insn);
                } else {
                    let rn = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None);
                    let rm = put_input_in_reg(ctx, inputs[1], NarrowValueMode::None);
                    let rd = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
                    ctx.emit(Inst::VecRRR {
                        alu_op: VecALUOp::Mul,
                        rd,
                        rn,
                        rm,
                        size: VectorSize::from_ty(ty),
                    });
                }
            } else {
                let alu_op = choose_32_64(ty, ALUOp3::MAdd32, ALUOp3::MAdd64);
                let rn = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None);
                let rm = put_input_in_reg(ctx, inputs[1], NarrowValueMode::None);
                let rd = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
                ctx.emit(Inst::AluRRRR {
                    alu_op,
                    rd,
                    rn,
                    rm,
                    ra: zero_reg(),
                });
            }
        }

        Opcode::Umulhi | Opcode::Smulhi => {
            let rd = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
            let is_signed = op == Opcode::Smulhi;
            let input_ty = ctx.input_ty(insn, 0);
            assert_eq!(ctx.input_ty(insn, 1), input_ty);
            assert_eq!(ctx.output_ty(insn, 0), input_ty);

            match input_ty {
                I64 => {
                    let rn = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None);
                    let rm = put_input_in_reg(ctx, inputs[1], NarrowValueMode::None);
                    let alu_op = if is_signed {
                        ALUOp::SMulH
                    } else {
                        ALUOp::UMulH
                    };
                    ctx.emit(Inst::AluRRR { alu_op, rd, rn, rm });
                }
                I32 | I16 | I8 => {
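                    // There is no narrow "high half" multiply, so extend both
                    // operands to 64 bits, do a full 64-bit multiply, and
                    // shift the high half of the product down into place.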
                    let narrow_mode = if is_signed {
                        NarrowValueMode::SignExtend64
                    } else {
                        NarrowValueMode::ZeroExtend64
                    };
                    let rn = put_input_in_reg(ctx, inputs[0], narrow_mode);
                    let rm = put_input_in_reg(ctx, inputs[1], narrow_mode);
                    let ra = zero_reg();
                    ctx.emit(Inst::AluRRRR {
                        alu_op: ALUOp3::MAdd64,
                        rd,
                        rn,
                        rm,
                        ra,
                    });
                    let shift_op = if is_signed {
                        ALUOp::Asr64
                    } else {
                        ALUOp::Lsr64
                    };
                    let shift_amt = match input_ty {
                        I32 => 32,
                        I16 => 16,
                        I8 => 8,
                        _ => unreachable!(),
                    };
                    ctx.emit(Inst::AluRRImmShift {
                        alu_op: shift_op,
                        rd,
                        rn: rd.to_reg(),
                        immshift: ImmShift::maybe_from_u64(shift_amt).unwrap(),
                    });
                }
                _ => {
                    panic!("Unsupported argument type for umulhi/smulhi: {}", input_ty);
                }
            }
        }

        Opcode::Udiv | Opcode::Sdiv | Opcode::Urem | Opcode::Srem => {
            let is_signed = match op {
                Opcode::Udiv | Opcode::Urem => false,
                Opcode::Sdiv | Opcode::Srem => true,
                _ => unreachable!(),
            };
            let is_rem = match op {
                Opcode::Udiv | Opcode::Sdiv => false,
                Opcode::Urem | Opcode::Srem => true,
                _ => unreachable!(),
            };
            let narrow_mode = if is_signed {
                NarrowValueMode::SignExtend64
            } else {
                NarrowValueMode::ZeroExtend64
            };
            // TODO: Add SDiv32 to implement 32-bit directly, rather
            // than extending the input.
            let div_op = if is_signed {
                ALUOp::SDiv64
            } else {
                ALUOp::UDiv64
            };

            let rd = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
            let rn = put_input_in_reg(ctx, inputs[0], narrow_mode);
            let rm = put_input_in_reg(ctx, inputs[1], narrow_mode);
            // The div instruction does not trap on divide by zero or signed
            // overflow, so checks are inserted below.
            //
            //   div rd, rn, rm
            ctx.emit(Inst::AluRRR {
                alu_op: div_op,
                rd,
                rn,
                rm,
            });

            if is_rem {
                // Remainder (rn % rm) is implemented as:
                //
                //   tmp = rn / rm
                //   rd = rn - (tmp*rm)
                //
                // Using 'rd' for tmp, this becomes:
                //
                //   div rd, rn, rm       ; rd = rn / rm
                //   cbnz rm, #8          ; branch over trap
                //   udf                  ; divide by zero
                //   msub rd, rd, rm, rn  ; rd = rn - rd * rm

                // Check for divide by 0.
                let trap_code = TrapCode::IntegerDivisionByZero;
                ctx.emit(Inst::TrapIf {
                    trap_code,
                    kind: CondBrKind::Zero(rm),
                });

                ctx.emit(Inst::AluRRRR {
                    alu_op: ALUOp3::MSub64,
                    rd,
                    rn: rd.to_reg(),
                    rm,
                    ra: rn,
                });
            } else {
                if div_op == ALUOp::SDiv64 {
                    //   cbnz rm, #8
                    //   udf ; divide by zero
                    //   cmn rm, 1
                    //   ccmp rn, 1, #nzcv, eq
                    //   b.vc #8
                    //   udf ; signed overflow

                    // Check for divide by 0.
                    let trap_code = TrapCode::IntegerDivisionByZero;
                    ctx.emit(Inst::TrapIf {
                        trap_code,
                        kind: CondBrKind::Zero(rm),
                    });

                    // Check for signed overflow. The only case is min_value / -1.
                    let ty = ty.unwrap();
                    // The following checks must be done in 32-bit or 64-bit,
                    // depending on the input type, even though the initial div
                    // instruction is currently always done in 64-bit.
                    let size = OperandSize::from_ty(ty);
                    // Check RHS is -1.
                    ctx.emit(Inst::AluRRImm12 {
                        alu_op: choose_32_64(ty, ALUOp::AddS32, ALUOp::AddS64),
                        rd: writable_zero_reg(),
                        rn: rm,
                        imm12: Imm12::maybe_from_u64(1).unwrap(),
                    });
                    // Check LHS is min_value, by subtracting 1 and branching if
                    // there is overflow.
                    ctx.emit(Inst::CCmpImm {
                        size,
                        rn,
                        imm: UImm5::maybe_from_u8(1).unwrap(),
                        nzcv: NZCV::new(false, false, false, false),
                        cond: Cond::Eq,
                    });
                    let trap_code = TrapCode::IntegerOverflow;
                    ctx.emit(Inst::TrapIf {
                        trap_code,
                        kind: CondBrKind::Cond(Cond::Vs),
                    });
                } else {
                    //   cbnz rm, #8
                    //   udf ; divide by zero

                    // Check for divide by 0.
                    let trap_code = TrapCode::IntegerDivisionByZero;
                    ctx.emit(Inst::TrapIf {
                        trap_code,
                        kind: CondBrKind::Zero(rm),
                    });
                }
            }
        }

        Opcode::Uextend | Opcode::Sextend => {
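            // A `uextend` of an atomic load can be merged into the load
            // itself, since the narrow atomic loads already zero-extend
            // their result.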
            if op == Opcode::Uextend {
                let inputs = ctx.get_input_as_source_or_const(inputs[0].insn, inputs[0].input);
                if let Some((atomic_load, 0)) = inputs.inst {
                    if ctx.data(atomic_load).opcode() == Opcode::AtomicLoad {
                        let output_ty = ty.unwrap();
                        assert!(output_ty == I32 || output_ty == I64);
                        let rt = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
                        emit_atomic_load(ctx, rt, atomic_load);
                        ctx.sink_inst(atomic_load);
                        return Ok(());
                    }
                }
            }
            let output_ty = ty.unwrap();
            let input_ty = ctx.input_ty(insn, 0);
            let from_bits = ty_bits(input_ty) as u8;
            let to_bits = ty_bits(output_ty) as u8;
            let to_bits = std::cmp::max(32, to_bits);
            assert!(from_bits <= to_bits);

            let signed = op == Opcode::Sextend;
            let dst = get_output_reg(ctx, outputs[0]);
            let src =
                if let Some(extract_insn) = maybe_input_insn(ctx, inputs[0], Opcode::Extractlane) {
                    put_input_in_regs(
                        ctx,
                        InsnInput {
                            insn: extract_insn,
                            input: 0,
                        },
                    )
                } else {
                    put_input_in_regs(ctx, inputs[0])
                };

            let needs_extend = from_bits < to_bits && to_bits <= 64;
            // For i128, we want to extend the lower half, unless it is already 64 bits.
            let needs_lower_extend = to_bits > 64 && from_bits < 64;
            let pass_through_lower = to_bits > 64 && !needs_lower_extend;

            if needs_extend || needs_lower_extend {
                let rn = src.regs()[0];
                let rd = dst.regs()[0];

                if let Some(extract_insn) = maybe_input_insn(ctx, inputs[0], Opcode::Extractlane) {
                    let idx =
                        if let InstructionData::BinaryImm8 { imm, .. } = ctx.data(extract_insn) {
                            *imm
                        } else {
                            unreachable!();
                        };

                    let size = VectorSize::from_ty(ctx.input_ty(extract_insn, 0));

                    if signed {
                        let scalar_size = OperandSize::from_ty(output_ty);

                        ctx.emit(Inst::MovFromVecSigned {
                            rd,
                            rn,
                            idx,
                            size,
                            scalar_size,
                        });
                    } else {
                        ctx.emit(Inst::MovFromVec { rd, rn, idx, size });
                    }
                } else {
                    // If we reach this point, we weren't able to incorporate the extend as
                    // a register-mode on another instruction, so we have a 'None'
                    // narrow-value/extend mode here, and we emit the explicit instruction.
                    let rn = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None);
                    ctx.emit(Inst::Extend {
                        rd,
                        rn,
                        signed,
                        from_bits,
                        to_bits: std::cmp::min(64, to_bits),
                    });
                }
            } else if pass_through_lower {
                ctx.emit(Inst::gen_move(dst.regs()[0], src.regs()[0], I64));
            }

            if output_ty == I128 {
                if signed {
                    ctx.emit(Inst::AluRRImmShift {
                        alu_op: ALUOp::Asr64,
                        rd: dst.regs()[1],
                        rn: dst.regs()[0].to_reg(),
                        immshift: ImmShift::maybe_from_u64(63).unwrap(),
                    });
                } else {
                    lower_constant_u64(ctx, dst.regs()[1], 0);
                }
            }
        }

        Opcode::Bnot => {
            let out_regs = get_output_reg(ctx, outputs[0]);
            let ty = ty.unwrap();
            if ty == I128 {
                // TODO: We can merge this block with the one below once we support immlogic here
                let in_regs = put_input_in_regs(ctx, inputs[0]);
                ctx.emit(Inst::AluRRR {
                    alu_op: ALUOp::OrrNot64,
                    rd: out_regs.regs()[0],
                    rn: zero_reg(),
                    rm: in_regs.regs()[0],
                });
                ctx.emit(Inst::AluRRR {
                    alu_op: ALUOp::OrrNot64,
                    rd: out_regs.regs()[1],
                    rn: zero_reg(),
                    rm: in_regs.regs()[1],
                });
            } else if !ty.is_vector() {
                let rd = out_regs.only_reg().unwrap();
                let rm = put_input_in_rs_immlogic(ctx, inputs[0], NarrowValueMode::None);
                let alu_op = choose_32_64(ty, ALUOp::OrrNot32, ALUOp::OrrNot64);
                // NOT rd, rm ==> ORR_NOT rd, zero, rm
                ctx.emit(alu_inst_immlogic(alu_op, rd, zero_reg(), rm));
            } else {
                let rd = out_regs.only_reg().unwrap();
                let rm = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None);
                ctx.emit(Inst::VecMisc {
                    op: VecMisc2::Not,
                    rd,
                    rn: rm,
                    size: VectorSize::from_ty(ty),
                });
            }
        }

        Opcode::Band
        | Opcode::Bor
        | Opcode::Bxor
        | Opcode::BandNot
        | Opcode::BorNot
        | Opcode::BxorNot => {
            let out_regs = get_output_reg(ctx, outputs[0]);
            let ty = ty.unwrap();
            if ty == I128 {
                // TODO: Support immlogic here
                let lhs = put_input_in_regs(ctx, inputs[0]);
                let rhs = put_input_in_regs(ctx, inputs[1]);
                let alu_op = match op {
                    Opcode::Band => ALUOp::And64,
                    Opcode::Bor => ALUOp::Orr64,
                    Opcode::Bxor => ALUOp::Eor64,
                    Opcode::BandNot => ALUOp::AndNot64,
                    Opcode::BorNot => ALUOp::OrrNot64,
                    Opcode::BxorNot => ALUOp::EorNot64,
                    _ => unreachable!(),
                };

                ctx.emit(Inst::AluRRR {
                    alu_op,
                    rd: out_regs.regs()[0],
                    rn: lhs.regs()[0],
                    rm: rhs.regs()[0],
                });
                ctx.emit(Inst::AluRRR {
                    alu_op,
                    rd: out_regs.regs()[1],
                    rn: lhs.regs()[1],
                    rm: rhs.regs()[1],
                });
            } else if !ty.is_vector() {
                let rd = out_regs.only_reg().unwrap();
                let rn = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None);
                let rm = put_input_in_rs_immlogic(ctx, inputs[1], NarrowValueMode::None);
                let alu_op = match op {
                    Opcode::Band => choose_32_64(ty, ALUOp::And32, ALUOp::And64),
                    Opcode::Bor => choose_32_64(ty, ALUOp::Orr32, ALUOp::Orr64),
                    Opcode::Bxor => choose_32_64(ty, ALUOp::Eor32, ALUOp::Eor64),
                    Opcode::BandNot => choose_32_64(ty, ALUOp::AndNot32, ALUOp::AndNot64),
                    Opcode::BorNot => choose_32_64(ty, ALUOp::OrrNot32, ALUOp::OrrNot64),
                    Opcode::BxorNot => choose_32_64(ty, ALUOp::EorNot32, ALUOp::EorNot64),
                    _ => unreachable!(),
                };
                ctx.emit(alu_inst_immlogic(alu_op, rd, rn, rm));
            } else {
                let alu_op = match op {
                    Opcode::Band => VecALUOp::And,
                    Opcode::BandNot => VecALUOp::Bic,
                    Opcode::Bor => VecALUOp::Orr,
                    Opcode::Bxor => VecALUOp::Eor,
                    _ => unreachable!(),
                };

                let rn = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None);
                let rm = put_input_in_reg(ctx, inputs[1], NarrowValueMode::None);
                let rd = out_regs.only_reg().unwrap();

                ctx.emit(Inst::VecRRR {
                    alu_op,
                    rd,
                    rn,
                    rm,
                    size: VectorSize::from_ty(ty),
                });
            }
        }

        Opcode::Ishl | Opcode::Ushr | Opcode::Sshr => {
            let out_regs = get_output_reg(ctx, outputs[0]);
            let ty = ty.unwrap();
            if ty == I128 {
                let src = put_input_in_regs(ctx, inputs[0]);
                let amt = lower_shift_amt(ctx, inputs[1], ty, out_regs.regs()[0]).unwrap_reg();

                match op {
                    Opcode::Ishl => emit_shl_i128(ctx, src, out_regs, amt),
                    Opcode::Ushr => {
                        emit_shr_i128(ctx, src, out_regs, amt, /* is_signed = */ false)
                    }
                    Opcode::Sshr => {
                        emit_shr_i128(ctx, src, out_regs, amt, /* is_signed = */ true)
                    }
                    _ => unreachable!(),
                };
            } else if !ty.is_vector() {
                let rd = out_regs.only_reg().unwrap();
                let size = OperandSize::from_bits(ty_bits(ty));
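                // The shift instruction operates on a full 32- or 64-bit
                // register, so a narrower right-shift input must first be
                // zero- or sign-extended; left shifts don't care about the
                // upper bits.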
                let narrow_mode = match (op, size) {
                    (Opcode::Ishl, _) => NarrowValueMode::None,
                    (Opcode::Ushr, OperandSize::Size64) => NarrowValueMode::ZeroExtend64,
                    (Opcode::Ushr, OperandSize::Size32) => NarrowValueMode::ZeroExtend32,
                    (Opcode::Sshr, OperandSize::Size64) => NarrowValueMode::SignExtend64,
                    (Opcode::Sshr, OperandSize::Size32) => NarrowValueMode::SignExtend32,
                    _ => unreachable!(),
                };
                let rn = put_input_in_reg(ctx, inputs[0], narrow_mode);
                let rm = lower_shift_amt(ctx, inputs[1], ty, out_regs.regs()[0]);
                let alu_op = match op {
                    Opcode::Ishl => choose_32_64(ty, ALUOp::Lsl32, ALUOp::Lsl64),
                    Opcode::Ushr => choose_32_64(ty, ALUOp::Lsr32, ALUOp::Lsr64),
                    Opcode::Sshr => choose_32_64(ty, ALUOp::Asr32, ALUOp::Asr64),
                    _ => unreachable!(),
                };
                ctx.emit(alu_inst_immshift(alu_op, rd, rn, rm));
            } else {
                let rd = out_regs.only_reg().unwrap();
                let rn = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None);
                let size = VectorSize::from_ty(ty);
                let (alu_op, is_right_shift) = match op {
                    Opcode::Ishl => (VecALUOp::Sshl, false),
                    Opcode::Ushr => (VecALUOp::Ushl, true),
                    Opcode::Sshr => (VecALUOp::Sshl, true),
                    _ => unreachable!(),
                };

                let rm = if is_right_shift {
                    // Right shifts are implemented with a negative left shift.
                    let tmp = ctx.alloc_tmp(I32).only_reg().unwrap();
                    let rm = put_input_in_reg(ctx, inputs[1], NarrowValueMode::None);
                    let rn = zero_reg();
                    ctx.emit(Inst::AluRRR {
                        alu_op: ALUOp::Sub32,
                        rd: tmp,
                        rn,
                        rm,
                    });
                    tmp.to_reg()
                } else {
                    put_input_in_reg(ctx, inputs[1], NarrowValueMode::None)
                };

                ctx.emit(Inst::VecDup { rd, rn: rm, size });

                ctx.emit(Inst::VecRRR {
                    alu_op,
                    rd,
                    rn,
                    rm: rd.to_reg(),
                    size,
                });
            }
        }

        Opcode::Rotr | Opcode::Rotl => {
            // aarch64 doesn't have a left-rotate instruction, but a left rotation of K places is
            // effectively a right rotation of N - K places, where N is the integer's bit size. We
            // implement left rotations with this trick.
            //
            // For a 32-bit or 64-bit rotate-right, we can use the ROR instruction directly.
            //
            // For a < 32-bit rotate-right, we synthesize this as:
            //
            //    rotr rd, rn, rm
            //
            //       =>
            //
            //    zero-extend rn, <32-or-64>
            //    and tmp_masked_rm, rm, <bitwidth - 1>
            //    sub tmp1, tmp_masked_rm, <bitwidth>
            //    sub tmp1, zero, tmp1  ; neg
            //    lsr tmp2, rn, tmp_masked_rm
            //    lsl rd, rn, tmp1
            //    orr rd, rd, tmp2
            //
            // For a constant amount, we can instead do:
            //
            //    zero-extend rn, <32-or-64>
            //    lsr tmp2, rn, #<shiftimm>
            //    lsl rd, rn, <bitwidth - shiftimm>
            //    orr rd, rd, tmp2

            let is_rotl = op == Opcode::Rotl;

            let ty = ty.unwrap();
            let ty_bits_size = ty_bits(ty) as u8;

            // TODO: We can do much better codegen if we have a constant amt
            if ty == I128 {
                let dst = get_output_reg(ctx, outputs[0]);
                let src = put_input_in_regs(ctx, inputs[0]);
                let amt_src = put_input_in_regs(ctx, inputs[1]).regs()[0];

                let tmp = ctx.alloc_tmp(I128);
                let inv_amt = ctx.alloc_tmp(I64).only_reg().unwrap();

                lower_constant_u64(ctx, inv_amt, 128);
                ctx.emit(Inst::AluRRR {
                    alu_op: ALUOp::Sub64,
                    rd: inv_amt,
                    rn: inv_amt.to_reg(),
                    rm: amt_src,
                });

                if is_rotl {
                    // rotl
                    // (shl.i128 tmp, amt)
                    // (ushr.i128 dst, 128-amt)

                    emit_shl_i128(ctx, src, tmp, amt_src);
                    emit_shr_i128(
                        ctx,
                        src,
                        dst,
                        inv_amt.to_reg(),
                        /* is_signed = */ false,
                    );
                } else {
                    // rotr
                    // (ushr.i128 tmp, amt)
                    // (shl.i128 dst, 128-amt)

                    emit_shr_i128(ctx, src, tmp, amt_src, /* is_signed = */ false);
                    emit_shl_i128(ctx, src, dst, inv_amt.to_reg());
                }

                ctx.emit(Inst::AluRRR {
                    alu_op: ALUOp::Orr64,
                    rd: dst.regs()[0],
                    rn: dst.regs()[0].to_reg(),
                    rm: tmp.regs()[0].to_reg(),
                });
                ctx.emit(Inst::AluRRR {
                    alu_op: ALUOp::Orr64,
                    rd: dst.regs()[1],
                    rn: dst.regs()[1].to_reg(),
                    rm: tmp.regs()[1].to_reg(),
                });

                return Ok(());
            }

            let rd = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
            let rn = put_input_in_reg(
                ctx,
                inputs[0],
                if ty_bits_size <= 32 {
                    NarrowValueMode::ZeroExtend32
                } else {
                    NarrowValueMode::ZeroExtend64
                },
            );
            let rm = put_input_in_reg_immshift(ctx, inputs[1], ty_bits(ty));

            if ty_bits_size == 32 || ty_bits_size == 64 {
                let alu_op = choose_32_64(ty, ALUOp::RotR32, ALUOp::RotR64);
                match rm {
                    ResultRegImmShift::ImmShift(mut immshift) => {
                        if is_rotl {
                            immshift.imm = ty_bits_size.wrapping_sub(immshift.value());
                        }
                        immshift.imm &= ty_bits_size - 1;
                        ctx.emit(Inst::AluRRImmShift {
                            alu_op,
                            rd,
                            rn,
                            immshift,
                        });
                    }

                    ResultRegImmShift::Reg(rm) => {
                        let rm = if is_rotl {
                            // Really ty_bits_size - rn, but the upper bits of the result are
                            // ignored (because of the implicit masking done by the instruction),
                            // so this is equivalent to negating the input.
                            let alu_op = choose_32_64(ty, ALUOp::Sub32, ALUOp::Sub64);
                            let tmp = ctx.alloc_tmp(ty).only_reg().unwrap();
                            ctx.emit(Inst::AluRRR {
                                alu_op,
                                rd: tmp,
                                rn: zero_reg(),
                                rm,
                            });
                            tmp.to_reg()
                        } else {
                            rm
                        };
                        ctx.emit(Inst::AluRRR { alu_op, rd, rn, rm });
                    }
                }
            } else {
                debug_assert!(ty_bits_size < 32);

                match rm {
                    ResultRegImmShift::Reg(reg) => {
                        let reg = if is_rotl {
                            // Really ty_bits_size - rn, but the upper bits of the result are
                            // ignored (because of the implicit masking done by the instruction),
                            // so this is equivalent to negating the input.
                            let tmp = ctx.alloc_tmp(I32).only_reg().unwrap();
                            ctx.emit(Inst::AluRRR {
                                alu_op: ALUOp::Sub32,
                                rd: tmp,
                                rn: zero_reg(),
                                rm: reg,
                            });
                            tmp.to_reg()
                        } else {
                            reg
                        };

                        // Explicitly mask the rotation count.
                        let tmp_masked_rm = ctx.alloc_tmp(I32).only_reg().unwrap();
                        ctx.emit(Inst::AluRRImmLogic {
                            alu_op: ALUOp::And32,
                            rd: tmp_masked_rm,
                            rn: reg,
                            imml: ImmLogic::maybe_from_u64((ty_bits_size - 1) as u64, I32).unwrap(),
                        });
                        let tmp_masked_rm = tmp_masked_rm.to_reg();

                        let tmp1 = ctx.alloc_tmp(I32).only_reg().unwrap();
                        let tmp2 = ctx.alloc_tmp(I32).only_reg().unwrap();
                        ctx.emit(Inst::AluRRImm12 {
                            alu_op: ALUOp::Sub32,
                            rd: tmp1,
                            rn: tmp_masked_rm,
                            imm12: Imm12::maybe_from_u64(ty_bits_size as u64).unwrap(),
                        });
                        ctx.emit(Inst::AluRRR {
                            alu_op: ALUOp::Sub32,
                            rd: tmp1,
                            rn: zero_reg(),
                            rm: tmp1.to_reg(),
                        });
                        ctx.emit(Inst::AluRRR {
                            alu_op: ALUOp::Lsr32,
                            rd: tmp2,
                            rn,
                            rm: tmp_masked_rm,
                        });
                        ctx.emit(Inst::AluRRR {
                            alu_op: ALUOp::Lsl32,
                            rd,
                            rn,
                            rm: tmp1.to_reg(),
                        });
                        ctx.emit(Inst::AluRRR {
                            alu_op: ALUOp::Orr32,
                            rd,
                            rn: rd.to_reg(),
                            rm: tmp2.to_reg(),
                        });
                    }

                    ResultRegImmShift::ImmShift(mut immshift) => {
                        if is_rotl {
                            immshift.imm = ty_bits_size.wrapping_sub(immshift.value());
                        }
                        immshift.imm &= ty_bits_size - 1;

                        let tmp1 = ctx.alloc_tmp(I32).only_reg().unwrap();
                        ctx.emit(Inst::AluRRImmShift {
                            alu_op: ALUOp::Lsr32,
                            rd: tmp1,
                            rn,
                            immshift: immshift.clone(),
                        });

                        let amount = immshift.value() & (ty_bits_size - 1);
                        let opp_shift =
                            ImmShift::maybe_from_u64(ty_bits_size as u64 - amount as u64).unwrap();
                        ctx.emit(Inst::AluRRImmShift {
                            alu_op: ALUOp::Lsl32,
                            rd,
                            rn,
                            immshift: opp_shift,
                        });

                        ctx.emit(Inst::AluRRR {
                            alu_op: ALUOp::Orr32,
                            rd,
                            rn: rd.to_reg(),
                            rm: tmp1.to_reg(),
                        });
                    }
                }
            }
        }

        Opcode::Bitrev | Opcode::Clz | Opcode::Cls | Opcode::Ctz => {
            let ty = ty.unwrap();
            let op_ty = match ty {
                I8 | I16 | I32 => I32,
                I64 | I128 => I64,
                _ => panic!("Unsupported type for Bitrev/Clz/Cls/Ctz"),
            };
            let bitop = match op {
                Opcode::Clz | Opcode::Cls | Opcode::Bitrev => BitOp::from((op, op_ty)),
                Opcode::Ctz => BitOp::from((Opcode::Bitrev, op_ty)),
                _ => unreachable!(),
            };

            if ty == I128 {
                let out_regs = get_output_reg(ctx, outputs[0]);
                let in_regs = put_input_in_regs(ctx, inputs[0]);

                let in_lo = in_regs.regs()[0];
                let in_hi = in_regs.regs()[1];
                let out_lo = out_regs.regs()[0];
                let out_hi = out_regs.regs()[1];

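                // For bitrev/ctz, reverse each 64-bit half with rbit and swap
                // the halves: the reversed low half becomes the new high half,
                // which reverses the full 128-bit value.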
                if op == Opcode::Bitrev || op == Opcode::Ctz {
                    ctx.emit(Inst::BitRR {
                        rd: out_hi,
                        rn: in_lo,
                        op: bitop,
                    });
                    ctx.emit(Inst::BitRR {
                        rd: out_lo,
                        rn: in_hi,
                        op: bitop,
                    });
                }

                if op == Opcode::Ctz {
                    // We have reduced the problem to a clz by reversing the inputs previously.
1078                     emit_clz_i128(ctx, out_regs.map(|r| r.to_reg()), out_regs);
1079                 } else if op == Opcode::Clz {
1080                     emit_clz_i128(ctx, in_regs, out_regs);
1081                 } else if op == Opcode::Cls {
1082                     // cls out_hi, in_hi
1083                     // cls out_lo, in_lo
1084                     // eon sign_eq, in_hi, in_lo
1085                     // lsr sign_eq, sign_eq, #63
1086                     // madd out_lo, out_lo, sign_eq, sign_eq
1087                     // cmp out_hi, #63
1088                     // csel out_lo, out_lo, xzr, eq
1089                     // add  out_lo, out_lo, out_hi
1090                     // mov  out_hi, 0
1091 
1092                     let sign_eq = ctx.alloc_tmp(I64).only_reg().unwrap();
1093                     let xzr = writable_zero_reg();
1094 
1095                     ctx.emit(Inst::BitRR {
1096                         rd: out_lo,
1097                         rn: in_lo,
1098                         op: bitop,
1099                     });
1100                     ctx.emit(Inst::BitRR {
1101                         rd: out_hi,
1102                         rn: in_hi,
1103                         op: bitop,
1104                     });
1105                     ctx.emit(Inst::AluRRR {
1106                         alu_op: ALUOp::EorNot64,
1107                         rd: sign_eq,
1108                         rn: in_hi,
1109                         rm: in_lo,
1110                     });
1111                     ctx.emit(Inst::AluRRImmShift {
1112                         alu_op: ALUOp::Lsr64,
1113                         rd: sign_eq,
1114                         rn: sign_eq.to_reg(),
1115                         immshift: ImmShift::maybe_from_u64(63).unwrap(),
1116                     });
1117                     ctx.emit(Inst::AluRRRR {
1118                         alu_op: ALUOp3::MAdd64,
1119                         rd: out_lo,
1120                         rn: out_lo.to_reg(),
1121                         rm: sign_eq.to_reg(),
1122                         ra: sign_eq.to_reg(),
1123                     });
1124                     ctx.emit(Inst::AluRRImm12 {
1125                         alu_op: ALUOp::SubS64,
1126                         rd: xzr,
1127                         rn: out_hi.to_reg(),
1128                         imm12: Imm12::maybe_from_u64(63).unwrap(),
1129                     });
1130                     ctx.emit(Inst::CSel {
1131                         cond: Cond::Eq,
1132                         rd: out_lo,
1133                         rn: out_lo.to_reg(),
1134                         rm: xzr.to_reg(),
1135                     });
1136                     ctx.emit(Inst::AluRRR {
1137                         alu_op: ALUOp::Add64,
1138                         rd: out_lo,
1139                         rn: out_lo.to_reg(),
1140                         rm: out_hi.to_reg(),
1141                     });
1142                     lower_constant_u64(ctx, out_hi, 0);
1143                 }
1144             } else {
1145                 let rd = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
1146                 let needs_zext = match op {
1147                     Opcode::Bitrev | Opcode::Ctz => false,
1148                     Opcode::Clz | Opcode::Cls => true,
1149                     _ => unreachable!(),
1150                 };
1151                 let narrow_mode = if needs_zext && ty_bits(ty) == 64 {
1152                     NarrowValueMode::ZeroExtend64
1153                 } else if needs_zext {
1154                     NarrowValueMode::ZeroExtend32
1155                 } else {
1156                     NarrowValueMode::None
1157                 };
1158                 let rn = put_input_in_reg(ctx, inputs[0], narrow_mode);
1159 
1160                 ctx.emit(Inst::BitRR { rd, rn, op: bitop });
1161 
1162                 // Both bitrev and ctz use a bit-reverse (rbit) instruction; ctz to reduce the problem
1163                 // to a clz, and bitrev as the main operation.
1164                 if op == Opcode::Bitrev || op == Opcode::Ctz {
1165                     // Reversing an n-bit value (n < 32) with a 32-bit bitrev instruction will place
1166                     // the reversed result in the highest n bits, so we need to shift them down into
1167                     // place.
1168                     let right_shift = match ty {
1169                         I8 => Some(24),
1170                         I16 => Some(16),
1171                         I32 => None,
1172                         I64 => None,
1173                         _ => panic!("Unsupported type for Bitrev"),
1174                     };
1175                     if let Some(s) = right_shift {
1176                         ctx.emit(Inst::AluRRImmShift {
1177                             alu_op: ALUOp::Lsr32,
1178                             rd,
1179                             rn: rd.to_reg(),
1180                             immshift: ImmShift::maybe_from_u64(s).unwrap(),
1181                         });
1182                     }
1183                 }
1184 
1185                 if op == Opcode::Ctz {
1186                     ctx.emit(Inst::BitRR {
1187                         op: BitOp::from((Opcode::Clz, op_ty)),
1188                         rd,
1189                         rn: rd.to_reg(),
1190                     });
1191                 }
1192             }
1193         }
1194 
1195         Opcode::Popcnt => {
1196             let ty = ty.unwrap();
1197 
1198             if ty.is_vector() {
1199                 let lane_type = ty.lane_type();
1200                 let rd = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
1201                 let rn = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None);
1202 
1203                 if lane_type != I8 {
1204                     return Err(CodegenError::Unsupported(format!(
1205                         "Unsupported SIMD vector lane type: {:?}",
1206                         lane_type
1207                     )));
1208                 }
1209 
1210                 ctx.emit(Inst::VecMisc {
1211                     op: VecMisc2::Cnt,
1212                     rd,
1213                     rn,
1214                     size: VectorSize::from_ty(ty),
1215                 });
1216             } else {
1217                 let out_regs = get_output_reg(ctx, outputs[0]);
1218                 let in_regs = put_input_in_regs(ctx, inputs[0]);
1219                 let size = if ty == I128 {
1220                     ScalarSize::Size64
1221                 } else {
1222                     ScalarSize::from_operand_size(OperandSize::from_ty(ty))
1223                 };
1224 
                let vec_size = if ty == I128 {
                    VectorSize::Size8x16
                } else {
                    VectorSize::Size8x8
                };

                let tmp = ctx.alloc_tmp(I8X16).only_reg().unwrap();

                // fmov tmp, in_lo
                // if ty == i128:
                //     mov tmp.d[1], in_hi
                //
                // cnt tmp.16b, tmp.16b / cnt tmp.8b, tmp.8b
                // addv tmp, tmp.16b / addv tmp, tmp.8b / addp tmp.8b, tmp.8b, tmp.8b / (no instruction for 8-bit inputs)
                //
                // umov out_lo, tmp.b[0]
                // if ty == i128:
                //     mov out_hi, 0

                ctx.emit(Inst::MovToFpu {
                    rd: tmp,
                    rn: in_regs.regs()[0],
                    size,
                });

                if ty == I128 {
                    ctx.emit(Inst::MovToVec {
                        rd: tmp,
                        rn: in_regs.regs()[1],
                        idx: 1,
                        size: VectorSize::Size64x2,
                    });
                }

                ctx.emit(Inst::VecMisc {
                    op: VecMisc2::Cnt,
                    rd: tmp,
                    rn: tmp.to_reg(),
                    size: vec_size,
                });

                match ScalarSize::from_ty(ty) {
                    ScalarSize::Size8 => {}
                    ScalarSize::Size16 => {
                        // ADDP is usually cheaper than ADDV.
                        ctx.emit(Inst::VecRRR {
                            alu_op: VecALUOp::Addp,
                            rd: tmp,
                            rn: tmp.to_reg(),
                            rm: tmp.to_reg(),
                            size: VectorSize::Size8x8,
                        });
                    }
                    ScalarSize::Size32 | ScalarSize::Size64 | ScalarSize::Size128 => {
                        ctx.emit(Inst::VecLanes {
                            op: VecLanesOp::Addv,
                            rd: tmp,
                            rn: tmp.to_reg(),
                            size: vec_size,
                        });
                    }
                }

                ctx.emit(Inst::MovFromVec {
                    rd: out_regs.regs()[0],
                    rn: tmp.to_reg(),
                    idx: 0,
                    size: VectorSize::Size8x16,
                });

                if ty == I128 {
                    lower_constant_u64(ctx, out_regs.regs()[1], 0);
                }
            }
        }

        Opcode::Load
        | Opcode::Uload8
        | Opcode::Sload8
        | Opcode::Uload16
        | Opcode::Sload16
        | Opcode::Uload32
        | Opcode::Sload32
        | Opcode::LoadComplex
        | Opcode::Uload8Complex
        | Opcode::Sload8Complex
        | Opcode::Uload16Complex
        | Opcode::Sload16Complex
        | Opcode::Uload32Complex
        | Opcode::Sload32Complex
        | Opcode::Sload8x8
        | Opcode::Uload8x8
        | Opcode::Sload16x4
        | Opcode::Uload16x4
        | Opcode::Sload32x2
        | Opcode::Uload32x2
        | Opcode::Uload8x8Complex
        | Opcode::Sload8x8Complex
        | Opcode::Uload16x4Complex
        | Opcode::Sload16x4Complex
        | Opcode::Uload32x2Complex
        | Opcode::Sload32x2Complex => {
            let sign_extend = match op {
                Opcode::Sload8
                | Opcode::Sload8Complex
                | Opcode::Sload16
                | Opcode::Sload16Complex
                | Opcode::Sload32
                | Opcode::Sload32Complex => true,
                _ => false,
            };
            let flags = ctx
                .memflags(insn)
                .expect("Load instruction should have memflags");

            let out_ty = ctx.output_ty(insn, 0);
            if out_ty == I128 {
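                // A 128-bit load becomes a single 64-bit load pair (LDP).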
                let off = ctx.data(insn).load_store_offset().unwrap();
                let mem = lower_pair_address(ctx, &inputs[..], off);
                let dst = get_output_reg(ctx, outputs[0]);
                ctx.emit(Inst::LoadP64 {
                    rt: dst.regs()[0],
                    rt2: dst.regs()[1],
                    mem,
                    flags,
                });
            } else {
                lower_load(
                    ctx,
                    insn,
                    &inputs[..],
                    outputs[0],
                    |ctx, dst, elem_ty, mem| {
                        let rd = dst.only_reg().unwrap();
                        let is_float = ty_has_float_or_vec_representation(elem_ty);
                        ctx.emit(match (ty_bits(elem_ty), sign_extend, is_float) {
                            (1, _, _) => Inst::ULoad8 { rd, mem, flags },
                            (8, false, _) => Inst::ULoad8 { rd, mem, flags },
                            (8, true, _) => Inst::SLoad8 { rd, mem, flags },
                            (16, false, _) => Inst::ULoad16 { rd, mem, flags },
                            (16, true, _) => Inst::SLoad16 { rd, mem, flags },
                            (32, false, false) => Inst::ULoad32 { rd, mem, flags },
                            (32, true, false) => Inst::SLoad32 { rd, mem, flags },
                            (32, _, true) => Inst::FpuLoad32 { rd, mem, flags },
                            (64, _, false) => Inst::ULoad64 { rd, mem, flags },
                            // Note that we treat some of the vector loads as scalar floating-point loads,
                            // which is correct in a little endian environment.
                            (64, _, true) => Inst::FpuLoad64 { rd, mem, flags },
                            (128, _, true) => Inst::FpuLoad128 { rd, mem, flags },
                            _ => panic!("Unsupported size in load"),
                        });

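                        // The widening vector loads (e.g. uload8x8) were
                        // performed above as plain 64-bit FP loads; lengthen
                        // each lane in place with SXTL/UXTL.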
                        let vec_extend = match op {
                            Opcode::Sload8x8 => Some(VecExtendOp::Sxtl8),
                            Opcode::Sload8x8Complex => Some(VecExtendOp::Sxtl8),
                            Opcode::Uload8x8 => Some(VecExtendOp::Uxtl8),
                            Opcode::Uload8x8Complex => Some(VecExtendOp::Uxtl8),
                            Opcode::Sload16x4 => Some(VecExtendOp::Sxtl16),
                            Opcode::Sload16x4Complex => Some(VecExtendOp::Sxtl16),
                            Opcode::Uload16x4 => Some(VecExtendOp::Uxtl16),
                            Opcode::Uload16x4Complex => Some(VecExtendOp::Uxtl16),
                            Opcode::Sload32x2 => Some(VecExtendOp::Sxtl32),
                            Opcode::Sload32x2Complex => Some(VecExtendOp::Sxtl32),
                            Opcode::Uload32x2 => Some(VecExtendOp::Uxtl32),
                            Opcode::Uload32x2Complex => Some(VecExtendOp::Uxtl32),
                            _ => None,
                        };

                        if let Some(t) = vec_extend {
                            let rd = dst.only_reg().unwrap();
                            ctx.emit(Inst::VecExtend {
                                t,
                                rd,
                                rn: rd.to_reg(),
                                high_half: false,
                            });
                        }
                    },
                );
            }
        }

        Opcode::Store
        | Opcode::Istore8
        | Opcode::Istore16
        | Opcode::Istore32
        | Opcode::StoreComplex
        | Opcode::Istore8Complex
        | Opcode::Istore16Complex
        | Opcode::Istore32Complex => {
            let off = ctx.data(insn).load_store_offset().unwrap();
            let elem_ty = match op {
                Opcode::Istore8 | Opcode::Istore8Complex => I8,
                Opcode::Istore16 | Opcode::Istore16Complex => I16,
                Opcode::Istore32 | Opcode::Istore32Complex => I32,
                Opcode::Store | Opcode::StoreComplex => ctx.input_ty(insn, 0),
                _ => unreachable!(),
            };
            let is_float = ty_has_float_or_vec_representation(elem_ty);
            let flags = ctx
                .memflags(insn)
                .expect("Store instruction should have memflags");

            let dst = put_input_in_regs(ctx, inputs[0]);

            if elem_ty == I128 {
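                // Likewise, a 128-bit store becomes a 64-bit store pair (STP).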
                let mem = lower_pair_address(ctx, &inputs[1..], off);
                ctx.emit(Inst::StoreP64 {
                    rt: dst.regs()[0],
                    rt2: dst.regs()[1],
                    mem,
                    flags,
                });
            } else {
                let rd = dst.only_reg().unwrap();
                let mem = lower_address(ctx, elem_ty, &inputs[1..], off);
                ctx.emit(match (ty_bits(elem_ty), is_float) {
                    (1, _) | (8, _) => Inst::Store8 { rd, mem, flags },
                    (16, _) => Inst::Store16 { rd, mem, flags },
                    (32, false) => Inst::Store32 { rd, mem, flags },
                    (32, true) => Inst::FpuStore32 { rd, mem, flags },
                    (64, false) => Inst::Store64 { rd, mem, flags },
                    (64, true) => Inst::FpuStore64 { rd, mem, flags },
                    (128, _) => Inst::FpuStore128 { rd, mem, flags },
                    _ => panic!("Unsupported size in store"),
                });
            }
        }

        Opcode::StackAddr => {
            let (stack_slot, offset) = match *ctx.data(insn) {
                InstructionData::StackLoad {
                    opcode: Opcode::StackAddr,
                    stack_slot,
                    offset,
                } => (stack_slot, offset),
                _ => unreachable!(),
            };
            let rd = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
            let offset: i32 = offset.into();
            let inst = ctx
                .abi()
                .stackslot_addr(stack_slot, u32::try_from(offset).unwrap(), rd);
            ctx.emit(inst);
        }

        Opcode::AtomicRmw => {
            let r_dst = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
            let mut r_addr = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None);
            let mut r_arg2 = put_input_in_reg(ctx, inputs[1], NarrowValueMode::None);
            let ty_access = ty.unwrap();
            assert!(is_valid_atomic_transaction_ty(ty_access));
            // Make sure that both args are in virtual regs, since in effect
            // we have to do a parallel copy to get them safely to the AtomicRMW input
            // regs, and that's not guaranteed safe if either is in a real reg.
            r_addr = ctx.ensure_in_vreg(r_addr, I64);
            r_arg2 = ctx.ensure_in_vreg(r_arg2, I64);
            // Move the args to the preordained AtomicRMW input regs
            ctx.emit(Inst::gen_move(Writable::from_reg(xreg(25)), r_addr, I64));
            ctx.emit(Inst::gen_move(Writable::from_reg(xreg(26)), r_arg2, I64));
            // Now the AtomicRMW insn itself
            let op = inst_common::AtomicRmwOp::from(ctx.data(insn).atomic_rmw_op().unwrap());
            ctx.emit(Inst::AtomicRMW { ty: ty_access, op });
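            // `Inst::AtomicRMW` is a pseudo-instruction that expands to a
            // load-exclusive/store-exclusive retry loop at emission time,
            // which is why its operands must live in fixed registers.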
            // And finally, copy the preordained AtomicRMW output reg to its destination.
            ctx.emit(Inst::gen_move(r_dst, xreg(27), I64));
            // Also, x24 and x28 are trashed.  `fn aarch64_get_regs` must mention that.
        }

        Opcode::AtomicCas => {
            let r_dst = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
            let mut r_addr = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None);
            let mut r_expected = put_input_in_reg(ctx, inputs[1], NarrowValueMode::None);
            let mut r_replacement = put_input_in_reg(ctx, inputs[2], NarrowValueMode::None);
            let ty_access = ty.unwrap();
            assert!(is_valid_atomic_transaction_ty(ty_access));

            if isa_flags.use_lse() {
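                // With LSE we can emit a single CAS: `rs` supplies the expected
                // value and receives the value actually loaded from memory,
                // hence the move of `r_expected` into the destination first.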
                ctx.emit(Inst::gen_move(r_dst, r_expected, ty_access));
                ctx.emit(Inst::AtomicCAS {
                    rs: r_dst,
                    rt: r_replacement,
                    rn: r_addr,
                    ty: ty_access,
                });
            } else {
                // This is very similar to, but not identical to, the AtomicRmw case.  Note
                // that the AtomicCASLoop sequence does its own masking, so we don't need to worry
                // about zero-extending narrow (I8/I16/I32) values here.
                // Make sure that all three args are in virtual regs.  See corresponding comment
                // for `Opcode::AtomicRmw` above.
                r_addr = ctx.ensure_in_vreg(r_addr, I64);
                r_expected = ctx.ensure_in_vreg(r_expected, I64);
                r_replacement = ctx.ensure_in_vreg(r_replacement, I64);
                // Move the args to the preordained AtomicCASLoop input regs
                ctx.emit(Inst::gen_move(Writable::from_reg(xreg(25)), r_addr, I64));
                ctx.emit(Inst::gen_move(
                    Writable::from_reg(xreg(26)),
                    r_expected,
                    I64,
                ));
                ctx.emit(Inst::gen_move(
                    Writable::from_reg(xreg(28)),
                    r_replacement,
                    I64,
                ));
                // Now the AtomicCASLoop itself, implemented in the normal way, with an LL-SC loop
                ctx.emit(Inst::AtomicCASLoop { ty: ty_access });
                // And finally, copy the preordained AtomicCASLoop output reg to its destination.
                ctx.emit(Inst::gen_move(r_dst, xreg(27), I64));
                // Also, x24 and x28 are trashed.  `fn aarch64_get_regs` must mention that.
            }
        }

        Opcode::AtomicLoad => {
            let rt = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
            emit_atomic_load(ctx, rt, insn);
        }

        Opcode::AtomicStore => {
            let rt = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None);
            let rn = put_input_in_reg(ctx, inputs[1], NarrowValueMode::None);
            let access_ty = ctx.input_ty(insn, 0);
            assert!(is_valid_atomic_transaction_ty(access_ty));
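            // `StoreRelease` lowers to an STLR: a store with release semantics.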
            ctx.emit(Inst::StoreRelease { access_ty, rt, rn });
        }

        Opcode::Fence => {
            ctx.emit(Inst::Fence {});
        }

        Opcode::StackLoad | Opcode::StackStore => {
            panic!("Direct stack memory access not supported; should not be used by Wasm");
        }

        Opcode::HeapAddr => {
            panic!("heap_addr should have been removed by legalization!");
        }

        Opcode::TableAddr => {
            panic!("table_addr should have been removed by legalization!");
        }

        Opcode::Nop => {
            // Nothing.
        }

        Opcode::Select => {
            let flag_input = inputs[0];
            let cond = if let Some(icmp_insn) =
                maybe_input_insn_via_conv(ctx, flag_input, Opcode::Icmp, Opcode::Bint)
            {
                let condcode = ctx.data(icmp_insn).cond_code().unwrap();
                lower_icmp(ctx, icmp_insn, condcode, IcmpOutput::CondCode)?.unwrap_cond()
            } else if let Some(fcmp_insn) =
                maybe_input_insn_via_conv(ctx, flag_input, Opcode::Fcmp, Opcode::Bint)
            {
                let condcode = ctx.data(fcmp_insn).fp_cond_code().unwrap();
                let cond = lower_fp_condcode(condcode);
                lower_fcmp_or_ffcmp_to_flags(ctx, fcmp_insn);
                cond
            } else {
                let (cmp_op, narrow_mode) = if ty_bits(ctx.input_ty(insn, 0)) > 32 {
                    (ALUOp::SubS64, NarrowValueMode::ZeroExtend64)
                } else {
                    (ALUOp::SubS32, NarrowValueMode::ZeroExtend32)
                };

                let rcond = put_input_in_reg(ctx, inputs[0], narrow_mode);
                // cmp rcond, #0
                ctx.emit(Inst::AluRRR {
                    alu_op: cmp_op,
                    rd: writable_zero_reg(),
                    rn: rcond,
                    rm: zero_reg(),
                });
                Cond::Ne
            };

            // csel.cond rd, rn, rm
            let ty = ctx.output_ty(insn, 0);
            let bits = ty_bits(ty);
            let is_float = ty_has_float_or_vec_representation(ty);

            let dst = get_output_reg(ctx, outputs[0]);
            let lhs = put_input_in_regs(ctx, inputs[1]);
            let rhs = put_input_in_regs(ctx, inputs[2]);

            let rd = dst.regs()[0];
            let rn = lhs.regs()[0];
            let rm = rhs.regs()[0];

            match (is_float, bits) {
                (true, 32) => ctx.emit(Inst::FpuCSel32 { cond, rd, rn, rm }),
                (true, 64) => ctx.emit(Inst::FpuCSel64 { cond, rd, rn, rm }),
                (true, 128) => ctx.emit(Inst::VecCSel { cond, rd, rn, rm }),
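                // I128: select each 64-bit half with the same condition.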
                (false, 128) => {
                    ctx.emit(Inst::CSel {
                        cond,
                        rd: dst.regs()[0],
                        rn: lhs.regs()[0],
                        rm: rhs.regs()[0],
                    });
                    ctx.emit(Inst::CSel {
                        cond,
                        rd: dst.regs()[1],
                        rn: lhs.regs()[1],
                        rm: rhs.regs()[1],
                    });
                }
                (_, _) => ctx.emit(Inst::CSel { cond, rd, rn, rm }),
            }
        }

        Opcode::Selectif | Opcode::SelectifSpectreGuard => {
            let condcode = ctx.data(insn).cond_code().unwrap();
            // Verification ensures that the input is always a
            // single-def ifcmp.
            let ifcmp_insn = maybe_input_insn(ctx, inputs[0], Opcode::Ifcmp).unwrap();
            let cond = lower_icmp(ctx, ifcmp_insn, condcode, IcmpOutput::CondCode)?.unwrap_cond();

            // csel.COND rd, rn, rm
            let rd = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
            let rn = put_input_in_reg(ctx, inputs[1], NarrowValueMode::None);
            let rm = put_input_in_reg(ctx, inputs[2], NarrowValueMode::None);
            let ty = ctx.output_ty(insn, 0);
            let bits = ty_bits(ty);
            let is_float = ty_has_float_or_vec_representation(ty);
            if is_float && bits == 32 {
                ctx.emit(Inst::FpuCSel32 { cond, rd, rn, rm });
            } else if is_float && bits == 64 {
                ctx.emit(Inst::FpuCSel64 { cond, rd, rn, rm });
            } else {
                ctx.emit(Inst::CSel { cond, rd, rn, rm });
            }
        }

        Opcode::Bitselect | Opcode::Vselect => {
            let ty = ty.unwrap();
            if !ty.is_vector() {
                debug_assert_ne!(Opcode::Vselect, op);
                let tmp = ctx.alloc_tmp(I64).only_reg().unwrap();
                let rd = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
                let rcond = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None);
                let rn = put_input_in_reg(ctx, inputs[1], NarrowValueMode::None);
                let rm = put_input_in_reg(ctx, inputs[2], NarrowValueMode::None);
                // AND rTmp, rn, rcond
                ctx.emit(Inst::AluRRR {
                    alu_op: ALUOp::And64,
                    rd: tmp,
                    rn,
                    rm: rcond,
                });
                // BIC rd, rm, rcond
                ctx.emit(Inst::AluRRR {
                    alu_op: ALUOp::AndNot64,
                    rd,
                    rn: rm,
                    rm: rcond,
                });
                // ORR rd, rd, rTmp
                ctx.emit(Inst::AluRRR {
                    alu_op: ALUOp::Orr64,
                    rd,
                    rn: rd.to_reg(),
                    rm: tmp.to_reg(),
                });
            } else {
                let rcond = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None);
                let rn = put_input_in_reg(ctx, inputs[1], NarrowValueMode::None);
                let rm = put_input_in_reg(ctx, inputs[2], NarrowValueMode::None);
                let rd = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
                ctx.emit(Inst::gen_move(rd, rcond, ty));

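                // BSL keeps, for each bit, rn where the condition mask (now in
                // rd) is set and rm where it is clear.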
                ctx.emit(Inst::VecRRR {
                    alu_op: VecALUOp::Bsl,
                    rd,
                    rn,
                    rm,
                    size: VectorSize::from_ty(ty),
                });
            }
        }

        Opcode::Trueif => {
            let condcode = ctx.data(insn).cond_code().unwrap();
            // Verification ensures that the input is always a
            // single-def ifcmp.
            let ifcmp_insn = maybe_input_insn(ctx, inputs[0], Opcode::Ifcmp).unwrap();
            let rd = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
            lower_icmp(ctx, ifcmp_insn, condcode, IcmpOutput::Register(rd))?;
        }

        Opcode::Trueff => {
            let condcode = ctx.data(insn).fp_cond_code().unwrap();
            let cond = lower_fp_condcode(condcode);
            let ffcmp_insn = maybe_input_insn(ctx, inputs[0], Opcode::Ffcmp).unwrap();
            lower_fcmp_or_ffcmp_to_flags(ctx, ffcmp_insn);
            let rd = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
            materialize_bool_result(ctx, insn, rd, cond);
        }

        Opcode::IsNull | Opcode::IsInvalid => {
            // Null references are represented by the constant value 0; invalid references are
            // represented by the constant value -1. See `define_reftypes()` in
            // `meta/src/isa/x86/encodings.rs` to confirm.
            let rd = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
            let rn = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None);
            let ty = ctx.input_ty(insn, 0);
            let (alu_op, const_value) = match op {
                Opcode::IsNull => {
                    // cmp rn, #0
                    (choose_32_64(ty, ALUOp::SubS32, ALUOp::SubS64), 0)
                }
                Opcode::IsInvalid => {
                    // cmn rn, #1
                    (choose_32_64(ty, ALUOp::AddS32, ALUOp::AddS64), 1)
                }
                _ => unreachable!(),
            };
            let const_value = ResultRSEImm12::Imm12(Imm12::maybe_from_u64(const_value).unwrap());
            ctx.emit(alu_inst_imm12(alu_op, writable_zero_reg(), rn, const_value));
            materialize_bool_result(ctx, insn, rd, Cond::Eq);
        }

        Opcode::Copy => {
            let rd = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
            let rn = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None);
            let ty = ctx.input_ty(insn, 0);
            ctx.emit(Inst::gen_move(rd, rn, ty));
        }

        Opcode::Breduce | Opcode::Ireduce => {
            // Smaller integers/booleans are stored with high-order bits
            // undefined, so we can simply do a copy.
            let rn = put_input_in_regs(ctx, inputs[0]).regs()[0];
            let rd = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
            let ty = ctx.input_ty(insn, 0);
            ctx.emit(Inst::gen_move(rd, rn, ty));
        }

        Opcode::Bextend | Opcode::Bmask => {
            // Bextend and Bmask both simply sign-extend. This works for:
            // - Bextend, because booleans are stored as 0 / -1, so we
            //   sign-extend the -1 to a -1 in the wider width.
            // - Bmask, because the resulting integer mask value must be
            //   all-ones (-1) if the argument is true.

            let from_ty = ctx.input_ty(insn, 0);
            let to_ty = ctx.output_ty(insn, 0);
            let from_bits = ty_bits(from_ty);
            let to_bits = ty_bits(to_ty);

            assert!(
                from_bits <= 64 && to_bits <= 64,
                "Vector Bextend not supported yet"
            );
            assert!(from_bits <= to_bits);

            if from_bits == to_bits {
                // Nothing.
            } else {
                let rn = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None);
                let rd = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
                let to_bits = if to_bits == 64 {
                    64
                } else {
                    assert!(to_bits <= 32);
                    32
                };
                let from_bits = from_bits as u8;
                ctx.emit(Inst::Extend {
                    rd,
                    rn,
                    signed: true,
                    from_bits,
                    to_bits,
                });
            }
        }

        Opcode::Bint => {
            // Booleans are stored as all-zeroes (0) or all-ones (-1). We AND
            // out the LSB to give a 0 / 1-valued integer result.
            let rn = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None);
            let rd = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
            let output_bits = ty_bits(ctx.output_ty(insn, 0));

            let (imm_ty, alu_op) = if output_bits > 32 {
                (I64, ALUOp::And64)
            } else {
                (I32, ALUOp::And32)
            };
            ctx.emit(Inst::AluRRImmLogic {
                alu_op,
                rd,
                rn,
                imml: ImmLogic::maybe_from_u64(1, imm_ty).unwrap(),
            });
        }

        Opcode::Bitcast => {
            let rd = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
            let ity = ctx.input_ty(insn, 0);
            let oty = ctx.output_ty(insn, 0);
            let ity_bits = ty_bits(ity);
            let ity_vec_reg = ty_has_float_or_vec_representation(ity);
            let oty_bits = ty_bits(oty);
            let oty_vec_reg = ty_has_float_or_vec_representation(oty);

            debug_assert_eq!(ity_bits, oty_bits);

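            // Same-bank bitcasts are plain register moves; cross-bank casts
            // use fmov (GPR to FPR) or umov (FPR to GPR).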
            match (ity_vec_reg, oty_vec_reg) {
                (true, true) => {
                    let narrow_mode = if ity_bits <= 32 {
                        NarrowValueMode::ZeroExtend32
                    } else {
                        NarrowValueMode::ZeroExtend64
                    };
                    let rm = put_input_in_reg(ctx, inputs[0], narrow_mode);
                    ctx.emit(Inst::gen_move(rd, rm, oty));
                }
                (false, false) => {
                    let rm = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None);
                    ctx.emit(Inst::gen_move(rd, rm, oty));
                }
                (false, true) => {
                    let rn = put_input_in_reg(ctx, inputs[0], NarrowValueMode::ZeroExtend64);
                    ctx.emit(Inst::MovToFpu {
                        rd,
                        rn,
                        size: ScalarSize::Size64,
                    });
                }
                (true, false) => {
                    let rn = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None);
                    let size = VectorSize::from_lane_size(ScalarSize::from_bits(oty_bits), true);

                    ctx.emit(Inst::MovFromVec {
                        rd,
                        rn,
                        idx: 0,
                        size,
                    });
                }
            }
        }

        Opcode::FallthroughReturn | Opcode::Return => {
            for (i, input) in inputs.iter().enumerate() {
                // N.B.: according to the AArch64 ABI, the top bits of a register
                // (above the bits for the value's type) are undefined, so we
                // need not extend the return values.
                let src_regs = put_input_in_regs(ctx, *input);
                let retval_regs = ctx.retval(i);

                assert_eq!(src_regs.len(), retval_regs.len());
                let ty = ctx.input_ty(insn, i);
                let (_, tys) = Inst::rc_for_type(ty)?;

                src_regs
                    .regs()
                    .iter()
                    .zip(retval_regs.regs().iter())
                    .zip(tys.iter())
                    .for_each(|((&src, &dst), &ty)| {
                        ctx.emit(Inst::gen_move(dst, src, ty));
                    });
            }
            // N.B.: the Ret itself is generated by the ABI.
        }

        Opcode::Ifcmp | Opcode::Ffcmp => {
            // An Ifcmp/Ffcmp is always consumed directly by a brif/brff or
            // trueif/trueff instruction, so it never appears as an isel root.
            // This holds as long as the IR uses the Ifcmp/Ffcmp from the same
            // block or a dominating one; in other words, the flags cannot pass
            // through a BB param (phi). The flags pass of the verifier ensures
            // this.
            panic!("Should never reach ifcmp as isel root!");
        }

        Opcode::Icmp => {
            let condcode = ctx.data(insn).cond_code().unwrap();
            let rd = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
            lower_icmp(ctx, insn, condcode, IcmpOutput::Register(rd))?;
        }

        Opcode::Fcmp => {
            let condcode = ctx.data(insn).fp_cond_code().unwrap();
            let cond = lower_fp_condcode(condcode);
            let ty = ctx.input_ty(insn, 0);
            let rn = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None);
            let rm = put_input_in_reg(ctx, inputs[1], NarrowValueMode::None);
            let rd = get_output_reg(ctx, outputs[0]).only_reg().unwrap();

            if !ty.is_vector() {
                match ty_bits(ty) {
                    32 => {
                        ctx.emit(Inst::FpuCmp32 { rn, rm });
                    }
                    64 => {
                        ctx.emit(Inst::FpuCmp64 { rn, rm });
                    }
                    _ => panic!("Bad float size"),
                }
                materialize_bool_result(ctx, insn, rd, cond);
            } else {
                lower_vector_compare(ctx, rd, rn, rm, ty, cond)?;
            }
        }

        Opcode::JumpTableEntry | Opcode::JumpTableBase => {
            panic!("Should not appear: we handle BrTable directly");
        }

        Opcode::Debugtrap => {
            ctx.emit(Inst::Brk);
        }

        Opcode::Trap | Opcode::ResumableTrap => {
            let trap_code = ctx.data(insn).trap_code().unwrap();
            ctx.emit_safepoint(Inst::Udf { trap_code });
        }

        Opcode::Trapif | Opcode::Trapff => {
            let trap_code = ctx.data(insn).trap_code().unwrap();

            let cond = if maybe_input_insn(ctx, inputs[0], Opcode::IaddIfcout).is_some() {
                let condcode = ctx.data(insn).cond_code().unwrap();
                let cond = lower_condcode(condcode);
                // The flags must not have been clobbered by any other
                // instruction between the iadd_ifcout and this instruction, as
                // verified by the CLIF validator; so we can simply use the
                // flags here.
                cond
            } else if op == Opcode::Trapif {
                let condcode = ctx.data(insn).cond_code().unwrap();

                // Verification ensures that the input is always a single-def ifcmp.
                let ifcmp_insn = maybe_input_insn(ctx, inputs[0], Opcode::Ifcmp).unwrap();
                lower_icmp(ctx, ifcmp_insn, condcode, IcmpOutput::CondCode)?.unwrap_cond()
            } else {
                let condcode = ctx.data(insn).fp_cond_code().unwrap();
                let cond = lower_fp_condcode(condcode);

                // Verification ensures that the input is always a
                // single-def ffcmp.
                let ffcmp_insn = maybe_input_insn(ctx, inputs[0], Opcode::Ffcmp).unwrap();
                lower_fcmp_or_ffcmp_to_flags(ctx, ffcmp_insn);
                cond
            };

            ctx.emit_safepoint(Inst::TrapIf {
                trap_code,
                kind: CondBrKind::Cond(cond),
            });
        }

        Opcode::Safepoint => {
            panic!("safepoint instructions not used by new backend's safepoints!");
        }

        Opcode::Trapz | Opcode::Trapnz | Opcode::ResumableTrapnz => {
            panic!("trapz / trapnz / resumable_trapnz should have been removed by legalization!");
        }

        Opcode::FuncAddr => {
            let rd = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
            let (extname, _) = ctx.call_target(insn).unwrap();
            let extname = extname.clone();
            ctx.emit(Inst::LoadExtName {
                rd,
                name: Box::new(extname),
                offset: 0,
            });
        }

        Opcode::GlobalValue => {
            panic!("global_value should have been removed by legalization!");
        }

        Opcode::SymbolValue => {
            let rd = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
            let (extname, _, offset) = ctx.symbol_value(insn).unwrap();
            let extname = extname.clone();
            ctx.emit(Inst::LoadExtName {
                rd,
                name: Box::new(extname),
                offset,
            });
        }

        Opcode::Call | Opcode::CallIndirect => {
            let caller_conv = ctx.abi().call_conv();
            let (mut abi, inputs) = match op {
                Opcode::Call => {
                    let (extname, dist) = ctx.call_target(insn).unwrap();
                    let extname = extname.clone();
                    let sig = ctx.call_sig(insn).unwrap();
                    assert!(inputs.len() == sig.params.len());
                    assert!(outputs.len() == sig.returns.len());
                    (
                        AArch64ABICaller::from_func(sig, &extname, dist, caller_conv, flags)?,
                        &inputs[..],
                    )
                }
                Opcode::CallIndirect => {
                    let ptr = put_input_in_reg(ctx, inputs[0], NarrowValueMode::ZeroExtend64);
                    let sig = ctx.call_sig(insn).unwrap();
                    assert!(inputs.len() - 1 == sig.params.len());
                    assert!(outputs.len() == sig.returns.len());
                    (
                        AArch64ABICaller::from_ptr(sig, ptr, op, caller_conv, flags)?,
                        &inputs[1..],
                    )
                }
                _ => unreachable!(),
            };

            abi.emit_stack_pre_adjust(ctx);
            assert!(inputs.len() == abi.num_args());
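            // Copy each argument into its ABI-assigned location, in the order
            // the ABI implementation requests.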
            for i in abi.get_copy_to_arg_order() {
                let input = inputs[i];
                let arg_regs = put_input_in_regs(ctx, input);
                abi.emit_copy_regs_to_arg(ctx, i, arg_regs);
            }
            abi.emit_call(ctx);
            for (i, output) in outputs.iter().enumerate() {
                let retval_regs = get_output_reg(ctx, *output);
                abi.emit_copy_retval_to_regs(ctx, i, retval_regs);
            }
            abi.emit_stack_post_adjust(ctx);
        }

        Opcode::GetPinnedReg => {
            let rd = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
            ctx.emit(Inst::gen_move(rd, xreg(PINNED_REG), I64));
        }

        Opcode::SetPinnedReg => {
            let rm = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None);
            ctx.emit(Inst::gen_move(writable_xreg(PINNED_REG), rm, I64));
        }

        Opcode::Spill
        | Opcode::Fill
        | Opcode::FillNop
        | Opcode::Regmove
        | Opcode::CopySpecial
        | Opcode::CopyToSsa
        | Opcode::CopyNop
        | Opcode::AdjustSpDown
        | Opcode::AdjustSpUpImm
        | Opcode::AdjustSpDownImm
        | Opcode::IfcmpSp
        | Opcode::Regspill
        | Opcode::Regfill => {
            panic!("Unused opcode should not be encountered.");
        }

        Opcode::Jump
        | Opcode::Fallthrough
        | Opcode::Brz
        | Opcode::Brnz
        | Opcode::BrIcmp
        | Opcode::Brif
        | Opcode::Brff
        | Opcode::IndirectJumpTableBr
        | Opcode::BrTable => {
            panic!("Branch opcode reached non-branch lowering logic!");
        }

        Opcode::Vconst => {
            let value = const_param_to_u128(ctx, insn).expect("Invalid immediate bytes");
            let rd = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
            lower_constant_f128(ctx, rd, value);
        }

        Opcode::RawBitcast => {
            let rm = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None);
            let rd = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
            let ty = ctx.input_ty(insn, 0);
            ctx.emit(Inst::gen_move(rd, rm, ty));
        }

        Opcode::Extractlane => {
            if let InstructionData::BinaryImm8 { imm, .. } = ctx.data(insn) {
                let idx = *imm;
                let rd = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
                let rn = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None);
                let size = VectorSize::from_ty(ctx.input_ty(insn, 0));
                let ty = ty.unwrap();

                if ty_has_int_representation(ty) {
                    ctx.emit(Inst::MovFromVec { rd, rn, idx, size });
                // Plain moves are faster on some processors.
                } else if idx == 0 {
                    ctx.emit(Inst::gen_move(rd, rn, ty));
                } else {
                    ctx.emit(Inst::FpuMoveFromVec { rd, rn, idx, size });
                }
            } else {
                unreachable!();
            }
        }

        Opcode::Insertlane => {
            let idx = if let InstructionData::TernaryImm8 { imm, .. } = ctx.data(insn) {
                *imm
            } else {
                unreachable!();
            };
            let input_ty = ctx.input_ty(insn, 1);
            let rd = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
            let rm = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None);
            let rn = put_input_in_reg(ctx, inputs[1], NarrowValueMode::None);
            let ty = ty.unwrap();
            let size = VectorSize::from_ty(ty);

            ctx.emit(Inst::gen_move(rd, rm, ty));
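            // The vector operand is now in the destination; overwrite only the
            // selected lane with the scalar value.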

            if ty_has_int_representation(input_ty) {
                ctx.emit(Inst::MovToVec { rd, rn, idx, size });
            } else {
                ctx.emit(Inst::VecMovElement {
                    rd,
                    rn,
                    dest_idx: idx,
                    src_idx: 0,
                    size,
                });
            }
        }

        Opcode::Splat => {
            let rd = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
            let size = VectorSize::from_ty(ty.unwrap());

            if let Some((_, insn)) = maybe_input_insn_multi(
                ctx,
                inputs[0],
                &[
                    Opcode::Bconst,
                    Opcode::F32const,
                    Opcode::F64const,
                    Opcode::Iconst,
                ],
            ) {
                lower_splat_const(ctx, rd, ctx.get_constant(insn).unwrap(), size);
            } else if let Some(insn) =
                maybe_input_insn_via_conv(ctx, inputs[0], Opcode::Iconst, Opcode::Ireduce)
            {
                lower_splat_const(ctx, rd, ctx.get_constant(insn).unwrap(), size);
            } else if let Some(insn) =
                maybe_input_insn_via_conv(ctx, inputs[0], Opcode::Bconst, Opcode::Breduce)
            {
                lower_splat_const(ctx, rd, ctx.get_constant(insn).unwrap(), size);
            } else if let Some((_, insn)) = maybe_input_insn_multi(
                ctx,
                inputs[0],
                &[
                    Opcode::Uload8,
                    Opcode::Sload8,
                    Opcode::Uload16,
                    Opcode::Sload16,
                    Opcode::Uload32,
                    Opcode::Sload32,
                    Opcode::Load,
                ],
            ) {
                ctx.sink_inst(insn);
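                // The load has been sunk into this instruction, so we can use
                // a replicating load (LD1R) straight from memory.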
                let load_inputs = insn_inputs(ctx, insn);
                let load_outputs = insn_outputs(ctx, insn);
                lower_load(
                    ctx,
                    insn,
                    &load_inputs[..],
                    load_outputs[0],
                    |ctx, _rd, _elem_ty, mem| {
                        let tmp = ctx.alloc_tmp(I64).only_reg().unwrap();
                        let (addr, addr_inst) = Inst::gen_load_addr(tmp, mem);
                        if let Some(addr_inst) = addr_inst {
                            ctx.emit(addr_inst);
                        }
                        ctx.emit(Inst::VecLoadReplicate { rd, rn: addr, size });
                    },
                );
            } else {
                let input_ty = ctx.input_ty(insn, 0);
                let rn = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None);
                let inst = if ty_has_int_representation(input_ty) {
                    Inst::VecDup { rd, rn, size }
                } else {
                    Inst::VecDupFromFpu { rd, rn, size }
                };

                ctx.emit(inst);
            }
        }

        Opcode::ScalarToVector => {
            let rn = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None);
            let rd = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
            let input_ty = ctx.input_ty(insn, 0);
            if (input_ty == I32 && ty.unwrap() == I32X4)
                || (input_ty == I64 && ty.unwrap() == I64X2)
            {
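                // fmov (MovToFpu) zeroes the upper lanes of the destination,
                // which is exactly the semantics scalar_to_vector requires.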
                ctx.emit(Inst::MovToFpu {
                    rd,
                    rn,
                    size: ScalarSize::from_ty(input_ty),
                });
            } else {
                return Err(CodegenError::Unsupported(format!(
                    "ScalarToVector: unsupported types {:?} -> {:?}",
                    input_ty, ty
                )));
            }
        }

        Opcode::VallTrue if ctx.input_ty(insn, 0) == I64X2 => {
            let rd = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
            let rm = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None);
            let tmp = ctx.alloc_tmp(I64X2).only_reg().unwrap();

            // cmeq vtmp.2d, vm.2d, #0
            // addp dtmp, vtmp.2d
            // fcmp dtmp, dtmp
            // cset xd, eq
            //
            // Note that after the ADDP the value of the temporary register will
            // be either 0 when all input elements are true, i.e. non-zero, or a
            // NaN otherwise (either -1 or -2 when represented as an integer);
            // NaNs are the only floating-point numbers that compare unequal to
            // themselves.

            ctx.emit(Inst::VecMisc {
                op: VecMisc2::Cmeq0,
                rd: tmp,
                rn: rm,
                size: VectorSize::Size64x2,
            });
            ctx.emit(Inst::VecRRPair {
                op: VecPairOp::Addp,
                rd: tmp,
                rn: tmp.to_reg(),
            });
            ctx.emit(Inst::FpuCmp64 {
                rn: tmp.to_reg(),
                rm: tmp.to_reg(),
            });
            materialize_bool_result(ctx, insn, rd, Cond::Eq);
        }

        Opcode::VanyTrue | Opcode::VallTrue => {
            let rd = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
            let rm = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None);
            let src_ty = ctx.input_ty(insn, 0);
            let tmp = ctx.alloc_tmp(src_ty).only_reg().unwrap();

            // This operation is implemented by using umaxp or uminv to
            // create a scalar value, which is then compared against zero.
            //
            // umaxp vn.16b, vm.16b, vm.16b / uminv bn, vm.16b
            // mov xm, vn.d[0]
            // cmp xm, #0
            // cset xm, ne

            let size = VectorSize::from_ty(ctx.input_ty(insn, 0));

            if op == Opcode::VanyTrue {
                ctx.emit(Inst::VecRRR {
                    alu_op: VecALUOp::Umaxp,
                    rd: tmp,
                    rn: rm,
                    rm,
                    size,
                });
            } else {
                ctx.emit(Inst::VecLanes {
                    op: VecLanesOp::Uminv,
                    rd: tmp,
                    rn: rm,
                    size,
                });
            };

            ctx.emit(Inst::MovFromVec {
                rd,
                rn: tmp.to_reg(),
                idx: 0,
                size: VectorSize::Size64x2,
            });

            ctx.emit(Inst::AluRRImm12 {
                alu_op: ALUOp::SubS64,
                rd: writable_zero_reg(),
                rn: rd.to_reg(),
                imm12: Imm12::zero(),
            });

            materialize_bool_result(ctx, insn, rd, Cond::Ne);
        }

        Opcode::VhighBits => {
            let dst_r = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
            let src_v = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None);
            let ty = ctx.input_ty(insn, 0);
            // The first three sequences use one integer temporary and two vector
            // temporaries; the I64X2 case needs only the integer temporary.  The
            // shift is done early so as to give the register allocator the
            // possibility of using the same reg for `tmp_v1` and `src_v` in the
            // case that this is the last use of `src_v`.  See
            // https://github.com/WebAssembly/simd/pull/201 for the background and
            // derivation of these sequences.  Alternative sequences are discussed
            // in https://github.com/bytecodealliance/wasmtime/issues/2296,
            // although they are not used here.
            let tmp_r0 = ctx.alloc_tmp(I64).only_reg().unwrap();
            let tmp_v0 = ctx.alloc_tmp(I8X16).only_reg().unwrap();
            let tmp_v1 = ctx.alloc_tmp(I8X16).only_reg().unwrap();
            match ty {
                I8X16 => {
                    // sshr  tmp_v1.16b, src_v.16b, #7
                    // mov   tmp_r0, #0x0201
                    // movk  tmp_r0, #0x0804, lsl 16
                    // movk  tmp_r0, #0x2010, lsl 32
                    // movk  tmp_r0, #0x8040, lsl 48
                    // dup   tmp_v0.2d, tmp_r0
                    // and   tmp_v1.16b, tmp_v1.16b, tmp_v0.16b
                    // ext   tmp_v0.16b, tmp_v1.16b, tmp_v1.16b, #8
                    // zip1  tmp_v0.16b, tmp_v1.16b, tmp_v0.16b
                    // addv  tmp_v0h, tmp_v0.8h
                    // mov   dst_r, tmp_v0.h[0]
                    ctx.emit(Inst::VecShiftImm {
                        op: VecShiftImmOp::Sshr,
                        rd: tmp_v1,
                        rn: src_v,
                        size: VectorSize::Size8x16,
                        imm: 7,
                    });
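                    // lower_splat_const materializes the per-lane bit masks,
                    // i.e. the mov/movk/dup sequence sketched above.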
2361                     lower_splat_const(ctx, tmp_v0, 0x8040201008040201u64, VectorSize::Size64x2);
2362                     ctx.emit(Inst::VecRRR {
2363                         alu_op: VecALUOp::And,
2364                         rd: tmp_v1,
2365                         rn: tmp_v1.to_reg(),
2366                         rm: tmp_v0.to_reg(),
2367                         size: VectorSize::Size8x16,
2368                     });
2369                     ctx.emit(Inst::VecExtract {
2370                         rd: tmp_v0,
2371                         rn: tmp_v1.to_reg(),
2372                         rm: tmp_v1.to_reg(),
2373                         imm4: 8,
2374                     });
2375                     ctx.emit(Inst::VecRRR {
2376                         alu_op: VecALUOp::Zip1,
2377                         rd: tmp_v0,
2378                         rn: tmp_v1.to_reg(),
2379                         rm: tmp_v0.to_reg(),
2380                         size: VectorSize::Size8x16,
2381                     });
2382                     ctx.emit(Inst::VecLanes {
2383                         op: VecLanesOp::Addv,
2384                         rd: tmp_v0,
2385                         rn: tmp_v0.to_reg(),
2386                         size: VectorSize::Size16x8,
2387                     });
2388                     ctx.emit(Inst::MovFromVec {
2389                         rd: dst_r,
2390                         rn: tmp_v0.to_reg(),
2391                         idx: 0,
2392                         size: VectorSize::Size16x8,
2393                     });
2394                 }
2395                 I16X8 => {
2396                     // sshr  tmp_v1.8h, src_v.8h, #15
2397                     // mov   tmp_r0, #0x1
2398                     // movk  tmp_r0, #0x2, lsl 16
2399                     // movk  tmp_r0, #0x4, lsl 32
2400                     // movk  tmp_r0, #0x8, lsl 48
2401                     // dup   tmp_v0.2d, tmp_r0
2402                     // shl   tmp_r0, tmp_r0, #4
2403                     // mov   tmp_v0.d[1], tmp_r0
2404                     // and   tmp_v0.16b, tmp_v1.16b, tmp_v0.16b
2405                     // addv  tmp_v0h, tmp_v0.8h
2406                     // mov   dst_r, tmp_v0.h[0]
2407                     ctx.emit(Inst::VecShiftImm {
2408                         op: VecShiftImmOp::Sshr,
2409                         rd: tmp_v1,
2410                         rn: src_v,
2411                         size: VectorSize::Size16x8,
2412                         imm: 15,
2413                     });
2414                     lower_constant_u64(ctx, tmp_r0, 0x0008000400020001u64);
2415                     ctx.emit(Inst::VecDup {
2416                         rd: tmp_v0,
2417                         rn: tmp_r0.to_reg(),
2418                         size: VectorSize::Size64x2,
2419                     });
2420                     ctx.emit(Inst::AluRRImmShift {
2421                         alu_op: ALUOp::Lsl64,
2422                         rd: tmp_r0,
2423                         rn: tmp_r0.to_reg(),
2424                         immshift: ImmShift { imm: 4 },
2425                     });
2426                     ctx.emit(Inst::MovToVec {
2427                         rd: tmp_v0,
2428                         rn: tmp_r0.to_reg(),
2429                         idx: 1,
2430                         size: VectorSize::Size64x2,
2431                     });
2432                     ctx.emit(Inst::VecRRR {
2433                         alu_op: VecALUOp::And,
2434                         rd: tmp_v0,
2435                         rn: tmp_v1.to_reg(),
2436                         rm: tmp_v0.to_reg(),
2437                         size: VectorSize::Size8x16,
2438                     });
2439                     ctx.emit(Inst::VecLanes {
2440                         op: VecLanesOp::Addv,
2441                         rd: tmp_v0,
2442                         rn: tmp_v0.to_reg(),
2443                         size: VectorSize::Size16x8,
2444                     });
2445                     ctx.emit(Inst::MovFromVec {
2446                         rd: dst_r,
2447                         rn: tmp_v0.to_reg(),
2448                         idx: 0,
2449                         size: VectorSize::Size16x8,
2450                     });
2451                 }
2452                 I32X4 => {
2453                     // sshr  tmp_v1.4s, src_v.4s, #31
2454                     // mov   tmp_r0, #0x1
2455                     // movk  tmp_r0, #0x2, lsl 32
2456                     // dup   tmp_v0.2d, tmp_r0
2457                     // shl   tmp_r0, tmp_r0, #2
2458                     // mov   tmp_v0.d[1], tmp_r0
2459                     // and   tmp_v0.16b, tmp_v1.16b, tmp_v0.16b
2460                     // addv  tmp_v0s, tmp_v0.4s
2461                     // mov   dst_r, tmp_v0.s[0]
2462                     ctx.emit(Inst::VecShiftImm {
2463                         op: VecShiftImmOp::Sshr,
2464                         rd: tmp_v1,
2465                         rn: src_v,
2466                         size: VectorSize::Size32x4,
2467                         imm: 31,
2468                     });
2469                     lower_constant_u64(ctx, tmp_r0, 0x0000000200000001u64);
                    ctx.emit(Inst::VecDup {
                        rd: tmp_v0,
                        rn: tmp_r0.to_reg(),
                        size: VectorSize::Size64x2,
                    });
                    ctx.emit(Inst::AluRRImmShift {
                        alu_op: ALUOp::Lsl64,
                        rd: tmp_r0,
                        rn: tmp_r0.to_reg(),
                        immshift: ImmShift { imm: 2 },
                    });
                    ctx.emit(Inst::MovToVec {
                        rd: tmp_v0,
                        rn: tmp_r0.to_reg(),
                        idx: 1,
                        size: VectorSize::Size64x2,
                    });
                    ctx.emit(Inst::VecRRR {
                        alu_op: VecALUOp::And,
                        rd: tmp_v0,
                        rn: tmp_v1.to_reg(),
                        rm: tmp_v0.to_reg(),
                        size: VectorSize::Size8x16,
                    });
                    ctx.emit(Inst::VecLanes {
                        op: VecLanesOp::Addv,
                        rd: tmp_v0,
                        rn: tmp_v0.to_reg(),
                        size: VectorSize::Size32x4,
                    });
                    ctx.emit(Inst::MovFromVec {
                        rd: dst_r,
                        rn: tmp_v0.to_reg(),
                        idx: 0,
                        size: VectorSize::Size32x4,
                    });
                }
                I64X2 => {
                    // mov dst_r, src_v.d[0]
                    // mov tmp_r0, src_v.d[1]
                    // lsr dst_r, dst_r, #63
                    // lsr tmp_r0, tmp_r0, #63
                    // add dst_r, dst_r, tmp_r0, lsl #1
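                    // Each lsr leaves a lane's sign bit in bit 0; the shifted
                    // add then packs lane 1's bit into bit 1, producing the
                    // 2-bit mask directly in dst_r with no vector temporary.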
                    ctx.emit(Inst::MovFromVec {
                        rd: dst_r,
                        rn: src_v,
                        idx: 0,
                        size: VectorSize::Size64x2,
                    });
                    ctx.emit(Inst::MovFromVec {
                        rd: tmp_r0,
                        rn: src_v,
                        idx: 1,
                        size: VectorSize::Size64x2,
                    });
                    ctx.emit(Inst::AluRRImmShift {
                        alu_op: ALUOp::Lsr64,
                        rd: dst_r,
                        rn: dst_r.to_reg(),
                        immshift: ImmShift::maybe_from_u64(63).unwrap(),
                    });
                    ctx.emit(Inst::AluRRImmShift {
                        alu_op: ALUOp::Lsr64,
                        rd: tmp_r0,
                        rn: tmp_r0.to_reg(),
                        immshift: ImmShift::maybe_from_u64(63).unwrap(),
                    });
                    ctx.emit(Inst::AluRRRShift {
                        alu_op: ALUOp::Add32,
                        rd: dst_r,
                        rn: dst_r.to_reg(),
                        rm: tmp_r0.to_reg(),
                        shiftop: ShiftOpAndAmt::new(
                            ShiftOp::LSL,
                            ShiftOpShiftImm::maybe_from_shift(1).unwrap(),
                        ),
                    });
                }
                _ => panic!("arm64 isel: VhighBits unhandled, ty = {:?}", ty),
            }
        }

        Opcode::Shuffle => {
            let mask = const_param_to_u128(ctx, insn).expect("Invalid immediate mask bytes");
            let rd = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
            let rn = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None);
            let rn2 = put_input_in_reg(ctx, inputs[1], NarrowValueMode::None);
            // Two-register table vector lookups (TBL2) require consecutive table
            // registers; we satisfy this constraint by hardcoding the use of v29
            // and v30.
            let temp = writable_vreg(29);
            let temp2 = writable_vreg(30);
            let input_ty = ctx.input_ty(insn, 0);
            assert_eq!(input_ty, ctx.input_ty(insn, 1));
            // Make sure that both inputs are in virtual registers, since it is
            // not guaranteed that we can get them safely to the temporaries if
            // either is in a real register.
            let rn = ctx.ensure_in_vreg(rn, input_ty);
            let rn2 = ctx.ensure_in_vreg(rn2, input_ty);

            lower_constant_f128(ctx, rd, mask);
            ctx.emit(Inst::gen_move(temp, rn, input_ty));
            ctx.emit(Inst::gen_move(temp2, rn2, input_ty));
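            // rd currently holds the 16-byte selection mask loaded above;
            // VecTbl2 reads it as the index vector (rm) and overwrites rd
            // with the result. Each index byte selects one of the 32 bytes
            // of temp:temp2; out-of-range indices yield 0, per the TBL
            // semantics.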
            ctx.emit(Inst::VecTbl2 {
                rd,
                rn: temp.to_reg(),
                rn2: temp2.to_reg(),
                rm: rd.to_reg(),
                is_extension: false,
            });
        }

        Opcode::Swizzle => {
            let rd = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
            let rm = put_input_in_reg(ctx, inputs[1], NarrowValueMode::None);
            let rn = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None);

            ctx.emit(Inst::VecTbl {
                rd,
                rn,
                rm,
                is_extension: false,
            });
        }

        Opcode::Isplit => {
            assert_eq!(
                ctx.input_ty(insn, 0),
                I128,
                "Isplit only implemented for i128's"
            );
            assert_eq!(ctx.output_ty(insn, 0), I64);
            assert_eq!(ctx.output_ty(insn, 1), I64);

            let src_regs = put_input_in_regs(ctx, inputs[0]);
            let dst_lo = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
            let dst_hi = get_output_reg(ctx, outputs[1]).only_reg().unwrap();

            ctx.emit(Inst::gen_move(dst_lo, src_regs.regs()[0], I64));
            ctx.emit(Inst::gen_move(dst_hi, src_regs.regs()[1], I64));
        }

        Opcode::Iconcat => {
            assert_eq!(
                ctx.output_ty(insn, 0),
                I128,
                "Iconcat only implemented for i128's"
            );
            assert_eq!(ctx.input_ty(insn, 0), I64);
            assert_eq!(ctx.input_ty(insn, 1), I64);

            let src_lo = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None);
            let src_hi = put_input_in_reg(ctx, inputs[1], NarrowValueMode::None);
            let dst = get_output_reg(ctx, outputs[0]);

            ctx.emit(Inst::gen_move(dst.regs()[0], src_lo, I64));
            ctx.emit(Inst::gen_move(dst.regs()[1], src_hi, I64));
        }

        Opcode::Imax | Opcode::Umax | Opcode::Umin | Opcode::Imin => {
            let alu_op = match op {
                Opcode::Umin => VecALUOp::Umin,
                Opcode::Imin => VecALUOp::Smin,
                Opcode::Umax => VecALUOp::Umax,
                Opcode::Imax => VecALUOp::Smax,
                _ => unreachable!(),
            };
            let rd = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
            let rn = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None);
            let rm = put_input_in_reg(ctx, inputs[1], NarrowValueMode::None);
            let ty = ty.unwrap();
            ctx.emit(Inst::VecRRR {
                alu_op,
                rd,
                rn,
                rm,
                size: VectorSize::from_ty(ty),
            });
        }

        Opcode::IaddPairwise => {
            let ty = ty.unwrap();
            let lane_type = ty.lane_type();
            let rd = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
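            // If both inputs widen the same source vector, e.g.
            // `iadd_pairwise(swiden_low(x), swiden_high(x))`, the whole
            // pattern is equivalent to a single saddlp (or uaddlp) on x,
            // which the closure below tries to match.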
            let mut match_long_pair =
                |ext_low_op, ext_high_op| -> Option<(VecRRPairLongOp, regalloc::Reg)> {
                    if let Some(lhs) = maybe_input_insn(ctx, inputs[0], ext_low_op) {
                        if let Some(rhs) = maybe_input_insn(ctx, inputs[1], ext_high_op) {
                            let lhs_inputs = insn_inputs(ctx, lhs);
                            let rhs_inputs = insn_inputs(ctx, rhs);
                            let low = put_input_in_reg(ctx, lhs_inputs[0], NarrowValueMode::None);
                            let high = put_input_in_reg(ctx, rhs_inputs[0], NarrowValueMode::None);
                            if low == high {
                                match (lane_type, ext_low_op) {
                                    (I16, Opcode::SwidenLow) => {
                                        return Some((VecRRPairLongOp::Saddlp8, low))
                                    }
                                    (I32, Opcode::SwidenLow) => {
                                        return Some((VecRRPairLongOp::Saddlp16, low))
                                    }
                                    (I16, Opcode::UwidenLow) => {
                                        return Some((VecRRPairLongOp::Uaddlp8, low))
                                    }
                                    (I32, Opcode::UwidenLow) => {
                                        return Some((VecRRPairLongOp::Uaddlp16, low))
                                    }
                                    _ => (),
                                };
                            }
                        }
                    }
                    None
                };

            if let Some((op, rn)) = match_long_pair(Opcode::SwidenLow, Opcode::SwidenHigh) {
                ctx.emit(Inst::VecRRPairLong { op, rd, rn });
            } else if let Some((op, rn)) = match_long_pair(Opcode::UwidenLow, Opcode::UwidenHigh) {
                ctx.emit(Inst::VecRRPairLong { op, rd, rn });
            } else {
                let rn = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None);
                let rm = put_input_in_reg(ctx, inputs[1], NarrowValueMode::None);
                ctx.emit(Inst::VecRRR {
                    alu_op: VecALUOp::Addp,
                    rd,
                    rn,
                    rm,
                    size: VectorSize::from_ty(ty),
                });
            }
        }

        Opcode::WideningPairwiseDotProductS => {
            let r_y = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
            let r_a = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None);
            let r_b = put_input_in_reg(ctx, inputs[1], NarrowValueMode::None);
            let ty = ty.unwrap();
            if ty == I32X4 {
                let tmp = ctx.alloc_tmp(I8X16).only_reg().unwrap();
                // The args have type I16X8.
                // "y = i32x4.dot_i16x8_s(a, b)"
                // => smull  tmp, a, b
                //    smull2 y,   a, b
                //    addp   y,   tmp, y
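                // smull multiplies the four low i16 lane pairs into i32s and
                // smull2 the four high pairs; the final addp sums adjacent
                // products, so y[i] = a[2i]*b[2i] + a[2i+1]*b[2i+1].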
                ctx.emit(Inst::VecRRRLong {
                    alu_op: VecRRRLongOp::Smull16,
                    rd: tmp,
                    rn: r_a,
                    rm: r_b,
                    high_half: false,
                });
                ctx.emit(Inst::VecRRRLong {
                    alu_op: VecRRRLongOp::Smull16,
                    rd: r_y,
                    rn: r_a,
                    rm: r_b,
                    high_half: true,
                });
                ctx.emit(Inst::VecRRR {
                    alu_op: VecALUOp::Addp,
                    rd: r_y,
                    rn: tmp.to_reg(),
                    rm: r_y.to_reg(),
                    size: VectorSize::Size32x4,
                });
            } else {
                return Err(CodegenError::Unsupported(format!(
                    "Opcode::WideningPairwiseDotProductS: unsupported laneage: {:?}",
                    ty
                )));
            }
        }

        Opcode::Fadd | Opcode::Fsub | Opcode::Fmul | Opcode::Fdiv | Opcode::Fmin | Opcode::Fmax => {
            let ty = ty.unwrap();
            let bits = ty_bits(ty);
            let rn = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None);
            let rm = put_input_in_reg(ctx, inputs[1], NarrowValueMode::None);
            let rd = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
            if !ty.is_vector() {
                let fpu_op = match (op, bits) {
                    (Opcode::Fadd, 32) => FPUOp2::Add32,
                    (Opcode::Fadd, 64) => FPUOp2::Add64,
                    (Opcode::Fsub, 32) => FPUOp2::Sub32,
                    (Opcode::Fsub, 64) => FPUOp2::Sub64,
                    (Opcode::Fmul, 32) => FPUOp2::Mul32,
                    (Opcode::Fmul, 64) => FPUOp2::Mul64,
                    (Opcode::Fdiv, 32) => FPUOp2::Div32,
                    (Opcode::Fdiv, 64) => FPUOp2::Div64,
                    (Opcode::Fmin, 32) => FPUOp2::Min32,
                    (Opcode::Fmin, 64) => FPUOp2::Min64,
                    (Opcode::Fmax, 32) => FPUOp2::Max32,
                    (Opcode::Fmax, 64) => FPUOp2::Max64,
                    _ => panic!("Unknown op/bits combination"),
                };
                ctx.emit(Inst::FpuRRR { fpu_op, rd, rn, rm });
            } else {
                let alu_op = match op {
                    Opcode::Fadd => VecALUOp::Fadd,
                    Opcode::Fsub => VecALUOp::Fsub,
                    Opcode::Fdiv => VecALUOp::Fdiv,
                    Opcode::Fmax => VecALUOp::Fmax,
                    Opcode::Fmin => VecALUOp::Fmin,
                    Opcode::Fmul => VecALUOp::Fmul,
                    _ => unreachable!(),
                };

                ctx.emit(Inst::VecRRR {
                    rd,
                    rn,
                    rm,
                    alu_op,
                    size: VectorSize::from_ty(ty),
                });
            }
        }

        Opcode::FminPseudo | Opcode::FmaxPseudo => {
            let ty = ctx.input_ty(insn, 0);
            if ty == F32X4 || ty == F64X2 {
                // pmin(a,b) => bitsel(b, a, cmpgt(a, b))
                // pmax(a,b) => bitsel(b, a, cmpgt(b, a))
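                // For pmin, fcmgt fills a lane with ones iff a > b; bsl then
                // picks b where the mask is set and a elsewhere. A NaN input
                // makes the comparison false, so the result defaults to a,
                // matching Wasm's pseudo-min "b < a ? b : a".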
                let r_dst = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
                let r_a = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None);
                let r_b = put_input_in_reg(ctx, inputs[1], NarrowValueMode::None);
                // Since we're going to write the output register `r_dst` anyway, we might as
                // well first use it to hold the comparison result.  This has the slightly unusual
                // effect that we modify the output register in the first instruction (`fcmgt`)
                // but read both the inputs again in the second instruction (`bsl`), which means
                // that the output register can't be either of the input registers.  Regalloc
                // should handle this correctly, nevertheless.
                ctx.emit(Inst::VecRRR {
                    alu_op: VecALUOp::Fcmgt,
                    rd: r_dst,
                    rn: if op == Opcode::FminPseudo { r_a } else { r_b },
                    rm: if op == Opcode::FminPseudo { r_b } else { r_a },
                    size: if ty == F32X4 {
                        VectorSize::Size32x4
                    } else {
                        VectorSize::Size64x2
                    },
                });
                ctx.emit(Inst::VecRRR {
                    alu_op: VecALUOp::Bsl,
                    rd: r_dst,
                    rn: r_b,
                    rm: r_a,
                    size: VectorSize::Size8x16,
                });
            } else {
                panic!("Opcode::FminPseudo | Opcode::FmaxPseudo: unhandled type");
            }
        }

        Opcode::Sqrt | Opcode::Fneg | Opcode::Fabs | Opcode::Fpromote | Opcode::Fdemote => {
            let ty = ty.unwrap();
            let bits = ty_bits(ty);
            let rn = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None);
            let rd = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
            if !ty.is_vector() {
                let fpu_op = match (op, bits) {
                    (Opcode::Sqrt, 32) => FPUOp1::Sqrt32,
                    (Opcode::Sqrt, 64) => FPUOp1::Sqrt64,
                    (Opcode::Fneg, 32) => FPUOp1::Neg32,
                    (Opcode::Fneg, 64) => FPUOp1::Neg64,
                    (Opcode::Fabs, 32) => FPUOp1::Abs32,
                    (Opcode::Fabs, 64) => FPUOp1::Abs64,
                    (Opcode::Fpromote, 32) => panic!("Cannot promote to 32 bits"),
                    (Opcode::Fpromote, 64) => FPUOp1::Cvt32To64,
                    (Opcode::Fdemote, 32) => FPUOp1::Cvt64To32,
                    (Opcode::Fdemote, 64) => panic!("Cannot demote to 64 bits"),
                    _ => panic!("Unknown op/bits combination"),
                };
                ctx.emit(Inst::FpuRR { fpu_op, rd, rn });
            } else {
                let op = match op {
                    Opcode::Fabs => VecMisc2::Fabs,
                    Opcode::Fneg => VecMisc2::Fneg,
                    Opcode::Sqrt => VecMisc2::Fsqrt,
                    _ => unimplemented!(),
                };

                ctx.emit(Inst::VecMisc {
                    op,
                    rd,
                    rn,
                    size: VectorSize::from_ty(ty),
                });
            }
        }

        Opcode::Ceil | Opcode::Floor | Opcode::Trunc | Opcode::Nearest => {
            let ty = ctx.output_ty(insn, 0);
            if !ty.is_vector() {
                let bits = ty_bits(ty);
                let op = match (op, bits) {
                    (Opcode::Ceil, 32) => FpuRoundMode::Plus32,
                    (Opcode::Ceil, 64) => FpuRoundMode::Plus64,
                    (Opcode::Floor, 32) => FpuRoundMode::Minus32,
                    (Opcode::Floor, 64) => FpuRoundMode::Minus64,
                    (Opcode::Trunc, 32) => FpuRoundMode::Zero32,
                    (Opcode::Trunc, 64) => FpuRoundMode::Zero64,
                    (Opcode::Nearest, 32) => FpuRoundMode::Nearest32,
                    (Opcode::Nearest, 64) => FpuRoundMode::Nearest64,
                    _ => panic!("Unknown op/bits combination (scalar)"),
                };
                let rn = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None);
                let rd = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
                ctx.emit(Inst::FpuRound { op, rd, rn });
            } else {
                let (op, size) = match (op, ty) {
                    (Opcode::Ceil, F32X4) => (VecMisc2::Frintp, VectorSize::Size32x4),
                    (Opcode::Ceil, F64X2) => (VecMisc2::Frintp, VectorSize::Size64x2),
                    (Opcode::Floor, F32X4) => (VecMisc2::Frintm, VectorSize::Size32x4),
                    (Opcode::Floor, F64X2) => (VecMisc2::Frintm, VectorSize::Size64x2),
                    (Opcode::Trunc, F32X4) => (VecMisc2::Frintz, VectorSize::Size32x4),
                    (Opcode::Trunc, F64X2) => (VecMisc2::Frintz, VectorSize::Size64x2),
                    (Opcode::Nearest, F32X4) => (VecMisc2::Frintn, VectorSize::Size32x4),
                    (Opcode::Nearest, F64X2) => (VecMisc2::Frintn, VectorSize::Size64x2),
                    _ => panic!("Unknown op/ty combination (vector): {:?}", ty),
                };
                let rn = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None);
                let rd = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
                ctx.emit(Inst::VecMisc { op, rd, rn, size });
            }
        }

        Opcode::Fma => {
            let bits = ty_bits(ctx.output_ty(insn, 0));
            let fpu_op = match bits {
                32 => FPUOp3::MAdd32,
                64 => FPUOp3::MAdd64,
                _ => panic!("Unknown op size"),
            };
            let rn = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None);
            let rm = put_input_in_reg(ctx, inputs[1], NarrowValueMode::None);
            let ra = put_input_in_reg(ctx, inputs[2], NarrowValueMode::None);
            let rd = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
            ctx.emit(Inst::FpuRRRR {
                fpu_op,
                rn,
                rm,
                ra,
                rd,
            });
        }

        Opcode::Fcopysign => {
            // Copy the sign bit from inputs[1] to inputs[0]. This is a scalar
            // Fcopysign; it uses scalar NEON operations for 64-bit values and
            // vector operations (2S) for 32-bit values. In the latter case, the
            // sequence still zeroes all bits except the lowest 32. We use the
            // following sequence:
            //
            //  mov vd, vn
            //  ushr vtmp, vm, #63 / #31
            //  sli vd, vtmp, #63 / #31
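            // For f64, ushr #63 moves rm's sign bit down to bit 0 of tmp, and
            // sli #63 shifts it back up and inserts it, replacing only bit 63
            // of rd: the result keeps rn's magnitude with rm's sign.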

            let ty = ctx.output_ty(insn, 0);
            let bits = ty_bits(ty) as u8;
            assert!(bits == 32 || bits == 64);
            let rn = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None);
            let rm = put_input_in_reg(ctx, inputs[1], NarrowValueMode::None);
            let rd = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
            let tmp = ctx.alloc_tmp(F64).only_reg().unwrap();

            // Copy LHS to rd.
            ctx.emit(Inst::gen_move(rd, rn, ty));

            // Copy the sign bit to the lowest bit in tmp.
            let imm = FPURightShiftImm::maybe_from_u8(bits - 1, bits).unwrap();
            ctx.emit(Inst::FpuRRI {
                fpu_op: choose_32_64(ty, FPUOpRI::UShr32(imm), FPUOpRI::UShr64(imm)),
                rd: tmp,
                rn: rm,
            });

            // Insert the bit from tmp into the sign bit of rd.
            let imm = FPULeftShiftImm::maybe_from_u8(bits - 1, bits).unwrap();
            ctx.emit(Inst::FpuRRI {
                fpu_op: choose_32_64(ty, FPUOpRI::Sli32(imm), FPUOpRI::Sli64(imm)),
                rd,
                rn: tmp.to_reg(),
            });
        }

        Opcode::FcvtToUint | Opcode::FcvtToSint => {
            let in_bits = ty_bits(ctx.input_ty(insn, 0));
            let out_bits = ty_bits(ctx.output_ty(insn, 0));
            let signed = op == Opcode::FcvtToSint;
            let op = match (signed, in_bits, out_bits) {
                (false, 32, 8) | (false, 32, 16) | (false, 32, 32) => FpuToIntOp::F32ToU32,
                (true, 32, 8) | (true, 32, 16) | (true, 32, 32) => FpuToIntOp::F32ToI32,
                (false, 32, 64) => FpuToIntOp::F32ToU64,
                (true, 32, 64) => FpuToIntOp::F32ToI64,
                (false, 64, 8) | (false, 64, 16) | (false, 64, 32) => FpuToIntOp::F64ToU32,
                (true, 64, 8) | (true, 64, 16) | (true, 64, 32) => FpuToIntOp::F64ToI32,
                (false, 64, 64) => FpuToIntOp::F64ToU64,
                (true, 64, 64) => FpuToIntOp::F64ToI64,
                _ => panic!("Unknown input/output-bits combination"),
            };

            let rn = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None);
            let rd = get_output_reg(ctx, outputs[0]).only_reg().unwrap();

            // First, check the input: it's important to perform the NaN check
            // before the in-bounds check, per Wasm semantics.

            // Check that the input is not a NaN.
            if in_bits == 32 {
                ctx.emit(Inst::FpuCmp32 { rn, rm: rn });
            } else {
                ctx.emit(Inst::FpuCmp64 { rn, rm: rn });
            }
            let trap_code = TrapCode::BadConversionToInteger;
            ctx.emit(Inst::TrapIf {
                trap_code,
                kind: CondBrKind::Cond(lower_fp_condcode(FloatCC::Unordered)),
            });

            let tmp = ctx.alloc_tmp(I8X16).only_reg().unwrap();

            // Check that the input is in range, with "truncate towards zero"
            // semantics. This means we allow values that are slightly out of range:
            // - for signed conversions, we allow values strictly greater than
            //   INT_MIN-1 (when this can be represented), and strictly less than
            //   INT_MAX+1 (when this can be represented);
            // - for unsigned conversions, we allow values strictly greater than -1,
            //   and strictly less than UINT_MAX+1 (when this can be represented).
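            // For example, for a u32 result from an f32 input, the allowed open
            // interval is (-1.0, 4294967296.0). Note that in the f32 table below,
            // `u32::max_value() as f32 + 1.` is exactly 2^32, since u32::MAX
            // already rounds up to 2^32 in f32.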

            if in_bits == 32 {
                // From float32.
                let (low_bound, low_cond, high_bound) = match (signed, out_bits) {
                    (true, 8) => (
                        i8::min_value() as f32 - 1.,
                        FloatCC::GreaterThan,
                        i8::max_value() as f32 + 1.,
                    ),
                    (true, 16) => (
                        i16::min_value() as f32 - 1.,
                        FloatCC::GreaterThan,
                        i16::max_value() as f32 + 1.,
                    ),
                    (true, 32) => (
                        i32::min_value() as f32, // I32_MIN - 1 isn't precisely representable as an f32.
                        FloatCC::GreaterThanOrEqual,
                        i32::max_value() as f32 + 1.,
                    ),
                    (true, 64) => (
                        i64::min_value() as f32, // I64_MIN - 1 isn't precisely representable as an f32.
                        FloatCC::GreaterThanOrEqual,
                        i64::max_value() as f32 + 1.,
                    ),
                    (false, 8) => (-1., FloatCC::GreaterThan, u8::max_value() as f32 + 1.),
                    (false, 16) => (-1., FloatCC::GreaterThan, u16::max_value() as f32 + 1.),
                    (false, 32) => (-1., FloatCC::GreaterThan, u32::max_value() as f32 + 1.),
                    (false, 64) => (-1., FloatCC::GreaterThan, u64::max_value() as f32 + 1.),
                    _ => panic!("Unknown input/output-bits combination"),
                };

                // >= low_bound
                lower_constant_f32(ctx, tmp, low_bound);
                ctx.emit(Inst::FpuCmp32 {
                    rn,
                    rm: tmp.to_reg(),
                });
                let trap_code = TrapCode::IntegerOverflow;
                ctx.emit(Inst::TrapIf {
                    trap_code,
                    kind: CondBrKind::Cond(lower_fp_condcode(low_cond).invert()),
                });

                // <= high_bound
                lower_constant_f32(ctx, tmp, high_bound);
                ctx.emit(Inst::FpuCmp32 {
                    rn,
                    rm: tmp.to_reg(),
                });
                let trap_code = TrapCode::IntegerOverflow;
                ctx.emit(Inst::TrapIf {
                    trap_code,
                    kind: CondBrKind::Cond(lower_fp_condcode(FloatCC::LessThan).invert()),
                });
            } else {
                // From float64.
                let (low_bound, low_cond, high_bound) = match (signed, out_bits) {
                    (true, 8) => (
                        i8::min_value() as f64 - 1.,
                        FloatCC::GreaterThan,
                        i8::max_value() as f64 + 1.,
                    ),
                    (true, 16) => (
                        i16::min_value() as f64 - 1.,
                        FloatCC::GreaterThan,
                        i16::max_value() as f64 + 1.,
                    ),
                    (true, 32) => (
                        i32::min_value() as f64 - 1.,
                        FloatCC::GreaterThan,
                        i32::max_value() as f64 + 1.,
                    ),
                    (true, 64) => (
                        i64::min_value() as f64, // I64_MIN - 1 isn't precisely representable as an f64.
                        FloatCC::GreaterThanOrEqual,
                        i64::max_value() as f64 + 1.,
                    ),
                    (false, 8) => (-1., FloatCC::GreaterThan, u8::max_value() as f64 + 1.),
                    (false, 16) => (-1., FloatCC::GreaterThan, u16::max_value() as f64 + 1.),
                    (false, 32) => (-1., FloatCC::GreaterThan, u32::max_value() as f64 + 1.),
                    (false, 64) => (-1., FloatCC::GreaterThan, u64::max_value() as f64 + 1.),
                    _ => panic!("Unknown input/output-bits combination"),
                };

                // >= low_bound
                lower_constant_f64(ctx, tmp, low_bound);
                ctx.emit(Inst::FpuCmp64 {
                    rn,
                    rm: tmp.to_reg(),
                });
                let trap_code = TrapCode::IntegerOverflow;
                ctx.emit(Inst::TrapIf {
                    trap_code,
                    kind: CondBrKind::Cond(lower_fp_condcode(low_cond).invert()),
                });

                // <= high_bound
                lower_constant_f64(ctx, tmp, high_bound);
                ctx.emit(Inst::FpuCmp64 {
                    rn,
                    rm: tmp.to_reg(),
                });
                let trap_code = TrapCode::IntegerOverflow;
                ctx.emit(Inst::TrapIf {
                    trap_code,
                    kind: CondBrKind::Cond(lower_fp_condcode(FloatCC::LessThan).invert()),
                });
            };

            // Do the conversion.
            ctx.emit(Inst::FpuToInt { op, rd, rn });
        }

        Opcode::FcvtFromUint | Opcode::FcvtFromSint => {
            let ty = ty.unwrap();
            let signed = op == Opcode::FcvtFromSint;
            let rd = get_output_reg(ctx, outputs[0]).only_reg().unwrap();

            if ty.is_vector() {
                let op = if signed {
                    VecMisc2::Scvtf
                } else {
                    VecMisc2::Ucvtf
                };
                let rn = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None);

                ctx.emit(Inst::VecMisc {
                    op,
                    rd,
                    rn,
                    size: VectorSize::from_ty(ty),
                });
            } else {
                let in_bits = ty_bits(ctx.input_ty(insn, 0));
                let out_bits = ty_bits(ty);
                let op = match (signed, in_bits, out_bits) {
                    (false, 8, 32) | (false, 16, 32) | (false, 32, 32) => IntToFpuOp::U32ToF32,
                    (true, 8, 32) | (true, 16, 32) | (true, 32, 32) => IntToFpuOp::I32ToF32,
                    (false, 8, 64) | (false, 16, 64) | (false, 32, 64) => IntToFpuOp::U32ToF64,
                    (true, 8, 64) | (true, 16, 64) | (true, 32, 64) => IntToFpuOp::I32ToF64,
                    (false, 64, 32) => IntToFpuOp::U64ToF32,
                    (true, 64, 32) => IntToFpuOp::I64ToF32,
                    (false, 64, 64) => IntToFpuOp::U64ToF64,
                    (true, 64, 64) => IntToFpuOp::I64ToF64,
                    _ => panic!("Unknown input/output-bits combination"),
                };
                let narrow_mode = match (signed, in_bits) {
                    (false, 8) | (false, 16) | (false, 32) => NarrowValueMode::ZeroExtend32,
                    (true, 8) | (true, 16) | (true, 32) => NarrowValueMode::SignExtend32,
                    (false, 64) => NarrowValueMode::ZeroExtend64,
                    (true, 64) => NarrowValueMode::SignExtend64,
                    _ => panic!("Unknown input size"),
                };
                let rn = put_input_in_reg(ctx, inputs[0], narrow_mode);
                ctx.emit(Inst::IntToFpu { op, rd, rn });
            }
        }

        Opcode::FcvtToUintSat | Opcode::FcvtToSintSat => {
            let ty = ty.unwrap();
            let out_signed = op == Opcode::FcvtToSintSat;
            let rn = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None);
            let rd = get_output_reg(ctx, outputs[0]).only_reg().unwrap();

            if ty.is_vector() {
                let op = if out_signed {
                    VecMisc2::Fcvtzs
                } else {
                    VecMisc2::Fcvtzu
                };

                ctx.emit(Inst::VecMisc {
                    op,
                    rd,
                    rn,
                    size: VectorSize::from_ty(ty),
                });
            } else {
                let in_ty = ctx.input_ty(insn, 0);
                let in_bits = ty_bits(in_ty);
                let out_bits = ty_bits(ty);
                // FIMM Vtmp1, u32::MAX or u64::MAX or i32::MAX or i64::MAX
                // FMIN Vtmp2, Vin, Vtmp1
                // FIMM Vtmp1, 0 or 0 or i32::MIN or i64::MIN
                // FMAX Vtmp2, Vtmp2, Vtmp1
                // (if signed) FIMM Vtmp1, 0
                // FCMP Vin, Vin
                // FCSEL Vtmp2, Vtmp1, Vtmp2, NE  // on NaN, select 0
                // convert Rout, Vtmp2
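                // For example, for fcvt_to_sint_sat.i32 from f32: a NaN input
                // compares unordered with itself, so FCSEL's NE condition
                // replaces it with the zero in Vtmp1 before the convert (for
                // unsigned conversions, Vtmp1 already holds the 0.0 loaded
                // for the FMAX); out-of-range values are clamped by the
                // FMIN/FMAX pair.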

                assert!(in_bits == 32 || in_bits == 64);
                assert!(out_bits == 32 || out_bits == 64);

                let min: f64 = match (out_bits, out_signed) {
                    (32, true) => std::i32::MIN as f64,
                    (32, false) => 0.0,
                    (64, true) => std::i64::MIN as f64,
                    (64, false) => 0.0,
                    _ => unreachable!(),
                };

                let max = match (out_bits, out_signed) {
                    (32, true) => std::i32::MAX as f64,
                    (32, false) => std::u32::MAX as f64,
                    (64, true) => std::i64::MAX as f64,
                    (64, false) => std::u64::MAX as f64,
                    _ => unreachable!(),
                };

                let rtmp1 = ctx.alloc_tmp(in_ty).only_reg().unwrap();
                let rtmp2 = ctx.alloc_tmp(in_ty).only_reg().unwrap();

                if in_bits == 32 {
                    lower_constant_f32(ctx, rtmp1, max as f32);
                } else {
                    lower_constant_f64(ctx, rtmp1, max);
                }
                ctx.emit(Inst::FpuRRR {
                    fpu_op: choose_32_64(in_ty, FPUOp2::Min32, FPUOp2::Min64),
                    rd: rtmp2,
                    rn,
                    rm: rtmp1.to_reg(),
                });
                if in_bits == 32 {
                    lower_constant_f32(ctx, rtmp1, min as f32);
                } else {
                    lower_constant_f64(ctx, rtmp1, min);
                }
                ctx.emit(Inst::FpuRRR {
                    fpu_op: choose_32_64(in_ty, FPUOp2::Max32, FPUOp2::Max64),
                    rd: rtmp2,
                    rn: rtmp2.to_reg(),
                    rm: rtmp1.to_reg(),
                });
                if out_signed {
                    if in_bits == 32 {
                        lower_constant_f32(ctx, rtmp1, 0.0);
                    } else {
                        lower_constant_f64(ctx, rtmp1, 0.0);
                    }
                }
                if in_bits == 32 {
                    ctx.emit(Inst::FpuCmp32 { rn, rm: rn });
                    ctx.emit(Inst::FpuCSel32 {
                        rd: rtmp2,
                        rn: rtmp1.to_reg(),
                        rm: rtmp2.to_reg(),
                        cond: Cond::Ne,
                    });
                } else {
                    ctx.emit(Inst::FpuCmp64 { rn, rm: rn });
                    ctx.emit(Inst::FpuCSel64 {
                        rd: rtmp2,
                        rn: rtmp1.to_reg(),
                        rm: rtmp2.to_reg(),
                        cond: Cond::Ne,
                    });
                }

                let cvt = match (in_bits, out_bits, out_signed) {
                    (32, 32, false) => FpuToIntOp::F32ToU32,
                    (32, 32, true) => FpuToIntOp::F32ToI32,
                    (32, 64, false) => FpuToIntOp::F32ToU64,
                    (32, 64, true) => FpuToIntOp::F32ToI64,
                    (64, 32, false) => FpuToIntOp::F64ToU32,
                    (64, 32, true) => FpuToIntOp::F64ToI32,
                    (64, 64, false) => FpuToIntOp::F64ToU64,
                    (64, 64, true) => FpuToIntOp::F64ToI64,
                    _ => unreachable!(),
                };
                ctx.emit(Inst::FpuToInt {
                    op: cvt,
                    rd,
                    rn: rtmp2.to_reg(),
                });
            }
        }

        Opcode::IaddIfcout => {
            // This is a two-output instruction that is needed for the
            // legalizer's explicit heap-check sequence, among possible other
            // uses. Its second output is a flags output only ever meant to
            // check for overflow using the
            // `backend.unsigned_add_overflow_condition()` condition.
            //
            // Note that the CLIF validation will ensure that no flag-setting
            // operation comes between this IaddIfcout and its use (e.g., a
            // Trapif). Thus, we can rely on implicit communication through the
            // processor flags rather than explicitly generating flags into a
            // register. We simply use the variant of the add instruction that
            // sets flags (`adds`) here.

            // Note that the second output (the flags) need not be generated,
            // because flags are never materialized into a register; the only
            // instructions that can use a value of type `iflags` or `fflags`
            // will look directly for the flags-producing instruction (which can
            // always be found, by construction) and merge it.
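            // As a rough sketch (the surrounding lowering of the consumer is
            // not shown here), a CLIF pair like
            //   v1, v2 = iadd_ifcout v0, vN
            //   trapif uge v2, ...
            // becomes a single `adds` followed by a conditional trap on the
            // carry condition (hs).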

            // Now handle the iadd as above, except use an AddS opcode that sets
            // flags.
            let rd = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
            let rn = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None);
            let rm = put_input_in_rse_imm12(ctx, inputs[1], NarrowValueMode::None);
            let ty = ty.unwrap();
            let alu_op = choose_32_64(ty, ALUOp::AddS32, ALUOp::AddS64);
            ctx.emit(alu_inst_imm12(alu_op, rd, rn, rm));
        }

        Opcode::IaddImm
        | Opcode::ImulImm
        | Opcode::UdivImm
        | Opcode::SdivImm
        | Opcode::UremImm
        | Opcode::SremImm
        | Opcode::IrsubImm
        | Opcode::IaddCin
        | Opcode::IaddIfcin
        | Opcode::IaddCout
        | Opcode::IaddCarry
        | Opcode::IaddIfcarry
        | Opcode::IsubBin
        | Opcode::IsubIfbin
        | Opcode::IsubBout
        | Opcode::IsubIfbout
        | Opcode::IsubBorrow
        | Opcode::IsubIfborrow
        | Opcode::BandImm
        | Opcode::BorImm
        | Opcode::BxorImm
        | Opcode::RotlImm
        | Opcode::RotrImm
        | Opcode::IshlImm
        | Opcode::UshrImm
        | Opcode::SshrImm
        | Opcode::IcmpImm
        | Opcode::IfcmpImm => {
            panic!("ALU+imm and ALU+carry ops should not appear here!");
        }

        #[cfg(feature = "x86")]
        Opcode::X86Udivmodx
        | Opcode::X86Sdivmodx
        | Opcode::X86Umulx
        | Opcode::X86Smulx
        | Opcode::X86Cvtt2si
        | Opcode::X86Fmin
        | Opcode::X86Fmax
        | Opcode::X86Push
        | Opcode::X86Pop
        | Opcode::X86Bsr
        | Opcode::X86Bsf
        | Opcode::X86Pblendw
        | Opcode::X86Pshufd
        | Opcode::X86Pshufb
        | Opcode::X86Pextr
        | Opcode::X86Pinsr
        | Opcode::X86Insertps
        | Opcode::X86Movsd
        | Opcode::X86Movlhps
        | Opcode::X86Palignr
        | Opcode::X86Psll
        | Opcode::X86Psrl
        | Opcode::X86Psra
        | Opcode::X86Ptest
        | Opcode::X86Pmaxs
        | Opcode::X86Pmaxu
        | Opcode::X86Pmins
        | Opcode::X86Pminu
        | Opcode::X86Pmullq
        | Opcode::X86Pmuludq
        | Opcode::X86Punpckh
        | Opcode::X86Punpckl
        | Opcode::X86Vcvtudq2ps
        | Opcode::X86ElfTlsGetAddr
        | Opcode::X86MachoTlsGetAddr => {
            panic!("x86-specific opcode in supposedly arch-neutral IR!");
        }

        Opcode::DummySargT => unreachable!(),

        Opcode::Iabs => {
            let rd = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
            let rn = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None);
            let ty = ty.unwrap();
            ctx.emit(Inst::VecMisc {
                op: VecMisc2::Abs,
                rd,
                rn,
                size: VectorSize::from_ty(ty),
            });
        }
        Opcode::AvgRound => {
            let rd = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
            let rn = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None);
            let rm = put_input_in_reg(ctx, inputs[1], NarrowValueMode::None);
            let ty = ty.unwrap();
            ctx.emit(Inst::VecRRR {
                alu_op: VecALUOp::Urhadd,
                rd,
                rn,
                rm,
                size: VectorSize::from_ty(ty),
            });
        }

        Opcode::Snarrow | Opcode::Unarrow | Opcode::Uunarrow => {
            let nonzero_high_half = maybe_input_insn(ctx, inputs[1], Opcode::Vconst)
                .map_or(true, |insn| {
                    const_param_to_u128(ctx, insn).expect("Invalid immediate bytes") != 0
                });
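            // If the high-half input is an all-zero vconst, the second
            // (high-half) narrowing instruction below can be skipped: the
            // non-"2" narrowing forms write a 64-bit vector result, which
            // already zeroes the upper half of the destination register.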
            let op = match (op, ty.unwrap().lane_type()) {
                (Opcode::Snarrow, I8) => VecRRNarrowOp::Sqxtn16,
                (Opcode::Snarrow, I16) => VecRRNarrowOp::Sqxtn32,
                (Opcode::Snarrow, I32) => VecRRNarrowOp::Sqxtn64,
                (Opcode::Unarrow, I8) => VecRRNarrowOp::Sqxtun16,
                (Opcode::Unarrow, I16) => VecRRNarrowOp::Sqxtun32,
                (Opcode::Unarrow, I32) => VecRRNarrowOp::Sqxtun64,
                (Opcode::Uunarrow, I8) => VecRRNarrowOp::Uqxtn16,
                (Opcode::Uunarrow, I16) => VecRRNarrowOp::Uqxtn32,
                (Opcode::Uunarrow, I32) => VecRRNarrowOp::Uqxtn64,
                (_, lane_type) => {
                    return Err(CodegenError::Unsupported(format!(
                        "Unsupported SIMD vector lane type: {:?}",
                        lane_type
                    )))
                }
            };
            let rd = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
            let rn = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None);

            ctx.emit(Inst::VecRRNarrow {
                op,
                rd,
                rn,
                high_half: false,
            });

            if nonzero_high_half {
                let rn = put_input_in_reg(ctx, inputs[1], NarrowValueMode::None);

                ctx.emit(Inst::VecRRNarrow {
                    op,
                    rd,
                    rn,
                    high_half: true,
                });
            }
        }

        Opcode::SwidenLow | Opcode::SwidenHigh | Opcode::UwidenLow | Opcode::UwidenHigh => {
            let lane_type = ty.unwrap().lane_type();
            let rd = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
            let rn = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None);
            let (t, high_half) = match (lane_type, op) {
                (I16, Opcode::SwidenLow) => (VecExtendOp::Sxtl8, false),
                (I16, Opcode::SwidenHigh) => (VecExtendOp::Sxtl8, true),
                (I16, Opcode::UwidenLow) => (VecExtendOp::Uxtl8, false),
                (I16, Opcode::UwidenHigh) => (VecExtendOp::Uxtl8, true),
                (I32, Opcode::SwidenLow) => (VecExtendOp::Sxtl16, false),
                (I32, Opcode::SwidenHigh) => (VecExtendOp::Sxtl16, true),
                (I32, Opcode::UwidenLow) => (VecExtendOp::Uxtl16, false),
                (I32, Opcode::UwidenHigh) => (VecExtendOp::Uxtl16, true),
                (I64, Opcode::SwidenLow) => (VecExtendOp::Sxtl32, false),
                (I64, Opcode::SwidenHigh) => (VecExtendOp::Sxtl32, true),
                (I64, Opcode::UwidenLow) => (VecExtendOp::Uxtl32, false),
                (I64, Opcode::UwidenHigh) => (VecExtendOp::Uxtl32, true),
                _ => {
                    return Err(CodegenError::Unsupported(format!(
                        "Unsupported SIMD vector lane type: {:?}",
                        lane_type
                    )));
                }
            };

            ctx.emit(Inst::VecExtend {
                t,
                rd,
                rn,
                high_half,
            });
        }

        Opcode::TlsValue => match flags.tls_model() {
            TlsModel::ElfGd => {
                let dst = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
                let (name, _, _) = ctx.symbol_value(insn).unwrap();
                let symbol = name.clone();
                ctx.emit(Inst::ElfTlsGetAddr { symbol });
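                // Per the ELF general-dynamic TLS convention, the
                // `__tls_get_addr` call behind this pseudo-instruction
                // returns the resolved address in x0, so copy it to dst.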

                let x0 = xreg(0);
                ctx.emit(Inst::gen_move(dst, x0, I64));
            }
            _ => {
                todo!(
                    "Unimplemented TLS model in AArch64 backend: {:?}",
                    flags.tls_model()
                );
            }
        },

        Opcode::SqmulRoundSat => {
            let ty = ty.unwrap();

            if !ty.is_vector() || (ty.lane_type() != I16 && ty.lane_type() != I32) {
                return Err(CodegenError::Unsupported(format!(
                    "Unsupported type: {:?}",
                    ty
                )));
            }

            let rd = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
            let rn = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None);
            let rm = put_input_in_reg(ctx, inputs[1], NarrowValueMode::None);

            ctx.emit(Inst::VecRRR {
                alu_op: VecALUOp::Sqrdmulh,
                rd,
                rn,
                rm,
                size: VectorSize::from_ty(ty),
            });
        }

        Opcode::FcvtLowFromSint => {
            let ty = ty.unwrap();

            if ty != F64X2 {
                return Err(CodegenError::Unsupported(format!(
                    "FcvtLowFromSint: Unsupported type: {:?}",
                    ty
                )));
            }

            let rd = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
            let rn = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None);
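            // Widen the two low i32 lanes to i64 with sxtl, then convert
            // them in place to f64 with scvtf.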

            ctx.emit(Inst::VecExtend {
                t: VecExtendOp::Sxtl32,
                rd,
                rn,
                high_half: false,
            });
            ctx.emit(Inst::VecMisc {
                op: VecMisc2::Scvtf,
                rd,
                rn: rd.to_reg(),
                size: VectorSize::Size64x2,
            });
        }

        Opcode::FvpromoteLow => {
            debug_assert_eq!(ty.unwrap(), F64X2);

            let rd = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
            let rn = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None);

            ctx.emit(Inst::VecRRLong {
                op: VecRRLongOp::Fcvtl32,
                rd,
                rn,
                high_half: false,
            });
        }

        Opcode::Fvdemote => {
            debug_assert_eq!(ty.unwrap(), F32X4);

            let rd = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
            let rn = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None);

            ctx.emit(Inst::VecRRNarrow {
                op: VecRRNarrowOp::Fcvtn64,
                rd,
                rn,
                high_half: false,
            });
        }

        Opcode::ConstAddr | Opcode::Vconcat | Opcode::Vsplit => {
            unimplemented!("lowering {}", op)
        }
    }

    Ok(())
}

pub(crate) fn lower_branch<C: LowerCtx<I = Inst>>(
    ctx: &mut C,
    branches: &[IRInst],
    targets: &[MachLabel],
) -> CodegenResult<()> {
    // A block should end with at most two branches. The first may be a
    // conditional branch; a conditional branch can be followed only by an
    // unconditional branch or fallthrough. Otherwise, if only one branch,
    // it may be an unconditional branch, a fallthrough, a return, or a
    // trap. These conditions are verified by `is_ebb_basic()` during the
    // verifier pass.
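    // For example, a block ending in `brz v0, block1; jump block2` lowers to
    // a single two-target CondBr (roughly a cbz/b pair once branches are
    // finalized against the final block order).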
3595     assert!(branches.len() <= 2);
3596 
3597     if branches.len() == 2 {
3598         // Must be a conditional branch followed by an unconditional branch.
3599         let op0 = ctx.data(branches[0]).opcode();
3600         let op1 = ctx.data(branches[1]).opcode();
3601 
3602         assert!(op1 == Opcode::Jump || op1 == Opcode::Fallthrough);
3603         let taken = BranchTarget::Label(targets[0]);
3604         // not_taken target is the target of the second branch, even if it is a Fallthrough
3605         // instruction: because we reorder blocks while we lower, the fallthrough in the new
3606         // order is not (necessarily) the same as the fallthrough in CLIF. So we use the
3607         // explicitly-provided target.
3608         let not_taken = BranchTarget::Label(targets[1]);
3609 
3610         match op0 {
3611             Opcode::Brz | Opcode::Brnz => {
3612                 let ty = ctx.input_ty(branches[0], 0);
3613                 let flag_input = InsnInput {
3614                     insn: branches[0],
3615                     input: 0,
3616                 };
3617                 if let Some(icmp_insn) =
3618                     maybe_input_insn_via_conv(ctx, flag_input, Opcode::Icmp, Opcode::Bint)
3619                 {
3620                     let condcode = ctx.data(icmp_insn).cond_code().unwrap();
3621                     let cond =
3622                         lower_icmp(ctx, icmp_insn, condcode, IcmpOutput::CondCode)?.unwrap_cond();
3623                     let negated = op0 == Opcode::Brz;
3624                     let cond = if negated { cond.invert() } else { cond };
3625 
3626                     ctx.emit(Inst::CondBr {
3627                         taken,
3628                         not_taken,
3629                         kind: CondBrKind::Cond(cond),
3630                     });
3631                 } else if let Some(fcmp_insn) =
3632                     maybe_input_insn_via_conv(ctx, flag_input, Opcode::Fcmp, Opcode::Bint)
3633                 {
3634                     let condcode = ctx.data(fcmp_insn).fp_cond_code().unwrap();
3635                     let cond = lower_fp_condcode(condcode);
3636                     let negated = op0 == Opcode::Brz;
3637                     let cond = if negated { cond.invert() } else { cond };
3638 
3639                     lower_fcmp_or_ffcmp_to_flags(ctx, fcmp_insn);
3640                     ctx.emit(Inst::CondBr {
3641                         taken,
3642                         not_taken,
3643                         kind: CondBrKind::Cond(cond),
3644                     });
3645                 } else {
3646                     let rt = if ty == I128 {
3647                         let tmp = ctx.alloc_tmp(I64).only_reg().unwrap();
3648                         let input = put_input_in_regs(ctx, flag_input);
3649                         ctx.emit(Inst::AluRRR {
3650                             alu_op: ALUOp::Orr64,
3651                             rd: tmp,
3652                             rn: input.regs()[0],
3653                             rm: input.regs()[1],
3654                         });
3655                         tmp.to_reg()
3656                     } else {
                        put_input_in_reg(ctx, flag_input, NarrowValueMode::ZeroExtend64)
                    };
                    let kind = match op0 {
                        Opcode::Brz => CondBrKind::Zero(rt),
                        Opcode::Brnz => CondBrKind::NotZero(rt),
                        _ => unreachable!(),
                    };
                    ctx.emit(Inst::CondBr {
                        taken,
                        not_taken,
                        kind,
                    });
                }
            }
            Opcode::BrIcmp => {
                let condcode = ctx.data(branches[0]).cond_code().unwrap();
                let cond =
                    lower_icmp(ctx, branches[0], condcode, IcmpOutput::CondCode)?.unwrap_cond();

                ctx.emit(Inst::CondBr {
                    taken,
                    not_taken,
                    kind: CondBrKind::Cond(cond),
                });
            }

            Opcode::Brif => {
                let condcode = ctx.data(branches[0]).cond_code().unwrap();

                let flag_input = InsnInput {
                    insn: branches[0],
                    input: 0,
                };
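                // If the flags input is produced by an `ifcmp`, lower the
                // comparison here so that it sets NZCV directly, and branch on
                // the resulting condition.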
                if let Some(ifcmp_insn) = maybe_input_insn(ctx, flag_input, Opcode::Ifcmp) {
                    let cond =
                        lower_icmp(ctx, ifcmp_insn, condcode, IcmpOutput::CondCode)?.unwrap_cond();
                    ctx.emit(Inst::CondBr {
                        taken,
                        not_taken,
                        kind: CondBrKind::Cond(cond),
                    });
                } else {
                    // If the ifcmp result is actually placed in a
                    // register, we need to move it back into the flags.
                    let rn = put_input_in_reg(ctx, flag_input, NarrowValueMode::None);
                    ctx.emit(Inst::MovToNZCV { rn });
                    ctx.emit(Inst::CondBr {
                        taken,
                        not_taken,
                        kind: CondBrKind::Cond(lower_condcode(condcode)),
                    });
                }
            }

            Opcode::Brff => {
                let condcode = ctx.data(branches[0]).fp_cond_code().unwrap();
                let cond = lower_fp_condcode(condcode);
                let kind = CondBrKind::Cond(cond);
                let flag_input = InsnInput {
                    insn: branches[0],
                    input: 0,
                };
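                // If the flags input is produced by an `ffcmp`, re-lower the FP
                // comparison here so that it sets NZCV immediately before the
                // branch.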
                if let Some(ffcmp_insn) = maybe_input_insn(ctx, flag_input, Opcode::Ffcmp) {
                    lower_fcmp_or_ffcmp_to_flags(ctx, ffcmp_insn);
                    ctx.emit(Inst::CondBr {
                        taken,
                        not_taken,
                        kind,
                    });
                } else {
                    // If the ffcmp result is actually placed in a
                    // register, we need to move it back into the flags.
                    let rn = put_input_in_reg(ctx, flag_input, NarrowValueMode::None);
                    ctx.emit(Inst::MovToNZCV { rn });
                    ctx.emit(Inst::CondBr {
                        taken,
                        not_taken,
                        kind,
                    });
                }
            }

            _ => unimplemented!(),
        }
    } else {
        // Must be an unconditional branch or an indirect branch.
        let op = ctx.data(branches[0]).opcode();
        match op {
            Opcode::Jump | Opcode::Fallthrough => {
                assert!(branches.len() == 1);
                // In the Fallthrough case, the machine-independent driver
                // fills in `targets[0]` with our fallthrough block, so this
                // is valid for both Jump and Fallthrough.
                ctx.emit(Inst::Jump {
                    dest: BranchTarget::Label(targets[0]),
                });
            }

            Opcode::BrTable => {
                // Expand `br_table index, default, JT` to:
                //
                //   emit_island  // this forces an island at this point
                //                // if the jumptable would push us past
                //                // the deadline
                //   subs idx, #jt_size
                //   b.hs default
                //   adr vTmp1, PC+16
                //   ldr vTmp2, [vTmp1, idx, lsl #2]
                //   add vTmp2, vTmp2, vTmp1
                //   br vTmp2
                //   [jumptable offsets relative to JT base]
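                // `targets[0]` is the default target; the remaining entries
                // form the jump table proper.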
                let jt_size = targets.len() - 1;
                assert!(jt_size <= std::u32::MAX as usize);

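                // Worst-case space for the sequence: six 4-byte instructions
                // plus one 4-byte offset per jump-table entry.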
                ctx.emit(Inst::EmitIsland {
                    needed_space: 4 * (6 + jt_size) as CodeOffset,
                });

                let ridx = put_input_in_reg(
                    ctx,
                    InsnInput {
                        insn: branches[0],
                        input: 0,
                    },
                    NarrowValueMode::ZeroExtend32,
                );

                let rtmp1 = ctx.alloc_tmp(I32).only_reg().unwrap();
                let rtmp2 = ctx.alloc_tmp(I32).only_reg().unwrap();

                // Bounds-check, leaving condition codes for JTSequence's
                // branch to default target below.
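                // `subs wzr, idx, #jt_size` is effectively `cmp idx, #jt_size`:
                // the result is discarded and only NZCV is kept, so an
                // out-of-range index yields the `hs` (unsigned >=) condition.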
                if let Some(imm12) = Imm12::maybe_from_u64(jt_size as u64) {
                    ctx.emit(Inst::AluRRImm12 {
                        alu_op: ALUOp::SubS32,
                        rd: writable_zero_reg(),
                        rn: ridx,
                        imm12,
                    });
                } else {
                    lower_constant_u64(ctx, rtmp1, jt_size as u64);
                    ctx.emit(Inst::AluRRR {
                        alu_op: ALUOp::SubS32,
                        rd: writable_zero_reg(),
                        rn: ridx,
                        rm: rtmp1.to_reg(),
                    });
                }

                // Emit the compound instruction that does:
                //
                // b.hs default
                // adr rA, jt
                // ldrsw rB, [rA, rIndex, UXTW 2]
                // add rA, rA, rB
                // br rA
                // [jt entries]
                //
                // This must be *one* instruction in the vcode because
                // we cannot allow regalloc to insert any spills/fills
                // in the middle of the sequence; otherwise, the ADR's
                // PC-rel offset to the jumptable would be incorrect.
                // (The alternative is to introduce a relocation pass
                // for inlined jumptables, which is much worse, IMHO.)

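                // Skip `targets[0]` (the default target); the rest become the
                // table entries, in order.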
                let jt_targets: Vec<BranchTarget> = targets
                    .iter()
                    .skip(1)
                    .map(|bix| BranchTarget::Label(*bix))
                    .collect();
                let default_target = BranchTarget::Label(targets[0]);
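                // Keep the full target list (default included) so the
                // terminator can report every possible successor.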
                let targets_for_term: Vec<MachLabel> = targets.to_vec();
                ctx.emit(Inst::JTSequence {
                    ridx,
                    rtmp1,
                    rtmp2,
                    info: Box::new(JTSequenceInfo {
                        targets: jt_targets,
                        default_target,
                        targets_for_term,
                    }),
                });
            }

            _ => panic!("Unknown branch type!"),
        }
    }

    Ok(())
}