1 //! Lower a single Cranelift instruction into vcode.
2 
3 use crate::binemit::CodeOffset;
4 use crate::ir::condcodes::FloatCC;
5 use crate::ir::types::*;
6 use crate::ir::Inst as IRInst;
7 use crate::ir::{InstructionData, Opcode, TrapCode};
8 use crate::isa::aarch64::settings as aarch64_settings;
9 use crate::machinst::lower::*;
10 use crate::machinst::*;
11 use crate::settings::Flags;
12 use crate::{CodegenError, CodegenResult};
13 
14 use crate::isa::aarch64::abi::*;
15 use crate::isa::aarch64::inst::*;
16 
17 use regalloc::Writable;
18 
19 use alloc::boxed::Box;
20 use alloc::vec::Vec;
21 use core::convert::TryFrom;
22 
23 use super::lower::*;
24 
25 /// Actually codegen an instruction's results into registers.
lower_insn_to_regs<C: LowerCtx<I = Inst>>( ctx: &mut C, insn: IRInst, flags: &Flags, isa_flags: &aarch64_settings::Flags, ) -> CodegenResult<()>26 pub(crate) fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
27     ctx: &mut C,
28     insn: IRInst,
29     flags: &Flags,
30     isa_flags: &aarch64_settings::Flags,
31 ) -> CodegenResult<()> {
32     let op = ctx.data(insn).opcode();
33     let inputs = insn_inputs(ctx, insn);
34     let outputs = insn_outputs(ctx, insn);
35     let ty = if outputs.len() > 0 {
36         Some(ctx.output_ty(insn, 0))
37     } else {
38         None
39     };
40 
41     match op {
42         Opcode::Iconst | Opcode::Bconst | Opcode::Null => {
43             let value = ctx.get_constant(insn).unwrap();
44             // Sign extend constant if necessary
45             let value = match ty.unwrap() {
46                 I8 => (((value as i64) << 56) >> 56) as u64,
47                 I16 => (((value as i64) << 48) >> 48) as u64,
48                 I32 => (((value as i64) << 32) >> 32) as u64,
49                 I64 | R64 => value,
50                 ty if ty.is_bool() => value,
51                 ty => unreachable!("Unknown type for const: {}", ty),
52             };
53             let rd = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
54             lower_constant_u64(ctx, rd, value);
55         }
56         Opcode::F32const => {
57             let value = f32::from_bits(ctx.get_constant(insn).unwrap() as u32);
58             let rd = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
59             lower_constant_f32(ctx, rd, value);
60         }
61         Opcode::F64const => {
62             let value = f64::from_bits(ctx.get_constant(insn).unwrap());
63             let rd = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
64             lower_constant_f64(ctx, rd, value);
65         }
66         Opcode::Iadd => {
67             let rd = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
68             let ty = ty.unwrap();
69             if !ty.is_vector() {
70                 let mul_insn =
71                     if let Some(mul_insn) = maybe_input_insn(ctx, inputs[1], Opcode::Imul) {
72                         Some((mul_insn, 0))
73                     } else if let Some(mul_insn) = maybe_input_insn(ctx, inputs[0], Opcode::Imul) {
74                         Some((mul_insn, 1))
75                     } else {
76                         None
77                     };
78                 // If possible combine mul + add into madd.
79                 if let Some((insn, addend_idx)) = mul_insn {
80                     let alu_op = choose_32_64(ty, ALUOp3::MAdd32, ALUOp3::MAdd64);
81                     let rn_input = InsnInput { insn, input: 0 };
82                     let rm_input = InsnInput { insn, input: 1 };
83 
84                     let rn = put_input_in_reg(ctx, rn_input, NarrowValueMode::None);
85                     let rm = put_input_in_reg(ctx, rm_input, NarrowValueMode::None);
86                     let ra = put_input_in_reg(ctx, inputs[addend_idx], NarrowValueMode::None);
87 
88                     ctx.emit(Inst::AluRRRR {
89                         alu_op,
90                         rd,
91                         rn,
92                         rm,
93                         ra,
94                     });
95                 } else {
96                     let rn = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None);
97                     let (rm, negated) = put_input_in_rse_imm12_maybe_negated(
98                         ctx,
99                         inputs[1],
100                         ty_bits(ty),
101                         NarrowValueMode::None,
102                     );
103                     let alu_op = if !negated {
104                         choose_32_64(ty, ALUOp::Add32, ALUOp::Add64)
105                     } else {
106                         choose_32_64(ty, ALUOp::Sub32, ALUOp::Sub64)
107                     };
108                     ctx.emit(alu_inst_imm12(alu_op, rd, rn, rm));
109                 }
110             } else {
111                 let rm = put_input_in_reg(ctx, inputs[1], NarrowValueMode::None);
112                 let rn = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None);
113                 ctx.emit(Inst::VecRRR {
114                     rd,
115                     rn,
116                     rm,
117                     alu_op: VecALUOp::Add,
118                     size: VectorSize::from_ty(ty),
119                 });
120             }
121         }
122         Opcode::Isub => {
123             let rd = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
124             let rn = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None);
125             let ty = ty.unwrap();
126             if !ty.is_vector() {
127                 let (rm, negated) = put_input_in_rse_imm12_maybe_negated(
128                     ctx,
129                     inputs[1],
130                     ty_bits(ty),
131                     NarrowValueMode::None,
132                 );
133                 let alu_op = if !negated {
134                     choose_32_64(ty, ALUOp::Sub32, ALUOp::Sub64)
135                 } else {
136                     choose_32_64(ty, ALUOp::Add32, ALUOp::Add64)
137                 };
138                 ctx.emit(alu_inst_imm12(alu_op, rd, rn, rm));
139             } else {
140                 let rm = put_input_in_reg(ctx, inputs[1], NarrowValueMode::None);
141                 ctx.emit(Inst::VecRRR {
142                     rd,
143                     rn,
144                     rm,
145                     alu_op: VecALUOp::Sub,
146                     size: VectorSize::from_ty(ty),
147                 });
148             }
149         }
150         Opcode::UaddSat | Opcode::SaddSat | Opcode::UsubSat | Opcode::SsubSat => {
151             let ty = ty.unwrap();
152             assert!(ty.is_vector());
153             let rd = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
154             let rn = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None);
155             let rm = put_input_in_reg(ctx, inputs[1], NarrowValueMode::None);
156 
157             let alu_op = match op {
158                 Opcode::UaddSat => VecALUOp::Uqadd,
159                 Opcode::SaddSat => VecALUOp::Sqadd,
160                 Opcode::UsubSat => VecALUOp::Uqsub,
161                 Opcode::SsubSat => VecALUOp::Sqsub,
162                 _ => unreachable!(),
163             };
164 
165             ctx.emit(Inst::VecRRR {
166                 rd,
167                 rn,
168                 rm,
169                 alu_op,
170                 size: VectorSize::from_ty(ty),
171             });
172         }
173 
174         Opcode::Ineg => {
175             let rd = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
176             let ty = ty.unwrap();
177             if !ty.is_vector() {
178                 let rn = zero_reg();
179                 let rm = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None);
180                 let alu_op = choose_32_64(ty, ALUOp::Sub32, ALUOp::Sub64);
181                 ctx.emit(Inst::AluRRR { alu_op, rd, rn, rm });
182             } else {
183                 let rn = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None);
184                 ctx.emit(Inst::VecMisc {
185                     op: VecMisc2::Neg,
186                     rd,
187                     rn,
188                     size: VectorSize::from_ty(ty),
189                 });
190             }
191         }
192 
193         Opcode::Imul => {
194             let rd = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
195             let rn = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None);
196             let rm = put_input_in_reg(ctx, inputs[1], NarrowValueMode::None);
197             let ty = ty.unwrap();
198             if !ty.is_vector() {
199                 let alu_op = choose_32_64(ty, ALUOp3::MAdd32, ALUOp3::MAdd64);
200                 ctx.emit(Inst::AluRRRR {
201                     alu_op,
202                     rd,
203                     rn,
204                     rm,
205                     ra: zero_reg(),
206                 });
207             } else {
208                 if ty == I64X2 {
209                     let tmp1 = ctx.alloc_tmp(I64X2).only_reg().unwrap();
210                     let tmp2 = ctx.alloc_tmp(I64X2).only_reg().unwrap();
211 
212                     // This I64X2 multiplication is performed with several 32-bit
213                     // operations.
214 
215                     // 64-bit numbers x and y, can be represented as:
216                     //   x = a + 2^32(b)
217                     //   y = c + 2^32(d)
218 
219                     // A 64-bit multiplication is:
220                     //   x * y = ac + 2^32(ad + bc) + 2^64(bd)
221                     // note: `2^64(bd)` can be ignored, the value is too large to fit in
222                     // 64 bits.
223 
224                     // This sequence implements a I64X2 multiply, where the registers
225                     // `rn` and `rm` are split up into 32-bit components:
226                     //   rn = |d|c|b|a|
227                     //   rm = |h|g|f|e|
228                     //
229                     //   rn * rm = |cg + 2^32(ch + dg)|ae + 2^32(af + be)|
230                     //
231                     //  The sequence is:
232                     //  rev64 rd.4s, rm.4s
233                     //  mul rd.4s, rd.4s, rn.4s
234                     //  xtn tmp1.2s, rn.2d
235                     //  addp rd.4s, rd.4s, rd.4s
236                     //  xtn tmp2.2s, rm.2d
237                     //  shll rd.2d, rd.2s, #32
238                     //  umlal rd.2d, tmp2.2s, tmp1.2s
239 
240                     // Reverse the 32-bit elements in the 64-bit words.
241                     //   rd = |g|h|e|f|
242                     ctx.emit(Inst::VecMisc {
243                         op: VecMisc2::Rev64,
244                         rd,
245                         rn: rm,
246                         size: VectorSize::Size32x4,
247                     });
248 
249                     // Calculate the high half components.
250                     //   rd = |dg|ch|be|af|
251                     //
252                     // Note that this 32-bit multiply of the high half
253                     // discards the bits that would overflow, same as
254                     // if 64-bit operations were used. Also the Shll
255                     // below would shift out the overflow bits anyway.
256                     ctx.emit(Inst::VecRRR {
257                         alu_op: VecALUOp::Mul,
258                         rd,
259                         rn: rd.to_reg(),
260                         rm: rn,
261                         size: VectorSize::Size32x4,
262                     });
263 
264                     // Extract the low half components of rn.
265                     //   tmp1 = |c|a|
266                     ctx.emit(Inst::VecMiscNarrow {
267                         op: VecMiscNarrowOp::Xtn,
268                         rd: tmp1,
269                         rn,
270                         size: VectorSize::Size32x2,
271                         high_half: false,
272                     });
273 
274                     // Sum the respective high half components.
275                     //   rd = |dg+ch|be+af||dg+ch|be+af|
276                     ctx.emit(Inst::VecRRR {
277                         alu_op: VecALUOp::Addp,
278                         rd: rd,
279                         rn: rd.to_reg(),
280                         rm: rd.to_reg(),
281                         size: VectorSize::Size32x4,
282                     });
283 
284                     // Extract the low half components of rm.
285                     //   tmp2 = |g|e|
286                     ctx.emit(Inst::VecMiscNarrow {
287                         op: VecMiscNarrowOp::Xtn,
288                         rd: tmp2,
289                         rn: rm,
290                         size: VectorSize::Size32x2,
291                         high_half: false,
292                     });
293 
294                     // Shift the high half components, into the high half.
295                     //   rd = |dg+ch << 32|be+af << 32|
296                     ctx.emit(Inst::VecMisc {
297                         op: VecMisc2::Shll,
298                         rd,
299                         rn: rd.to_reg(),
300                         size: VectorSize::Size32x2,
301                     });
302 
303                     // Multiply the low components together, and accumulate with the high
304                     // half.
305                     //   rd = |rd[1] + cg|rd[0] + ae|
306                     ctx.emit(Inst::VecRRR {
307                         alu_op: VecALUOp::Umlal,
308                         rd,
309                         rn: tmp2.to_reg(),
310                         rm: tmp1.to_reg(),
311                         size: VectorSize::Size32x2,
312                     });
313                 } else {
314                     ctx.emit(Inst::VecRRR {
315                         alu_op: VecALUOp::Mul,
316                         rd,
317                         rn,
318                         rm,
319                         size: VectorSize::from_ty(ty),
320                     });
321                 }
322             }
323         }
324 
325         Opcode::Umulhi | Opcode::Smulhi => {
326             let rd = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
327             let is_signed = op == Opcode::Smulhi;
328             let input_ty = ctx.input_ty(insn, 0);
329             assert!(ctx.input_ty(insn, 1) == input_ty);
330             assert!(ctx.output_ty(insn, 0) == input_ty);
331 
332             match input_ty {
333                 I64 => {
334                     let rn = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None);
335                     let rm = put_input_in_reg(ctx, inputs[1], NarrowValueMode::None);
336                     let alu_op = if is_signed {
337                         ALUOp::SMulH
338                     } else {
339                         ALUOp::UMulH
340                     };
341                     ctx.emit(Inst::AluRRR { alu_op, rd, rn, rm });
342                 }
343                 I32 | I16 | I8 => {
344                     let narrow_mode = if is_signed {
345                         NarrowValueMode::SignExtend64
346                     } else {
347                         NarrowValueMode::ZeroExtend64
348                     };
349                     let rn = put_input_in_reg(ctx, inputs[0], narrow_mode);
350                     let rm = put_input_in_reg(ctx, inputs[1], narrow_mode);
351                     let ra = zero_reg();
352                     ctx.emit(Inst::AluRRRR {
353                         alu_op: ALUOp3::MAdd64,
354                         rd,
355                         rn,
356                         rm,
357                         ra,
358                     });
359                     let shift_op = if is_signed {
360                         ALUOp::Asr64
361                     } else {
362                         ALUOp::Lsr64
363                     };
364                     let shift_amt = match input_ty {
365                         I32 => 32,
366                         I16 => 16,
367                         I8 => 8,
368                         _ => unreachable!(),
369                     };
370                     ctx.emit(Inst::AluRRImmShift {
371                         alu_op: shift_op,
372                         rd,
373                         rn: rd.to_reg(),
374                         immshift: ImmShift::maybe_from_u64(shift_amt).unwrap(),
375                     });
376                 }
377                 _ => {
378                     panic!("Unsupported argument type for umulhi/smulhi: {}", input_ty);
379                 }
380             }
381         }
382 
383         Opcode::Udiv | Opcode::Sdiv | Opcode::Urem | Opcode::Srem => {
384             let is_signed = match op {
385                 Opcode::Udiv | Opcode::Urem => false,
386                 Opcode::Sdiv | Opcode::Srem => true,
387                 _ => unreachable!(),
388             };
389             let is_rem = match op {
390                 Opcode::Udiv | Opcode::Sdiv => false,
391                 Opcode::Urem | Opcode::Srem => true,
392                 _ => unreachable!(),
393             };
394             let narrow_mode = if is_signed {
395                 NarrowValueMode::SignExtend64
396             } else {
397                 NarrowValueMode::ZeroExtend64
398             };
399             // TODO: Add SDiv32 to implement 32-bit directly, rather
400             // than extending the input.
401             let div_op = if is_signed {
402                 ALUOp::SDiv64
403             } else {
404                 ALUOp::UDiv64
405             };
406 
407             let rd = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
408             let rn = put_input_in_reg(ctx, inputs[0], narrow_mode);
409             let rm = put_input_in_reg(ctx, inputs[1], narrow_mode);
410             // The div instruction does not trap on divide by zero or signed overflow
411             // so checks are inserted below.
412             //
413             //   div rd, rn, rm
414             ctx.emit(Inst::AluRRR {
415                 alu_op: div_op,
416                 rd,
417                 rn,
418                 rm,
419             });
420 
421             if is_rem {
422                 // Remainder (rn % rm) is implemented as:
423                 //
424                 //   tmp = rn / rm
425                 //   rd = rn - (tmp*rm)
426                 //
427                 // use 'rd' for tmp and you have:
428                 //
429                 //   div rd, rn, rm       ; rd = rn / rm
430                 //   cbnz rm, #8          ; branch over trap
431                 //   udf                  ; divide by zero
432                 //   msub rd, rd, rm, rn  ; rd = rn - rd * rm
433 
434                 // Check for divide by 0.
435                 let trap_code = TrapCode::IntegerDivisionByZero;
436                 ctx.emit(Inst::TrapIf {
437                     trap_code,
438                     kind: CondBrKind::Zero(rm),
439                 });
440 
441                 ctx.emit(Inst::AluRRRR {
442                     alu_op: ALUOp3::MSub64,
443                     rd: rd,
444                     rn: rd.to_reg(),
445                     rm: rm,
446                     ra: rn,
447                 });
448             } else {
449                 if div_op == ALUOp::SDiv64 {
450                     //   cbnz rm, #8
451                     //   udf ; divide by zero
452                     //   cmn rm, 1
453                     //   ccmp rn, 1, #nzcv, eq
454                     //   b.vc #8
455                     //   udf ; signed overflow
456 
457                     // Check for divide by 0.
458                     let trap_code = TrapCode::IntegerDivisionByZero;
459                     ctx.emit(Inst::TrapIf {
460                         trap_code,
461                         kind: CondBrKind::Zero(rm),
462                     });
463 
464                     // Check for signed overflow. The only case is min_value / -1.
465                     let ty = ty.unwrap();
466                     // The following checks must be done in 32-bit or 64-bit, depending
467                     // on the input type. Even though the initial div instruction is
468                     // always done in 64-bit currently.
469                     let size = OperandSize::from_ty(ty);
470                     // Check RHS is -1.
471                     ctx.emit(Inst::AluRRImm12 {
472                         alu_op: choose_32_64(ty, ALUOp::AddS32, ALUOp::AddS64),
473                         rd: writable_zero_reg(),
474                         rn: rm,
475                         imm12: Imm12::maybe_from_u64(1).unwrap(),
476                     });
477                     // Check LHS is min_value, by subtracting 1 and branching if
478                     // there is overflow.
479                     ctx.emit(Inst::CCmpImm {
480                         size,
481                         rn,
482                         imm: UImm5::maybe_from_u8(1).unwrap(),
483                         nzcv: NZCV::new(false, false, false, false),
484                         cond: Cond::Eq,
485                     });
486                     let trap_code = TrapCode::IntegerOverflow;
487                     ctx.emit(Inst::TrapIf {
488                         trap_code,
489                         kind: CondBrKind::Cond(Cond::Vs),
490                     });
491                 } else {
492                     //   cbnz rm, #8
493                     //   udf ; divide by zero
494 
495                     // Check for divide by 0.
496                     let trap_code = TrapCode::IntegerDivisionByZero;
497                     ctx.emit(Inst::TrapIf {
498                         trap_code,
499                         kind: CondBrKind::Zero(rm),
500                     });
501                 }
502             }
503         }
504 
505         Opcode::Uextend | Opcode::Sextend => {
506             let output_ty = ty.unwrap();
507             let input_ty = ctx.input_ty(insn, 0);
508             let from_bits = ty_bits(input_ty) as u8;
509             let to_bits = ty_bits(output_ty) as u8;
510             let to_bits = std::cmp::max(32, to_bits);
511             assert!(from_bits <= to_bits);
512             if from_bits < to_bits {
513                 let signed = op == Opcode::Sextend;
514                 let rd = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
515 
516                 if let Some(extract_insn) = maybe_input_insn(ctx, inputs[0], Opcode::Extractlane) {
517                     let idx =
518                         if let InstructionData::BinaryImm8 { imm, .. } = ctx.data(extract_insn) {
519                             *imm
520                         } else {
521                             unreachable!();
522                         };
523                     let input = InsnInput {
524                         insn: extract_insn,
525                         input: 0,
526                     };
527                     let rn = put_input_in_reg(ctx, input, NarrowValueMode::None);
528                     let size = VectorSize::from_ty(ctx.input_ty(extract_insn, 0));
529 
530                     if signed {
531                         let scalar_size = OperandSize::from_ty(output_ty);
532 
533                         ctx.emit(Inst::MovFromVecSigned {
534                             rd,
535                             rn,
536                             idx,
537                             size,
538                             scalar_size,
539                         });
540                     } else {
541                         ctx.emit(Inst::MovFromVec { rd, rn, idx, size });
542                     }
543                 } else {
544                     // If we reach this point, we weren't able to incorporate the extend as
545                     // a register-mode on another instruction, so we have a 'None'
546                     // narrow-value/extend mode here, and we emit the explicit instruction.
547                     let rn = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None);
548                     ctx.emit(Inst::Extend {
549                         rd,
550                         rn,
551                         signed,
552                         from_bits,
553                         to_bits,
554                     });
555                 }
556             }
557         }
558 
559         Opcode::Bnot => {
560             let rd = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
561             let ty = ty.unwrap();
562             if !ty.is_vector() {
563                 let rm = put_input_in_rs_immlogic(ctx, inputs[0], NarrowValueMode::None);
564                 let alu_op = choose_32_64(ty, ALUOp::OrrNot32, ALUOp::OrrNot64);
565                 // NOT rd, rm ==> ORR_NOT rd, zero, rm
566                 ctx.emit(alu_inst_immlogic(alu_op, rd, zero_reg(), rm));
567             } else {
568                 let rm = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None);
569                 ctx.emit(Inst::VecMisc {
570                     op: VecMisc2::Not,
571                     rd,
572                     rn: rm,
573                     size: VectorSize::from_ty(ty),
574                 });
575             }
576         }
577 
578         Opcode::Band
579         | Opcode::Bor
580         | Opcode::Bxor
581         | Opcode::BandNot
582         | Opcode::BorNot
583         | Opcode::BxorNot => {
584             let rd = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
585             let ty = ty.unwrap();
586             if !ty.is_vector() {
587                 let rn = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None);
588                 let rm = put_input_in_rs_immlogic(ctx, inputs[1], NarrowValueMode::None);
589                 let alu_op = match op {
590                     Opcode::Band => choose_32_64(ty, ALUOp::And32, ALUOp::And64),
591                     Opcode::Bor => choose_32_64(ty, ALUOp::Orr32, ALUOp::Orr64),
592                     Opcode::Bxor => choose_32_64(ty, ALUOp::Eor32, ALUOp::Eor64),
593                     Opcode::BandNot => choose_32_64(ty, ALUOp::AndNot32, ALUOp::AndNot64),
594                     Opcode::BorNot => choose_32_64(ty, ALUOp::OrrNot32, ALUOp::OrrNot64),
595                     Opcode::BxorNot => choose_32_64(ty, ALUOp::EorNot32, ALUOp::EorNot64),
596                     _ => unreachable!(),
597                 };
598                 ctx.emit(alu_inst_immlogic(alu_op, rd, rn, rm));
599             } else {
600                 let alu_op = match op {
601                     Opcode::Band => VecALUOp::And,
602                     Opcode::BandNot => VecALUOp::Bic,
603                     Opcode::Bor => VecALUOp::Orr,
604                     Opcode::Bxor => VecALUOp::Eor,
605                     _ => unreachable!(),
606                 };
607 
608                 let rn = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None);
609                 let rm = put_input_in_reg(ctx, inputs[1], NarrowValueMode::None);
610                 let rd = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
611 
612                 ctx.emit(Inst::VecRRR {
613                     alu_op,
614                     rd,
615                     rn,
616                     rm,
617                     size: VectorSize::from_ty(ty),
618                 });
619             }
620         }
621 
622         Opcode::Ishl | Opcode::Ushr | Opcode::Sshr => {
623             let ty = ty.unwrap();
624             let rd = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
625             if !ty.is_vector() {
626                 let size = OperandSize::from_bits(ty_bits(ty));
627                 let narrow_mode = match (op, size) {
628                     (Opcode::Ishl, _) => NarrowValueMode::None,
629                     (Opcode::Ushr, OperandSize::Size64) => NarrowValueMode::ZeroExtend64,
630                     (Opcode::Ushr, OperandSize::Size32) => NarrowValueMode::ZeroExtend32,
631                     (Opcode::Sshr, OperandSize::Size64) => NarrowValueMode::SignExtend64,
632                     (Opcode::Sshr, OperandSize::Size32) => NarrowValueMode::SignExtend32,
633                     _ => unreachable!(),
634                 };
635                 let rn = put_input_in_reg(ctx, inputs[0], narrow_mode);
636                 let rm = put_input_in_reg_immshift(ctx, inputs[1], ty_bits(ty));
637                 let alu_op = match op {
638                     Opcode::Ishl => choose_32_64(ty, ALUOp::Lsl32, ALUOp::Lsl64),
639                     Opcode::Ushr => choose_32_64(ty, ALUOp::Lsr32, ALUOp::Lsr64),
640                     Opcode::Sshr => choose_32_64(ty, ALUOp::Asr32, ALUOp::Asr64),
641                     _ => unreachable!(),
642                 };
643                 ctx.emit(alu_inst_immshift(alu_op, rd, rn, rm));
644             } else {
645                 let rn = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None);
646                 let size = VectorSize::from_ty(ty);
647                 let (alu_op, is_right_shift) = match op {
648                     Opcode::Ishl => (VecALUOp::Sshl, false),
649                     Opcode::Ushr => (VecALUOp::Ushl, true),
650                     Opcode::Sshr => (VecALUOp::Sshl, true),
651                     _ => unreachable!(),
652                 };
653 
654                 let rm = if is_right_shift {
655                     // Right shifts are implemented with a negative left shift.
656                     let tmp = ctx.alloc_tmp(I32).only_reg().unwrap();
657                     let rm = put_input_in_reg(ctx, inputs[1], NarrowValueMode::None);
658                     let rn = zero_reg();
659                     ctx.emit(Inst::AluRRR {
660                         alu_op: ALUOp::Sub32,
661                         rd: tmp,
662                         rn,
663                         rm,
664                     });
665                     tmp.to_reg()
666                 } else {
667                     put_input_in_reg(ctx, inputs[1], NarrowValueMode::None)
668                 };
669 
670                 ctx.emit(Inst::VecDup { rd, rn: rm, size });
671 
672                 ctx.emit(Inst::VecRRR {
673                     alu_op,
674                     rd,
675                     rn,
676                     rm: rd.to_reg(),
677                     size,
678                 });
679             }
680         }
681 
682         Opcode::Rotr | Opcode::Rotl => {
683             // aarch64 doesn't have a left-rotate instruction, but a left rotation of K places is
684             // effectively a right rotation of N - K places, if N is the integer's bit size. We
685             // implement left rotations with this trick.
686             //
687             // For a 32-bit or 64-bit rotate-right, we can use the ROR instruction directly.
688             //
689             // For a < 32-bit rotate-right, we synthesize this as:
690             //
691             //    rotr rd, rn, rm
692             //
693             //       =>
694             //
695             //    zero-extend rn, <32-or-64>
696             //    and tmp_masked_rm, rm, <bitwidth - 1>
697             //    sub tmp1, tmp_masked_rm, <bitwidth>
698             //    sub tmp1, zero, tmp1  ; neg
699             //    lsr tmp2, rn, tmp_masked_rm
700             //    lsl rd, rn, tmp1
701             //    orr rd, rd, tmp2
702             //
703             // For a constant amount, we can instead do:
704             //
705             //    zero-extend rn, <32-or-64>
706             //    lsr tmp2, rn, #<shiftimm>
707             //    lsl rd, rn, <bitwidth - shiftimm>
708             //    orr rd, rd, tmp2
709 
710             let is_rotl = op == Opcode::Rotl;
711 
712             let ty = ty.unwrap();
713             let ty_bits_size = ty_bits(ty) as u8;
714 
715             let rd = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
716             let rn = put_input_in_reg(
717                 ctx,
718                 inputs[0],
719                 if ty_bits_size <= 32 {
720                     NarrowValueMode::ZeroExtend32
721                 } else {
722                     NarrowValueMode::ZeroExtend64
723                 },
724             );
725             let rm = put_input_in_reg_immshift(ctx, inputs[1], ty_bits(ty));
726 
727             if ty_bits_size == 32 || ty_bits_size == 64 {
728                 let alu_op = choose_32_64(ty, ALUOp::RotR32, ALUOp::RotR64);
729                 match rm {
730                     ResultRegImmShift::ImmShift(mut immshift) => {
731                         if is_rotl {
732                             immshift.imm = ty_bits_size.wrapping_sub(immshift.value());
733                         }
734                         immshift.imm &= ty_bits_size - 1;
735                         ctx.emit(Inst::AluRRImmShift {
736                             alu_op,
737                             rd,
738                             rn,
739                             immshift,
740                         });
741                     }
742 
743                     ResultRegImmShift::Reg(rm) => {
744                         let rm = if is_rotl {
745                             // Really ty_bits_size - rn, but the upper bits of the result are
746                             // ignored (because of the implicit masking done by the instruction),
747                             // so this is equivalent to negating the input.
748                             let alu_op = choose_32_64(ty, ALUOp::Sub32, ALUOp::Sub64);
749                             let tmp = ctx.alloc_tmp(ty).only_reg().unwrap();
750                             ctx.emit(Inst::AluRRR {
751                                 alu_op,
752                                 rd: tmp,
753                                 rn: zero_reg(),
754                                 rm,
755                             });
756                             tmp.to_reg()
757                         } else {
758                             rm
759                         };
760                         ctx.emit(Inst::AluRRR { alu_op, rd, rn, rm });
761                     }
762                 }
763             } else {
764                 debug_assert!(ty_bits_size < 32);
765 
766                 match rm {
767                     ResultRegImmShift::Reg(reg) => {
768                         let reg = if is_rotl {
769                             // Really ty_bits_size - rn, but the upper bits of the result are
770                             // ignored (because of the implicit masking done by the instruction),
771                             // so this is equivalent to negating the input.
772                             let tmp = ctx.alloc_tmp(I32).only_reg().unwrap();
773                             ctx.emit(Inst::AluRRR {
774                                 alu_op: ALUOp::Sub32,
775                                 rd: tmp,
776                                 rn: zero_reg(),
777                                 rm: reg,
778                             });
779                             tmp.to_reg()
780                         } else {
781                             reg
782                         };
783 
784                         // Explicitly mask the rotation count.
785                         let tmp_masked_rm = ctx.alloc_tmp(I32).only_reg().unwrap();
786                         ctx.emit(Inst::AluRRImmLogic {
787                             alu_op: ALUOp::And32,
788                             rd: tmp_masked_rm,
789                             rn: reg,
790                             imml: ImmLogic::maybe_from_u64((ty_bits_size - 1) as u64, I32).unwrap(),
791                         });
792                         let tmp_masked_rm = tmp_masked_rm.to_reg();
793 
794                         let tmp1 = ctx.alloc_tmp(I32).only_reg().unwrap();
795                         let tmp2 = ctx.alloc_tmp(I32).only_reg().unwrap();
796                         ctx.emit(Inst::AluRRImm12 {
797                             alu_op: ALUOp::Sub32,
798                             rd: tmp1,
799                             rn: tmp_masked_rm,
800                             imm12: Imm12::maybe_from_u64(ty_bits_size as u64).unwrap(),
801                         });
802                         ctx.emit(Inst::AluRRR {
803                             alu_op: ALUOp::Sub32,
804                             rd: tmp1,
805                             rn: zero_reg(),
806                             rm: tmp1.to_reg(),
807                         });
808                         ctx.emit(Inst::AluRRR {
809                             alu_op: ALUOp::Lsr32,
810                             rd: tmp2,
811                             rn,
812                             rm: tmp_masked_rm,
813                         });
814                         ctx.emit(Inst::AluRRR {
815                             alu_op: ALUOp::Lsl32,
816                             rd,
817                             rn,
818                             rm: tmp1.to_reg(),
819                         });
820                         ctx.emit(Inst::AluRRR {
821                             alu_op: ALUOp::Orr32,
822                             rd,
823                             rn: rd.to_reg(),
824                             rm: tmp2.to_reg(),
825                         });
826                     }
827 
828                     ResultRegImmShift::ImmShift(mut immshift) => {
829                         if is_rotl {
830                             immshift.imm = ty_bits_size.wrapping_sub(immshift.value());
831                         }
832                         immshift.imm &= ty_bits_size - 1;
833 
834                         let tmp1 = ctx.alloc_tmp(I32).only_reg().unwrap();
835                         ctx.emit(Inst::AluRRImmShift {
836                             alu_op: ALUOp::Lsr32,
837                             rd: tmp1,
838                             rn,
839                             immshift: immshift.clone(),
840                         });
841 
842                         let amount = immshift.value() & (ty_bits_size - 1);
843                         let opp_shift =
844                             ImmShift::maybe_from_u64(ty_bits_size as u64 - amount as u64).unwrap();
845                         ctx.emit(Inst::AluRRImmShift {
846                             alu_op: ALUOp::Lsl32,
847                             rd,
848                             rn,
849                             immshift: opp_shift,
850                         });
851 
852                         ctx.emit(Inst::AluRRR {
853                             alu_op: ALUOp::Orr32,
854                             rd,
855                             rn: rd.to_reg(),
856                             rm: tmp1.to_reg(),
857                         });
858                     }
859                 }
860             }
861         }
862 
863         Opcode::Bitrev | Opcode::Clz | Opcode::Cls | Opcode::Ctz => {
864             let rd = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
865             let needs_zext = match op {
866                 Opcode::Bitrev | Opcode::Ctz => false,
867                 Opcode::Clz | Opcode::Cls => true,
868                 _ => unreachable!(),
869             };
870             let ty = ty.unwrap();
871             let narrow_mode = if needs_zext && ty_bits(ty) == 64 {
872                 NarrowValueMode::ZeroExtend64
873             } else if needs_zext {
874                 NarrowValueMode::ZeroExtend32
875             } else {
876                 NarrowValueMode::None
877             };
878             let rn = put_input_in_reg(ctx, inputs[0], narrow_mode);
879             let op_ty = match ty {
880                 I8 | I16 | I32 => I32,
881                 I64 => I64,
882                 _ => panic!("Unsupported type for Bitrev/Clz/Cls"),
883             };
884             let bitop = match op {
885                 Opcode::Clz | Opcode::Cls | Opcode::Bitrev => BitOp::from((op, op_ty)),
886                 Opcode::Ctz => BitOp::from((Opcode::Bitrev, op_ty)),
887                 _ => unreachable!(),
888             };
889             ctx.emit(Inst::BitRR { rd, rn, op: bitop });
890 
891             // Both bitrev and ctz use a bit-reverse (rbit) instruction; ctz to reduce the problem
892             // to a clz, and bitrev as the main operation.
893             if op == Opcode::Bitrev || op == Opcode::Ctz {
894                 // Reversing an n-bit value (n < 32) with a 32-bit bitrev instruction will place
895                 // the reversed result in the highest n bits, so we need to shift them down into
896                 // place.
897                 let right_shift = match ty {
898                     I8 => Some(24),
899                     I16 => Some(16),
900                     I32 => None,
901                     I64 => None,
902                     _ => panic!("Unsupported type for Bitrev"),
903                 };
904                 if let Some(s) = right_shift {
905                     ctx.emit(Inst::AluRRImmShift {
906                         alu_op: ALUOp::Lsr32,
907                         rd,
908                         rn: rd.to_reg(),
909                         immshift: ImmShift::maybe_from_u64(s).unwrap(),
910                     });
911                 }
912             }
913 
914             if op == Opcode::Ctz {
915                 ctx.emit(Inst::BitRR {
916                     op: BitOp::from((Opcode::Clz, op_ty)),
917                     rd,
918                     rn: rd.to_reg(),
919                 });
920             }
921         }
922 
923         Opcode::Popcnt => {
924             let rd = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
925             let rn = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None);
926             let ty = ty.unwrap();
927             let size = ScalarSize::from_operand_size(OperandSize::from_ty(ty));
928             let tmp = ctx.alloc_tmp(I8X16).only_reg().unwrap();
929 
930             // fmov tmp, rn
931             // cnt tmp.8b, tmp.8b
932             // addp tmp.8b, tmp.8b, tmp.8b / addv tmp, tmp.8b / (no instruction for 8-bit inputs)
933             // umov rd, tmp.b[0]
934 
935             ctx.emit(Inst::MovToFpu {
936                 rd: tmp,
937                 rn: rn,
938                 size,
939             });
940             ctx.emit(Inst::VecMisc {
941                 op: VecMisc2::Cnt,
942                 rd: tmp,
943                 rn: tmp.to_reg(),
944                 size: VectorSize::Size8x8,
945             });
946 
947             match ScalarSize::from_ty(ty) {
948                 ScalarSize::Size8 => {}
949                 ScalarSize::Size16 => {
950                     // ADDP is usually cheaper than ADDV.
951                     ctx.emit(Inst::VecRRR {
952                         alu_op: VecALUOp::Addp,
953                         rd: tmp,
954                         rn: tmp.to_reg(),
955                         rm: tmp.to_reg(),
956                         size: VectorSize::Size8x8,
957                     });
958                 }
959                 ScalarSize::Size32 | ScalarSize::Size64 => {
960                     ctx.emit(Inst::VecLanes {
961                         op: VecLanesOp::Addv,
962                         rd: tmp,
963                         rn: tmp.to_reg(),
964                         size: VectorSize::Size8x8,
965                     });
966                 }
967                 sz => panic!("Unexpected scalar FP operand size: {:?}", sz),
968             }
969 
970             ctx.emit(Inst::MovFromVec {
971                 rd,
972                 rn: tmp.to_reg(),
973                 idx: 0,
974                 size: VectorSize::Size8x16,
975             });
976         }
977 
978         Opcode::Load
979         | Opcode::Uload8
980         | Opcode::Sload8
981         | Opcode::Uload16
982         | Opcode::Sload16
983         | Opcode::Uload32
984         | Opcode::Sload32
985         | Opcode::LoadComplex
986         | Opcode::Uload8Complex
987         | Opcode::Sload8Complex
988         | Opcode::Uload16Complex
989         | Opcode::Sload16Complex
990         | Opcode::Uload32Complex
991         | Opcode::Sload32Complex
992         | Opcode::Sload8x8
993         | Opcode::Uload8x8
994         | Opcode::Sload16x4
995         | Opcode::Uload16x4
996         | Opcode::Sload32x2
997         | Opcode::Uload32x2
998         | Opcode::Uload8x8Complex
999         | Opcode::Sload8x8Complex
1000         | Opcode::Uload16x4Complex
1001         | Opcode::Sload16x4Complex
1002         | Opcode::Uload32x2Complex
1003         | Opcode::Sload32x2Complex => {
1004             let sign_extend = match op {
1005                 Opcode::Sload8
1006                 | Opcode::Sload8Complex
1007                 | Opcode::Sload16
1008                 | Opcode::Sload16Complex
1009                 | Opcode::Sload32
1010                 | Opcode::Sload32Complex => true,
1011                 _ => false,
1012             };
1013             let flags = ctx
1014                 .memflags(insn)
1015                 .expect("Load instruction should have memflags");
1016 
1017             lower_load(
1018                 ctx,
1019                 insn,
1020                 &inputs[..],
1021                 outputs[0],
1022                 |ctx, rd, elem_ty, mem| {
1023                     let is_float = ty_has_float_or_vec_representation(elem_ty);
1024                     ctx.emit(match (ty_bits(elem_ty), sign_extend, is_float) {
1025                         (1, _, _) => Inst::ULoad8 { rd, mem, flags },
1026                         (8, false, _) => Inst::ULoad8 { rd, mem, flags },
1027                         (8, true, _) => Inst::SLoad8 { rd, mem, flags },
1028                         (16, false, _) => Inst::ULoad16 { rd, mem, flags },
1029                         (16, true, _) => Inst::SLoad16 { rd, mem, flags },
1030                         (32, false, false) => Inst::ULoad32 { rd, mem, flags },
1031                         (32, true, false) => Inst::SLoad32 { rd, mem, flags },
1032                         (32, _, true) => Inst::FpuLoad32 { rd, mem, flags },
1033                         (64, _, false) => Inst::ULoad64 { rd, mem, flags },
1034                         // Note that we treat some of the vector loads as scalar floating-point loads,
1035                         // which is correct in a little endian environment.
1036                         (64, _, true) => Inst::FpuLoad64 { rd, mem, flags },
1037                         (128, _, _) => Inst::FpuLoad128 { rd, mem, flags },
1038                         _ => panic!("Unsupported size in load"),
1039                     });
1040 
1041                     let vec_extend = match op {
1042                         Opcode::Sload8x8 => Some(VecExtendOp::Sxtl8),
1043                         Opcode::Sload8x8Complex => Some(VecExtendOp::Sxtl8),
1044                         Opcode::Uload8x8 => Some(VecExtendOp::Uxtl8),
1045                         Opcode::Uload8x8Complex => Some(VecExtendOp::Uxtl8),
1046                         Opcode::Sload16x4 => Some(VecExtendOp::Sxtl16),
1047                         Opcode::Sload16x4Complex => Some(VecExtendOp::Sxtl16),
1048                         Opcode::Uload16x4 => Some(VecExtendOp::Uxtl16),
1049                         Opcode::Uload16x4Complex => Some(VecExtendOp::Uxtl16),
1050                         Opcode::Sload32x2 => Some(VecExtendOp::Sxtl32),
1051                         Opcode::Sload32x2Complex => Some(VecExtendOp::Sxtl32),
1052                         Opcode::Uload32x2 => Some(VecExtendOp::Uxtl32),
1053                         Opcode::Uload32x2Complex => Some(VecExtendOp::Uxtl32),
1054                         _ => None,
1055                     };
1056 
1057                     if let Some(t) = vec_extend {
1058                         ctx.emit(Inst::VecExtend {
1059                             t,
1060                             rd,
1061                             rn: rd.to_reg(),
1062                             high_half: false,
1063                         });
1064                     }
1065                 },
1066             );
1067         }
1068 
1069         Opcode::Store
1070         | Opcode::Istore8
1071         | Opcode::Istore16
1072         | Opcode::Istore32
1073         | Opcode::StoreComplex
1074         | Opcode::Istore8Complex
1075         | Opcode::Istore16Complex
1076         | Opcode::Istore32Complex => {
1077             let off = ctx.data(insn).load_store_offset().unwrap();
1078             let elem_ty = match op {
1079                 Opcode::Istore8 | Opcode::Istore8Complex => I8,
1080                 Opcode::Istore16 | Opcode::Istore16Complex => I16,
1081                 Opcode::Istore32 | Opcode::Istore32Complex => I32,
1082                 Opcode::Store | Opcode::StoreComplex => ctx.input_ty(insn, 0),
1083                 _ => unreachable!(),
1084             };
1085             let is_float = ty_has_float_or_vec_representation(elem_ty);
1086             let flags = ctx
1087                 .memflags(insn)
1088                 .expect("Store instruction should have memflags");
1089 
1090             let mem = lower_address(ctx, elem_ty, &inputs[1..], off);
1091             let rd = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None);
1092 
1093             ctx.emit(match (ty_bits(elem_ty), is_float) {
1094                 (1, _) | (8, _) => Inst::Store8 { rd, mem, flags },
1095                 (16, _) => Inst::Store16 { rd, mem, flags },
1096                 (32, false) => Inst::Store32 { rd, mem, flags },
1097                 (32, true) => Inst::FpuStore32 { rd, mem, flags },
1098                 (64, false) => Inst::Store64 { rd, mem, flags },
1099                 (64, true) => Inst::FpuStore64 { rd, mem, flags },
1100                 (128, _) => Inst::FpuStore128 { rd, mem, flags },
1101                 _ => panic!("Unsupported size in store"),
1102             });
1103         }
1104 
1105         Opcode::StackAddr => {
1106             let (stack_slot, offset) = match *ctx.data(insn) {
1107                 InstructionData::StackLoad {
1108                     opcode: Opcode::StackAddr,
1109                     stack_slot,
1110                     offset,
1111                 } => (stack_slot, offset),
1112                 _ => unreachable!(),
1113             };
1114             let rd = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
1115             let offset: i32 = offset.into();
1116             let inst = ctx
1117                 .abi()
1118                 .stackslot_addr(stack_slot, u32::try_from(offset).unwrap(), rd);
1119             ctx.emit(inst);
1120         }
1121 
1122         Opcode::AtomicRmw => {
1123             let r_dst = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
1124             let mut r_addr = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None);
1125             let mut r_arg2 = put_input_in_reg(ctx, inputs[1], NarrowValueMode::None);
1126             let ty_access = ty.unwrap();
1127             assert!(is_valid_atomic_transaction_ty(ty_access));
1128             // Make sure that both args are in virtual regs, since in effect
1129             // we have to do a parallel copy to get them safely to the AtomicRMW input
1130             // regs, and that's not guaranteed safe if either is in a real reg.
1131             r_addr = ctx.ensure_in_vreg(r_addr, I64);
1132             r_arg2 = ctx.ensure_in_vreg(r_arg2, I64);
1133             // Move the args to the preordained AtomicRMW input regs
1134             ctx.emit(Inst::gen_move(Writable::from_reg(xreg(25)), r_addr, I64));
1135             ctx.emit(Inst::gen_move(Writable::from_reg(xreg(26)), r_arg2, I64));
1136             // Now the AtomicRMW insn itself
1137             let op = inst_common::AtomicRmwOp::from(ctx.data(insn).atomic_rmw_op().unwrap());
1138             ctx.emit(Inst::AtomicRMW { ty: ty_access, op });
1139             // And finally, copy the preordained AtomicRMW output reg to its destination.
1140             ctx.emit(Inst::gen_move(r_dst, xreg(27), I64));
1141             // Also, x24 and x28 are trashed.  `fn aarch64_get_regs` must mention that.
1142         }
1143 
1144         Opcode::AtomicCas => {
1145             let r_dst = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
1146             let mut r_addr = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None);
1147             let mut r_expected = put_input_in_reg(ctx, inputs[1], NarrowValueMode::None);
1148             let mut r_replacement = put_input_in_reg(ctx, inputs[2], NarrowValueMode::None);
1149             let ty_access = ty.unwrap();
1150             assert!(is_valid_atomic_transaction_ty(ty_access));
1151 
1152             if isa_flags.use_lse() {
1153                 ctx.emit(Inst::gen_move(r_dst, r_expected, ty_access));
1154                 ctx.emit(Inst::AtomicCAS {
1155                     rs: r_dst,
1156                     rt: r_replacement,
1157                     rn: r_addr,
1158                     ty: ty_access,
1159                 });
1160             } else {
1161                 // This is very similar to, but not identical to, the AtomicRmw case.  Note
1162                 // that the AtomicCASLoop sequence does its own masking, so we don't need to worry
1163                 // about zero-extending narrow (I8/I16/I32) values here.
1164                 // Make sure that all three args are in virtual regs.  See corresponding comment
1165                 // for `Opcode::AtomicRmw` above.
1166                 r_addr = ctx.ensure_in_vreg(r_addr, I64);
1167                 r_expected = ctx.ensure_in_vreg(r_expected, I64);
1168                 r_replacement = ctx.ensure_in_vreg(r_replacement, I64);
1169                 // Move the args to the preordained AtomicCASLoop input regs
1170                 ctx.emit(Inst::gen_move(Writable::from_reg(xreg(25)), r_addr, I64));
1171                 ctx.emit(Inst::gen_move(
1172                     Writable::from_reg(xreg(26)),
1173                     r_expected,
1174                     I64,
1175                 ));
1176                 ctx.emit(Inst::gen_move(
1177                     Writable::from_reg(xreg(28)),
1178                     r_replacement,
1179                     I64,
1180                 ));
1181                 // Now the AtomicCASLoop itself, implemented in the normal way, with an LL-SC loop
1182                 ctx.emit(Inst::AtomicCASLoop { ty: ty_access });
1183                 // And finally, copy the preordained AtomicCASLoop output reg to its destination.
1184                 ctx.emit(Inst::gen_move(r_dst, xreg(27), I64));
1185                 // Also, x24 and x28 are trashed.  `fn aarch64_get_regs` must mention that.
1186             }
1187         }
1188 
1189         Opcode::AtomicLoad => {
1190             let r_data = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
1191             let r_addr = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None);
1192             let ty_access = ty.unwrap();
1193             assert!(is_valid_atomic_transaction_ty(ty_access));
1194             ctx.emit(Inst::AtomicLoad {
1195                 ty: ty_access,
1196                 r_data,
1197                 r_addr,
1198             });
1199         }
1200 
1201         Opcode::AtomicStore => {
1202             let r_data = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None);
1203             let r_addr = put_input_in_reg(ctx, inputs[1], NarrowValueMode::None);
1204             let ty_access = ctx.input_ty(insn, 0);
1205             assert!(is_valid_atomic_transaction_ty(ty_access));
1206             ctx.emit(Inst::AtomicStore {
1207                 ty: ty_access,
1208                 r_data,
1209                 r_addr,
1210             });
1211         }
1212 
1213         Opcode::Fence => {
1214             ctx.emit(Inst::Fence {});
1215         }
1216 
1217         Opcode::StackLoad | Opcode::StackStore => {
1218             panic!("Direct stack memory access not supported; should not be used by Wasm");
1219         }
1220 
1221         Opcode::HeapAddr => {
1222             panic!("heap_addr should have been removed by legalization!");
1223         }
1224 
1225         Opcode::TableAddr => {
1226             panic!("table_addr should have been removed by legalization!");
1227         }
1228 
1229         Opcode::ConstAddr => unimplemented!(),
1230 
1231         Opcode::Nop => {
1232             // Nothing.
1233         }
1234 
1235         Opcode::Select => {
1236             let flag_input = inputs[0];
1237             let cond = if let Some(icmp_insn) =
1238                 maybe_input_insn_via_conv(ctx, flag_input, Opcode::Icmp, Opcode::Bint)
1239             {
1240                 let condcode = ctx.data(icmp_insn).cond_code().unwrap();
1241                 let cond = lower_condcode(condcode);
1242                 let is_signed = condcode_is_signed(condcode);
1243                 lower_icmp_or_ifcmp_to_flags(ctx, icmp_insn, is_signed);
1244                 cond
1245             } else if let Some(fcmp_insn) =
1246                 maybe_input_insn_via_conv(ctx, flag_input, Opcode::Fcmp, Opcode::Bint)
1247             {
1248                 let condcode = ctx.data(fcmp_insn).fp_cond_code().unwrap();
1249                 let cond = lower_fp_condcode(condcode);
1250                 lower_fcmp_or_ffcmp_to_flags(ctx, fcmp_insn);
1251                 cond
1252             } else {
1253                 let (cmp_op, narrow_mode) = if ty_bits(ctx.input_ty(insn, 0)) > 32 {
1254                     (ALUOp::SubS64, NarrowValueMode::ZeroExtend64)
1255                 } else {
1256                     (ALUOp::SubS32, NarrowValueMode::ZeroExtend32)
1257                 };
1258 
1259                 let rcond = put_input_in_reg(ctx, inputs[0], narrow_mode);
1260                 // cmp rcond, #0
1261                 ctx.emit(Inst::AluRRR {
1262                     alu_op: cmp_op,
1263                     rd: writable_zero_reg(),
1264                     rn: rcond,
1265                     rm: zero_reg(),
1266                 });
1267                 Cond::Ne
1268             };
1269 
1270             // csel.cond rd, rn, rm
1271             let rd = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
1272             let rn = put_input_in_reg(ctx, inputs[1], NarrowValueMode::None);
1273             let rm = put_input_in_reg(ctx, inputs[2], NarrowValueMode::None);
1274             let ty = ctx.output_ty(insn, 0);
1275             let bits = ty_bits(ty);
1276             let is_float = ty_has_float_or_vec_representation(ty);
1277             if is_float && bits == 32 {
1278                 ctx.emit(Inst::FpuCSel32 { cond, rd, rn, rm });
1279             } else if is_float && bits == 64 {
1280                 ctx.emit(Inst::FpuCSel64 { cond, rd, rn, rm });
1281             } else if is_float && bits == 128 {
1282                 ctx.emit(Inst::VecCSel { cond, rd, rn, rm });
1283             } else {
1284                 ctx.emit(Inst::CSel { cond, rd, rn, rm });
1285             }
1286         }
1287 
1288         Opcode::Selectif | Opcode::SelectifSpectreGuard => {
1289             let condcode = ctx.data(insn).cond_code().unwrap();
1290             let cond = lower_condcode(condcode);
1291             let is_signed = condcode_is_signed(condcode);
1292             // Verification ensures that the input is always a
1293             // single-def ifcmp.
1294             let ifcmp_insn = maybe_input_insn(ctx, inputs[0], Opcode::Ifcmp).unwrap();
1295             lower_icmp_or_ifcmp_to_flags(ctx, ifcmp_insn, is_signed);
1296 
1297             // csel.COND rd, rn, rm
1298             let rd = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
1299             let rn = put_input_in_reg(ctx, inputs[1], NarrowValueMode::None);
1300             let rm = put_input_in_reg(ctx, inputs[2], NarrowValueMode::None);
1301             let ty = ctx.output_ty(insn, 0);
1302             let bits = ty_bits(ty);
1303             let is_float = ty_has_float_or_vec_representation(ty);
1304             if is_float && bits == 32 {
1305                 ctx.emit(Inst::FpuCSel32 { cond, rd, rn, rm });
1306             } else if is_float && bits == 64 {
1307                 ctx.emit(Inst::FpuCSel64 { cond, rd, rn, rm });
1308             } else {
1309                 ctx.emit(Inst::CSel { cond, rd, rn, rm });
1310             }
1311         }
1312 
1313         Opcode::Bitselect | Opcode::Vselect => {
1314             let ty = ty.unwrap();
1315             if !ty.is_vector() {
1316                 debug_assert_ne!(Opcode::Vselect, op);
1317                 let tmp = ctx.alloc_tmp(I64).only_reg().unwrap();
1318                 let rd = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
1319                 let rcond = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None);
1320                 let rn = put_input_in_reg(ctx, inputs[1], NarrowValueMode::None);
1321                 let rm = put_input_in_reg(ctx, inputs[2], NarrowValueMode::None);
1322                 // AND rTmp, rn, rcond
1323                 ctx.emit(Inst::AluRRR {
1324                     alu_op: ALUOp::And64,
1325                     rd: tmp,
1326                     rn,
1327                     rm: rcond,
1328                 });
1329                 // BIC rd, rm, rcond
1330                 ctx.emit(Inst::AluRRR {
1331                     alu_op: ALUOp::AndNot64,
1332                     rd,
1333                     rn: rm,
1334                     rm: rcond,
1335                 });
1336                 // ORR rd, rd, rTmp
1337                 ctx.emit(Inst::AluRRR {
1338                     alu_op: ALUOp::Orr64,
1339                     rd,
1340                     rn: rd.to_reg(),
1341                     rm: tmp.to_reg(),
1342                 });
1343             } else {
1344                 let rcond = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None);
1345                 let rn = put_input_in_reg(ctx, inputs[1], NarrowValueMode::None);
1346                 let rm = put_input_in_reg(ctx, inputs[2], NarrowValueMode::None);
1347                 let rd = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
1348                 ctx.emit(Inst::gen_move(rd, rcond, ty));
1349 
1350                 ctx.emit(Inst::VecRRR {
1351                     alu_op: VecALUOp::Bsl,
1352                     rd,
1353                     rn,
1354                     rm,
1355                     size: VectorSize::from_ty(ty),
1356                 });
1357             }
1358         }
1359 
1360         Opcode::Trueif => {
1361             let condcode = ctx.data(insn).cond_code().unwrap();
1362             let cond = lower_condcode(condcode);
1363             let is_signed = condcode_is_signed(condcode);
1364             // Verification ensures that the input is always a
1365             // single-def ifcmp.
1366             let ifcmp_insn = maybe_input_insn(ctx, inputs[0], Opcode::Ifcmp).unwrap();
1367             lower_icmp_or_ifcmp_to_flags(ctx, ifcmp_insn, is_signed);
1368             let rd = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
1369             materialize_bool_result(ctx, insn, rd, cond);
1370         }
1371 
1372         Opcode::Trueff => {
1373             let condcode = ctx.data(insn).fp_cond_code().unwrap();
1374             let cond = lower_fp_condcode(condcode);
1375             let ffcmp_insn = maybe_input_insn(ctx, inputs[0], Opcode::Ffcmp).unwrap();
1376             lower_fcmp_or_ffcmp_to_flags(ctx, ffcmp_insn);
1377             let rd = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
1378             materialize_bool_result(ctx, insn, rd, cond);
1379         }
1380 
1381         Opcode::IsNull | Opcode::IsInvalid => {
1382             // Null references are represented by the constant value 0; invalid references are
1383             // represented by the constant value -1. See `define_reftypes()` in
1384             // `meta/src/isa/x86/encodings.rs` to confirm.
1385             let rd = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
1386             let rn = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None);
1387             let ty = ctx.input_ty(insn, 0);
1388             let (alu_op, const_value) = match op {
1389                 Opcode::IsNull => {
1390                     // cmp rn, #0
1391                     (choose_32_64(ty, ALUOp::SubS32, ALUOp::SubS64), 0)
1392                 }
1393                 Opcode::IsInvalid => {
1394                     // cmn rn, #1
1395                     (choose_32_64(ty, ALUOp::AddS32, ALUOp::AddS64), 1)
1396                 }
1397                 _ => unreachable!(),
1398             };
1399             let const_value = ResultRSEImm12::Imm12(Imm12::maybe_from_u64(const_value).unwrap());
1400             ctx.emit(alu_inst_imm12(alu_op, writable_zero_reg(), rn, const_value));
1401             materialize_bool_result(ctx, insn, rd, Cond::Eq);
1402         }
1403 
1404         Opcode::Copy => {
1405             let rd = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
1406             let rn = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None);
1407             let ty = ctx.input_ty(insn, 0);
1408             ctx.emit(Inst::gen_move(rd, rn, ty));
1409         }
1410 
1411         Opcode::Breduce | Opcode::Ireduce => {
1412             // Smaller integers/booleans are stored with high-order bits
1413             // undefined, so we can simply do a copy.
1414             let rn = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None);
1415             let rd = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
1416             let ty = ctx.input_ty(insn, 0);
1417             ctx.emit(Inst::gen_move(rd, rn, ty));
1418         }
1419 
1420         Opcode::Bextend | Opcode::Bmask => {
1421             // Bextend and Bmask both simply sign-extend. This works for:
1422             // - Bextend, because booleans are stored as 0 / -1, so we
1423             //   sign-extend the -1 to a -1 in the wider width.
1424             // - Bmask, because the resulting integer mask value must be
1425             //   all-ones (-1) if the argument is true.
1426 
1427             let from_ty = ctx.input_ty(insn, 0);
1428             let to_ty = ctx.output_ty(insn, 0);
1429             let from_bits = ty_bits(from_ty);
1430             let to_bits = ty_bits(to_ty);
1431 
1432             assert!(
1433                 from_bits <= 64 && to_bits <= 64,
1434                 "Vector Bextend not supported yet"
1435             );
1436             assert!(from_bits <= to_bits);
1437 
1438             if from_bits == to_bits {
1439                 // Nothing.
1440             } else {
1441                 let rn = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None);
1442                 let rd = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
1443                 let to_bits = if to_bits == 64 {
1444                     64
1445                 } else {
1446                     assert!(to_bits <= 32);
1447                     32
1448                 };
1449                 let from_bits = from_bits as u8;
1450                 ctx.emit(Inst::Extend {
1451                     rd,
1452                     rn,
1453                     signed: true,
1454                     from_bits,
1455                     to_bits,
1456                 });
1457             }
1458         }
1459 
1460         Opcode::Bint => {
1461             // Booleans are stored as all-zeroes (0) or all-ones (-1). We AND
1462             // out the LSB to give a 0 / 1-valued integer result.
1463             let rn = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None);
1464             let rd = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
1465             let output_bits = ty_bits(ctx.output_ty(insn, 0));
1466 
1467             let (imm_ty, alu_op) = if output_bits > 32 {
1468                 (I64, ALUOp::And64)
1469             } else {
1470                 (I32, ALUOp::And32)
1471             };
1472             ctx.emit(Inst::AluRRImmLogic {
1473                 alu_op,
1474                 rd,
1475                 rn,
1476                 imml: ImmLogic::maybe_from_u64(1, imm_ty).unwrap(),
1477             });
1478         }
1479 
1480         Opcode::Bitcast => {
1481             let rd = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
1482             let ity = ctx.input_ty(insn, 0);
1483             let oty = ctx.output_ty(insn, 0);
1484             let ity_bits = ty_bits(ity);
1485             let ity_vec_reg = ty_has_float_or_vec_representation(ity);
1486             let oty_bits = ty_bits(oty);
1487             let oty_vec_reg = ty_has_float_or_vec_representation(oty);
1488 
1489             debug_assert_eq!(ity_bits, oty_bits);
1490 
1491             match (ity_vec_reg, oty_vec_reg) {
1492                 (true, true) => {
1493                     let narrow_mode = if ity_bits <= 32 {
1494                         NarrowValueMode::ZeroExtend32
1495                     } else {
1496                         NarrowValueMode::ZeroExtend64
1497                     };
1498                     let rm = put_input_in_reg(ctx, inputs[0], narrow_mode);
1499                     ctx.emit(Inst::gen_move(rd, rm, oty));
1500                 }
1501                 (false, false) => {
1502                     let rm = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None);
1503                     ctx.emit(Inst::gen_move(rd, rm, oty));
1504                 }
1505                 (false, true) => {
1506                     let rn = put_input_in_reg(ctx, inputs[0], NarrowValueMode::ZeroExtend64);
1507                     ctx.emit(Inst::MovToFpu {
1508                         rd,
1509                         rn,
1510                         size: ScalarSize::Size64,
1511                     });
1512                 }
1513                 (true, false) => {
1514                     let rn = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None);
1515                     let size = VectorSize::from_lane_size(ScalarSize::from_bits(oty_bits), true);
1516 
1517                     ctx.emit(Inst::MovFromVec {
1518                         rd,
1519                         rn,
1520                         idx: 0,
1521                         size,
1522                     });
1523                 }
1524             }
1525         }
1526 
1527         Opcode::FallthroughReturn | Opcode::Return => {
1528             for (i, input) in inputs.iter().enumerate() {
1529                 // N.B.: according to the AArch64 ABI, the top bits of a register
1530                 // (above the bits for the value's type) are undefined, so we
1531                 // need not extend the return values.
1532                 let reg = put_input_in_reg(ctx, *input, NarrowValueMode::None);
1533                 let retval_reg = ctx.retval(i).only_reg().unwrap();
1534                 let ty = ctx.input_ty(insn, i);
1535                 ctx.emit(Inst::gen_move(retval_reg, reg, ty));
1536             }
1537             // N.B.: the Ret itself is generated by the ABI.
1538         }
1539 
1540         Opcode::Ifcmp | Opcode::Ffcmp => {
1541             // An Ifcmp/Ffcmp must always be seen as a use of a brif/brff or trueif/trueff
1542             // instruction. This will always be the case as long as the IR uses an Ifcmp/Ffcmp from
1543             // the same block, or a dominating block. In other words, it cannot pass through a BB
1544             // param (phi). The flags pass of the verifier will ensure this.
1545             panic!("Should never reach ifcmp as isel root!");
1546         }
1547 
1548         Opcode::Icmp => {
1549             let condcode = ctx.data(insn).cond_code().unwrap();
1550             let cond = lower_condcode(condcode);
1551             let is_signed = condcode_is_signed(condcode);
1552             let rd = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
1553             let ty = ctx.input_ty(insn, 0);
1554             let bits = ty_bits(ty);
1555             let narrow_mode = match (bits <= 32, is_signed) {
1556                 (true, true) => NarrowValueMode::SignExtend32,
1557                 (true, false) => NarrowValueMode::ZeroExtend32,
1558                 (false, true) => NarrowValueMode::SignExtend64,
1559                 (false, false) => NarrowValueMode::ZeroExtend64,
1560             };
1561             let rn = put_input_in_reg(ctx, inputs[0], narrow_mode);
1562 
1563             if !ty.is_vector() {
1564                 let alu_op = choose_32_64(ty, ALUOp::SubS32, ALUOp::SubS64);
1565                 let rm = put_input_in_rse_imm12(ctx, inputs[1], narrow_mode);
1566                 ctx.emit(alu_inst_imm12(alu_op, writable_zero_reg(), rn, rm));
1567                 materialize_bool_result(ctx, insn, rd, cond);
1568             } else {
1569                 let rm = put_input_in_reg(ctx, inputs[1], narrow_mode);
1570                 lower_vector_compare(ctx, rd, rn, rm, ty, cond)?;
1571             }
1572         }
1573 
1574         Opcode::Fcmp => {
1575             let condcode = ctx.data(insn).fp_cond_code().unwrap();
1576             let cond = lower_fp_condcode(condcode);
1577             let ty = ctx.input_ty(insn, 0);
1578             let rn = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None);
1579             let rm = put_input_in_reg(ctx, inputs[1], NarrowValueMode::None);
1580             let rd = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
1581 
1582             if !ty.is_vector() {
1583                 match ty_bits(ty) {
1584                     32 => {
1585                         ctx.emit(Inst::FpuCmp32 { rn, rm });
1586                     }
1587                     64 => {
1588                         ctx.emit(Inst::FpuCmp64 { rn, rm });
1589                     }
1590                     _ => panic!("Bad float size"),
1591                 }
1592                 materialize_bool_result(ctx, insn, rd, cond);
1593             } else {
1594                 lower_vector_compare(ctx, rd, rn, rm, ty, cond)?;
1595             }
1596         }
1597 
1598         Opcode::JumpTableEntry | Opcode::JumpTableBase => {
1599             panic!("Should not appear: we handle BrTable directly");
1600         }
1601 
1602         Opcode::Debugtrap => {
1603             ctx.emit(Inst::Brk);
1604         }
1605 
1606         Opcode::Trap | Opcode::ResumableTrap => {
1607             let trap_code = ctx.data(insn).trap_code().unwrap();
1608             ctx.emit_safepoint(Inst::Udf { trap_code });
1609         }
1610 
1611         Opcode::Trapif | Opcode::Trapff => {
1612             let trap_code = ctx.data(insn).trap_code().unwrap();
1613 
1614             let cond = if maybe_input_insn(ctx, inputs[0], Opcode::IaddIfcout).is_some() {
1615                 let condcode = ctx.data(insn).cond_code().unwrap();
1616                 let cond = lower_condcode(condcode);
1617                 // The flags must not have been clobbered by any other
1618                 // instruction between the iadd_ifcout and this instruction, as
1619                 // verified by the CLIF validator; so we can simply use the
1620                 // flags here.
1621                 cond
1622             } else if op == Opcode::Trapif {
1623                 let condcode = ctx.data(insn).cond_code().unwrap();
1624                 let cond = lower_condcode(condcode);
1625                 let is_signed = condcode_is_signed(condcode);
1626 
1627                 // Verification ensures that the input is always a single-def ifcmp.
1628                 let ifcmp_insn = maybe_input_insn(ctx, inputs[0], Opcode::Ifcmp).unwrap();
1629                 lower_icmp_or_ifcmp_to_flags(ctx, ifcmp_insn, is_signed);
1630                 cond
1631             } else {
1632                 let condcode = ctx.data(insn).fp_cond_code().unwrap();
1633                 let cond = lower_fp_condcode(condcode);
1634 
1635                 // Verification ensures that the input is always a
1636                 // single-def ffcmp.
1637                 let ffcmp_insn = maybe_input_insn(ctx, inputs[0], Opcode::Ffcmp).unwrap();
1638                 lower_fcmp_or_ffcmp_to_flags(ctx, ffcmp_insn);
1639                 cond
1640             };
1641 
1642             ctx.emit_safepoint(Inst::TrapIf {
1643                 trap_code,
1644                 kind: CondBrKind::Cond(cond),
1645             });
1646         }
1647 
1648         Opcode::Safepoint => {
1649             panic!("safepoint instructions not used by new backend's safepoints!");
1650         }
1651 
1652         Opcode::Trapz | Opcode::Trapnz | Opcode::ResumableTrapnz => {
1653             panic!("trapz / trapnz / resumable_trapnz should have been removed by legalization!");
1654         }
1655 
1656         Opcode::FuncAddr => {
1657             let rd = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
1658             let (extname, _) = ctx.call_target(insn).unwrap();
1659             let extname = extname.clone();
1660             ctx.emit(Inst::LoadExtName {
1661                 rd,
1662                 name: Box::new(extname),
1663                 offset: 0,
1664             });
1665         }
1666 
1667         Opcode::GlobalValue => {
1668             panic!("global_value should have been removed by legalization!");
1669         }
1670 
1671         Opcode::SymbolValue => {
1672             let rd = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
1673             let (extname, _, offset) = ctx.symbol_value(insn).unwrap();
1674             let extname = extname.clone();
1675             ctx.emit(Inst::LoadExtName {
1676                 rd,
1677                 name: Box::new(extname),
1678                 offset,
1679             });
1680         }
1681 
1682         Opcode::Call | Opcode::CallIndirect => {
1683             let caller_conv = ctx.abi().call_conv();
1684             let (mut abi, inputs) = match op {
1685                 Opcode::Call => {
1686                     let (extname, dist) = ctx.call_target(insn).unwrap();
1687                     let extname = extname.clone();
1688                     let sig = ctx.call_sig(insn).unwrap();
1689                     assert!(inputs.len() == sig.params.len());
1690                     assert!(outputs.len() == sig.returns.len());
1691                     (
1692                         AArch64ABICaller::from_func(sig, &extname, dist, caller_conv, flags)?,
1693                         &inputs[..],
1694                     )
1695                 }
1696                 Opcode::CallIndirect => {
1697                     let ptr = put_input_in_reg(ctx, inputs[0], NarrowValueMode::ZeroExtend64);
1698                     let sig = ctx.call_sig(insn).unwrap();
1699                     assert!(inputs.len() - 1 == sig.params.len());
1700                     assert!(outputs.len() == sig.returns.len());
1701                     (
1702                         AArch64ABICaller::from_ptr(sig, ptr, op, caller_conv, flags)?,
1703                         &inputs[1..],
1704                     )
1705                 }
1706                 _ => unreachable!(),
1707             };
1708 
1709             abi.emit_stack_pre_adjust(ctx);
1710             assert!(inputs.len() == abi.num_args());
1711             for i in abi.get_copy_to_arg_order() {
1712                 let input = inputs[i];
1713                 let arg_reg = put_input_in_reg(ctx, input, NarrowValueMode::None);
1714                 abi.emit_copy_regs_to_arg(ctx, i, ValueRegs::one(arg_reg));
1715             }
1716             abi.emit_call(ctx);
1717             for (i, output) in outputs.iter().enumerate() {
1718                 let retval_reg = get_output_reg(ctx, *output).only_reg().unwrap();
1719                 abi.emit_copy_retval_to_regs(ctx, i, ValueRegs::one(retval_reg));
1720             }
1721             abi.emit_stack_post_adjust(ctx);
1722         }
1723 
1724         Opcode::GetPinnedReg => {
1725             let rd = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
1726             ctx.emit(Inst::gen_move(rd, xreg(PINNED_REG), I64));
1727         }
1728 
1729         Opcode::SetPinnedReg => {
1730             let rm = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None);
1731             ctx.emit(Inst::gen_move(writable_xreg(PINNED_REG), rm, I64));
1732         }
1733 
1734         Opcode::Spill
1735         | Opcode::Fill
1736         | Opcode::FillNop
1737         | Opcode::Regmove
1738         | Opcode::CopySpecial
1739         | Opcode::CopyToSsa
1740         | Opcode::CopyNop
1741         | Opcode::AdjustSpDown
1742         | Opcode::AdjustSpUpImm
1743         | Opcode::AdjustSpDownImm
1744         | Opcode::IfcmpSp
1745         | Opcode::Regspill
1746         | Opcode::Regfill => {
1747             panic!("Unused opcode should not be encountered.");
1748         }
1749 
1750         Opcode::Jump
1751         | Opcode::Fallthrough
1752         | Opcode::Brz
1753         | Opcode::Brnz
1754         | Opcode::BrIcmp
1755         | Opcode::Brif
1756         | Opcode::Brff
1757         | Opcode::IndirectJumpTableBr
1758         | Opcode::BrTable => {
1759             panic!("Branch opcode reached non-branch lowering logic!");
1760         }
1761 
1762         Opcode::Vconst => {
1763             let value = const_param_to_u128(ctx, insn).expect("Invalid immediate bytes");
1764             let rd = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
1765             lower_constant_f128(ctx, rd, value);
1766         }
1767 
1768         Opcode::RawBitcast => {
1769             let rm = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None);
1770             let rd = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
1771             let ty = ctx.input_ty(insn, 0);
1772             ctx.emit(Inst::gen_move(rd, rm, ty));
1773         }
1774 
1775         Opcode::Extractlane => {
1776             if let InstructionData::BinaryImm8 { imm, .. } = ctx.data(insn) {
1777                 let idx = *imm;
1778                 let rd = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
1779                 let rn = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None);
1780                 let size = VectorSize::from_ty(ctx.input_ty(insn, 0));
1781                 let ty = ty.unwrap();
1782 
1783                 if ty_has_int_representation(ty) {
1784                     ctx.emit(Inst::MovFromVec { rd, rn, idx, size });
1785                 // Plain moves are faster on some processors.
1786                 } else if idx == 0 {
1787                     ctx.emit(Inst::gen_move(rd, rn, ty));
1788                 } else {
1789                     ctx.emit(Inst::FpuMoveFromVec { rd, rn, idx, size });
1790                 }
1791             } else {
1792                 unreachable!();
1793             }
1794         }
1795 
1796         Opcode::Insertlane => {
1797             let idx = if let InstructionData::TernaryImm8 { imm, .. } = ctx.data(insn) {
1798                 *imm
1799             } else {
1800                 unreachable!();
1801             };
1802             let input_ty = ctx.input_ty(insn, 1);
1803             let rd = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
1804             let rm = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None);
1805             let rn = put_input_in_reg(ctx, inputs[1], NarrowValueMode::None);
1806             let ty = ty.unwrap();
1807             let size = VectorSize::from_ty(ty);
1808 
1809             ctx.emit(Inst::gen_move(rd, rm, ty));
1810 
1811             if ty_has_int_representation(input_ty) {
1812                 ctx.emit(Inst::MovToVec { rd, rn, idx, size });
1813             } else {
1814                 ctx.emit(Inst::VecMovElement {
1815                     rd,
1816                     rn,
1817                     dest_idx: idx,
1818                     src_idx: 0,
1819                     size,
1820                 });
1821             }
1822         }
1823 
1824         Opcode::Splat => {
1825             let rd = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
1826             let size = VectorSize::from_ty(ty.unwrap());
1827 
1828             if let Some((_, insn)) = maybe_input_insn_multi(
1829                 ctx,
1830                 inputs[0],
1831                 &[
1832                     Opcode::Bconst,
1833                     Opcode::F32const,
1834                     Opcode::F64const,
1835                     Opcode::Iconst,
1836                 ],
1837             ) {
1838                 lower_splat_const(ctx, rd, ctx.get_constant(insn).unwrap(), size);
1839             } else if let Some(insn) =
1840                 maybe_input_insn_via_conv(ctx, inputs[0], Opcode::Iconst, Opcode::Ireduce)
1841             {
1842                 lower_splat_const(ctx, rd, ctx.get_constant(insn).unwrap(), size);
1843             } else if let Some(insn) =
1844                 maybe_input_insn_via_conv(ctx, inputs[0], Opcode::Bconst, Opcode::Breduce)
1845             {
1846                 lower_splat_const(ctx, rd, ctx.get_constant(insn).unwrap(), size);
1847             } else if let Some((_, insn)) = maybe_input_insn_multi(
1848                 ctx,
1849                 inputs[0],
1850                 &[
1851                     Opcode::Uload8,
1852                     Opcode::Sload8,
1853                     Opcode::Uload16,
1854                     Opcode::Sload16,
1855                     Opcode::Uload32,
1856                     Opcode::Sload32,
1857                     Opcode::Load,
1858                 ],
1859             ) {
1860                 ctx.sink_inst(insn);
1861                 let load_inputs = insn_inputs(ctx, insn);
1862                 let load_outputs = insn_outputs(ctx, insn);
1863                 lower_load(
1864                     ctx,
1865                     insn,
1866                     &load_inputs[..],
1867                     load_outputs[0],
1868                     |ctx, _rd, _elem_ty, mem| {
1869                         let tmp = ctx.alloc_tmp(I64).only_reg().unwrap();
1870                         let (addr, addr_inst) = Inst::gen_load_addr(tmp, mem);
1871                         if let Some(addr_inst) = addr_inst {
1872                             ctx.emit(addr_inst);
1873                         }
1874                         ctx.emit(Inst::VecLoadReplicate { rd, rn: addr, size });
1875                     },
1876                 );
1877             } else {
1878                 let input_ty = ctx.input_ty(insn, 0);
1879                 let rn = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None);
1880                 let inst = if ty_has_int_representation(input_ty) {
1881                     Inst::VecDup { rd, rn, size }
1882                 } else {
1883                     Inst::VecDupFromFpu { rd, rn, size }
1884                 };
1885 
1886                 ctx.emit(inst);
1887             }
1888         }
1889 
1890         Opcode::ScalarToVector => {
1891             let rn = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None);
1892             let rd = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
1893             let input_ty = ctx.input_ty(insn, 0);
1894             if (input_ty == I32 && ty.unwrap() == I32X4)
1895                 || (input_ty == I64 && ty.unwrap() == I64X2)
1896             {
1897                 ctx.emit(Inst::MovToFpu {
1898                     rd,
1899                     rn,
1900                     size: ScalarSize::from_ty(input_ty),
1901                 });
1902             } else {
1903                 return Err(CodegenError::Unsupported(format!(
1904                     "ScalarToVector: unsupported types {:?} -> {:?}",
1905                     input_ty, ty
1906                 )));
1907             }
1908         }
1909 
1910         Opcode::VallTrue if ctx.input_ty(insn, 0) == I64X2 => {
1911             let rd = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
1912             let rm = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None);
1913             let tmp = ctx.alloc_tmp(I64X2).only_reg().unwrap();
1914 
1915             // cmeq vtmp.2d, vm.2d, #0
1916             // addp dtmp, vtmp.2d
1917             // fcmp dtmp, dtmp
1918             // cset xd, eq
1919             //
1920             // Note that after the ADDP the value of the temporary register will
1921             // be either 0 when all input elements are true, i.e. non-zero, or a
1922             // NaN otherwise (either -1 or -2 when represented as an integer);
1923             // NaNs are the only floating-point numbers that compare unequal to
1924             // themselves.
1925 
1926             ctx.emit(Inst::VecMisc {
1927                 op: VecMisc2::Cmeq0,
1928                 rd: tmp,
1929                 rn: rm,
1930                 size: VectorSize::Size64x2,
1931             });
1932             ctx.emit(Inst::VecRRPair {
1933                 op: VecPairOp::Addp,
1934                 rd: tmp,
1935                 rn: tmp.to_reg(),
1936             });
1937             ctx.emit(Inst::FpuCmp64 {
1938                 rn: tmp.to_reg(),
1939                 rm: tmp.to_reg(),
1940             });
1941             materialize_bool_result(ctx, insn, rd, Cond::Eq);
1942         }
1943 
1944         Opcode::VanyTrue | Opcode::VallTrue => {
1945             let rd = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
1946             let rm = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None);
1947             let src_ty = ctx.input_ty(insn, 0);
1948             let tmp = ctx.alloc_tmp(src_ty).only_reg().unwrap();
1949 
1950             // This operation is implemented by using umaxp or uminv to
1951             // create a scalar value, which is then compared against zero.
1952             //
1953             // umaxp vn.16b, vm.16, vm.16 / uminv bn, vm.16b
1954             // mov xm, vn.d[0]
1955             // cmp xm, #0
1956             // cset xm, ne
1957 
1958             let size = VectorSize::from_ty(ctx.input_ty(insn, 0));
1959 
1960             if op == Opcode::VanyTrue {
1961                 ctx.emit(Inst::VecRRR {
1962                     alu_op: VecALUOp::Umaxp,
1963                     rd: tmp,
1964                     rn: rm,
1965                     rm: rm,
1966                     size,
1967                 });
1968             } else {
1969                 ctx.emit(Inst::VecLanes {
1970                     op: VecLanesOp::Uminv,
1971                     rd: tmp,
1972                     rn: rm,
1973                     size,
1974                 });
1975             };
1976 
1977             ctx.emit(Inst::MovFromVec {
1978                 rd,
1979                 rn: tmp.to_reg(),
1980                 idx: 0,
1981                 size: VectorSize::Size64x2,
1982             });
1983 
1984             ctx.emit(Inst::AluRRImm12 {
1985                 alu_op: ALUOp::SubS64,
1986                 rd: writable_zero_reg(),
1987                 rn: rd.to_reg(),
1988                 imm12: Imm12::zero(),
1989             });
1990 
1991             materialize_bool_result(ctx, insn, rd, Cond::Ne);
1992         }
1993 
1994         Opcode::VhighBits => {
1995             let dst_r = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
1996             let src_v = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None);
1997             let ty = ctx.input_ty(insn, 0);
1998             // All three sequences use one integer temporary and two vector temporaries.  The
1999             // shift is done early so as to give the register allocator the possibility of using
2000             // the same reg for `tmp_v1` and `src_v` in the case that this is the last use of
2001             // `src_v`.  See https://github.com/WebAssembly/simd/pull/201 for the background and
2002             // derivation of these sequences.  Alternative sequences are discussed in
2003             // https://github.com/bytecodealliance/wasmtime/issues/2296, although they are not
2004             // used here.
2005             let tmp_r0 = ctx.alloc_tmp(I64).only_reg().unwrap();
2006             let tmp_v0 = ctx.alloc_tmp(I8X16).only_reg().unwrap();
2007             let tmp_v1 = ctx.alloc_tmp(I8X16).only_reg().unwrap();
2008             match ty {
2009                 I8X16 => {
2010                     // sshr  tmp_v1.16b, src_v.16b, #7
2011                     // mov   tmp_r0, #0x0201
2012                     // movk  tmp_r0, #0x0804, lsl 16
2013                     // movk  tmp_r0, #0x2010, lsl 32
2014                     // movk  tmp_r0, #0x8040, lsl 48
2015                     // dup   tmp_v0.2d, tmp_r0
2016                     // and   tmp_v1.16b, tmp_v1.16b, tmp_v0.16b
2017                     // ext   tmp_v0.16b, tmp_v1.16b, tmp_v1.16b, #8
2018                     // zip1  tmp_v0.16b, tmp_v1.16b, tmp_v0.16b
2019                     // addv  tmp_v0h, tmp_v0.8h
2020                     // mov   dst_r, tmp_v0.h[0]
2021                     ctx.emit(Inst::VecShiftImm {
2022                         op: VecShiftImmOp::Sshr,
2023                         rd: tmp_v1,
2024                         rn: src_v,
2025                         size: VectorSize::Size8x16,
2026                         imm: 7,
2027                     });
2028                     lower_splat_const(ctx, tmp_v0, 0x8040201008040201u64, VectorSize::Size64x2);
2029                     ctx.emit(Inst::VecRRR {
2030                         alu_op: VecALUOp::And,
2031                         rd: tmp_v1,
2032                         rn: tmp_v1.to_reg(),
2033                         rm: tmp_v0.to_reg(),
2034                         size: VectorSize::Size8x16,
2035                     });
2036                     ctx.emit(Inst::VecExtract {
2037                         rd: tmp_v0,
2038                         rn: tmp_v1.to_reg(),
2039                         rm: tmp_v1.to_reg(),
2040                         imm4: 8,
2041                     });
2042                     ctx.emit(Inst::VecRRR {
2043                         alu_op: VecALUOp::Zip1,
2044                         rd: tmp_v0,
2045                         rn: tmp_v1.to_reg(),
2046                         rm: tmp_v0.to_reg(),
2047                         size: VectorSize::Size8x16,
2048                     });
2049                     ctx.emit(Inst::VecLanes {
2050                         op: VecLanesOp::Addv,
2051                         rd: tmp_v0,
2052                         rn: tmp_v0.to_reg(),
2053                         size: VectorSize::Size16x8,
2054                     });
2055                     ctx.emit(Inst::MovFromVec {
2056                         rd: dst_r,
2057                         rn: tmp_v0.to_reg(),
2058                         idx: 0,
2059                         size: VectorSize::Size16x8,
2060                     });
2061                 }
2062                 I16X8 => {
2063                     // sshr  tmp_v1.8h, src_v.8h, #15
2064                     // mov   tmp_r0, #0x1
2065                     // movk  tmp_r0, #0x2, lsl 16
2066                     // movk  tmp_r0, #0x4, lsl 32
2067                     // movk  tmp_r0, #0x8, lsl 48
2068                     // dup   tmp_v0.2d, tmp_r0
2069                     // shl   tmp_r0, tmp_r0, #4
2070                     // mov   tmp_v0.d[1], tmp_r0
2071                     // and   tmp_v0.16b, tmp_v1.16b, tmp_v0.16b
2072                     // addv  tmp_v0h, tmp_v0.8h
2073                     // mov   dst_r, tmp_v0.h[0]
2074                     ctx.emit(Inst::VecShiftImm {
2075                         op: VecShiftImmOp::Sshr,
2076                         rd: tmp_v1,
2077                         rn: src_v,
2078                         size: VectorSize::Size16x8,
2079                         imm: 15,
2080                     });
2081                     lower_constant_u64(ctx, tmp_r0, 0x0008000400020001u64);
2082                     ctx.emit(Inst::VecDup {
2083                         rd: tmp_v0,
2084                         rn: tmp_r0.to_reg(),
2085                         size: VectorSize::Size64x2,
2086                     });
2087                     ctx.emit(Inst::AluRRImmShift {
2088                         alu_op: ALUOp::Lsl64,
2089                         rd: tmp_r0,
2090                         rn: tmp_r0.to_reg(),
2091                         immshift: ImmShift { imm: 4 },
2092                     });
2093                     ctx.emit(Inst::MovToVec {
2094                         rd: tmp_v0,
2095                         rn: tmp_r0.to_reg(),
2096                         idx: 1,
2097                         size: VectorSize::Size64x2,
2098                     });
2099                     ctx.emit(Inst::VecRRR {
2100                         alu_op: VecALUOp::And,
2101                         rd: tmp_v0,
2102                         rn: tmp_v1.to_reg(),
2103                         rm: tmp_v0.to_reg(),
2104                         size: VectorSize::Size8x16,
2105                     });
2106                     ctx.emit(Inst::VecLanes {
2107                         op: VecLanesOp::Addv,
2108                         rd: tmp_v0,
2109                         rn: tmp_v0.to_reg(),
2110                         size: VectorSize::Size16x8,
2111                     });
2112                     ctx.emit(Inst::MovFromVec {
2113                         rd: dst_r,
2114                         rn: tmp_v0.to_reg(),
2115                         idx: 0,
2116                         size: VectorSize::Size16x8,
2117                     });
2118                 }
2119                 I32X4 => {
2120                     // sshr  tmp_v1.4s, src_v.4s, #31
2121                     // mov   tmp_r0, #0x1
2122                     // movk  tmp_r0, #0x2, lsl 32
2123                     // dup   tmp_v0.2d, tmp_r0
2124                     // shl   tmp_r0, tmp_r0, #2
2125                     // mov   tmp_v0.d[1], tmp_r0
2126                     // and   tmp_v0.16b, tmp_v1.16b, tmp_v0.16b
2127                     // addv  tmp_v0s, tmp_v0.4s
2128                     // mov   dst_r, tmp_v0.s[0]
2129                     ctx.emit(Inst::VecShiftImm {
2130                         op: VecShiftImmOp::Sshr,
2131                         rd: tmp_v1,
2132                         rn: src_v,
2133                         size: VectorSize::Size32x4,
2134                         imm: 31,
2135                     });
2136                     lower_constant_u64(ctx, tmp_r0, 0x0000000200000001u64);
2137                     ctx.emit(Inst::VecDup {
2138                         rd: tmp_v0,
2139                         rn: tmp_r0.to_reg(),
2140                         size: VectorSize::Size64x2,
2141                     });
2142                     ctx.emit(Inst::AluRRImmShift {
2143                         alu_op: ALUOp::Lsl64,
2144                         rd: tmp_r0,
2145                         rn: tmp_r0.to_reg(),
2146                         immshift: ImmShift { imm: 2 },
2147                     });
2148                     ctx.emit(Inst::MovToVec {
2149                         rd: tmp_v0,
2150                         rn: tmp_r0.to_reg(),
2151                         idx: 1,
2152                         size: VectorSize::Size64x2,
2153                     });
2154                     ctx.emit(Inst::VecRRR {
2155                         alu_op: VecALUOp::And,
2156                         rd: tmp_v0,
2157                         rn: tmp_v1.to_reg(),
2158                         rm: tmp_v0.to_reg(),
2159                         size: VectorSize::Size8x16,
2160                     });
2161                     ctx.emit(Inst::VecLanes {
2162                         op: VecLanesOp::Addv,
2163                         rd: tmp_v0,
2164                         rn: tmp_v0.to_reg(),
2165                         size: VectorSize::Size32x4,
2166                     });
2167                     ctx.emit(Inst::MovFromVec {
2168                         rd: dst_r,
2169                         rn: tmp_v0.to_reg(),
2170                         idx: 0,
2171                         size: VectorSize::Size32x4,
2172                     });
2173                 }
2174                 I64X2 => {
2175                     // mov dst_r, src_v.d[0]
2176                     // mov tmp_r0, src_v.d[1]
2177                     // lsr dst_r, dst_r, #63
2178                     // lsr tmp_r0, tmp_r0, #63
2179                     // add dst_r, dst_r, tmp_r0, lsl #1
2180                     ctx.emit(Inst::MovFromVec {
2181                         rd: dst_r,
2182                         rn: src_v,
2183                         idx: 0,
2184                         size: VectorSize::Size64x2,
2185                     });
2186                     ctx.emit(Inst::MovFromVec {
2187                         rd: tmp_r0,
2188                         rn: src_v,
2189                         idx: 1,
2190                         size: VectorSize::Size64x2,
2191                     });
2192                     ctx.emit(Inst::AluRRImmShift {
2193                         alu_op: ALUOp::Lsr64,
2194                         rd: dst_r,
2195                         rn: dst_r.to_reg(),
2196                         immshift: ImmShift::maybe_from_u64(63).unwrap(),
2197                     });
2198                     ctx.emit(Inst::AluRRImmShift {
2199                         alu_op: ALUOp::Lsr64,
2200                         rd: tmp_r0,
2201                         rn: tmp_r0.to_reg(),
2202                         immshift: ImmShift::maybe_from_u64(63).unwrap(),
2203                     });
2204                     ctx.emit(Inst::AluRRRShift {
2205                         alu_op: ALUOp::Add32,
2206                         rd: dst_r,
2207                         rn: dst_r.to_reg(),
2208                         rm: tmp_r0.to_reg(),
2209                         shiftop: ShiftOpAndAmt::new(
2210                             ShiftOp::LSL,
2211                             ShiftOpShiftImm::maybe_from_shift(1).unwrap(),
2212                         ),
2213                     });
2214                 }
2215                 _ => panic!("arm64 isel: VhighBits unhandled, ty = {:?}", ty),
2216             }
2217         }
2218 
2219         Opcode::Shuffle => {
2220             let mask = const_param_to_u128(ctx, insn).expect("Invalid immediate mask bytes");
2221             let rd = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
2222             let rn = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None);
2223             let rn2 = put_input_in_reg(ctx, inputs[1], NarrowValueMode::None);
2224             // 2 register table vector lookups require consecutive table registers;
2225             // we satisfy this constraint by hardcoding the usage of v29 and v30.
2226             let temp = writable_vreg(29);
2227             let temp2 = writable_vreg(30);
2228             let input_ty = ctx.input_ty(insn, 0);
2229             assert_eq!(input_ty, ctx.input_ty(insn, 1));
2230             // Make sure that both inputs are in virtual registers, since it is
2231             // not guaranteed that we can get them safely to the temporaries if
2232             // either is in a real register.
2233             let rn = ctx.ensure_in_vreg(rn, input_ty);
2234             let rn2 = ctx.ensure_in_vreg(rn2, input_ty);
2235 
2236             lower_constant_f128(ctx, rd, mask);
2237             ctx.emit(Inst::gen_move(temp, rn, input_ty));
2238             ctx.emit(Inst::gen_move(temp2, rn2, input_ty));
2239             ctx.emit(Inst::VecTbl2 {
2240                 rd,
2241                 rn: temp.to_reg(),
2242                 rn2: temp2.to_reg(),
2243                 rm: rd.to_reg(),
2244                 is_extension: false,
2245             });
2246         }
2247 
2248         Opcode::Swizzle => {
2249             let rd = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
2250             let rm = put_input_in_reg(ctx, inputs[1], NarrowValueMode::None);
2251             let rn = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None);
2252 
2253             ctx.emit(Inst::VecTbl {
2254                 rd,
2255                 rn,
2256                 rm,
2257                 is_extension: false,
2258             });
2259         }
2260 
2261         Opcode::Vsplit | Opcode::Vconcat => {
2262             // TODO
2263             panic!("Vector ops not implemented.");
2264         }
2265 
2266         Opcode::Isplit | Opcode::Iconcat => panic!("Vector ops not supported."),
2267 
2268         Opcode::Imax | Opcode::Umax | Opcode::Umin | Opcode::Imin => {
2269             let alu_op = match op {
2270                 Opcode::Umin => VecALUOp::Umin,
2271                 Opcode::Imin => VecALUOp::Smin,
2272                 Opcode::Umax => VecALUOp::Umax,
2273                 Opcode::Imax => VecALUOp::Smax,
2274                 _ => unreachable!(),
2275             };
2276             let rd = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
2277             let rn = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None);
2278             let rm = put_input_in_reg(ctx, inputs[1], NarrowValueMode::None);
2279             let ty = ty.unwrap();
2280             ctx.emit(Inst::VecRRR {
2281                 alu_op,
2282                 rd,
2283                 rn,
2284                 rm,
2285                 size: VectorSize::from_ty(ty),
2286             });
2287         }
2288 
2289         Opcode::WideningPairwiseDotProductS => {
2290             let r_y = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
2291             let r_a = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None);
2292             let r_b = put_input_in_reg(ctx, inputs[1], NarrowValueMode::None);
2293             let ty = ty.unwrap();
2294             if ty == I32X4 {
2295                 let tmp = ctx.alloc_tmp(I8X16).only_reg().unwrap();
2296                 // The args have type I16X8.
2297                 // "y = i32x4.dot_i16x8_s(a, b)"
2298                 // => smull  tmp, a, b
2299                 //    smull2 y,   a, b
2300                 //    addp   y,   tmp, y
2301                 ctx.emit(Inst::VecRRR {
2302                     alu_op: VecALUOp::Smull,
2303                     rd: tmp,
2304                     rn: r_a,
2305                     rm: r_b,
2306                     size: VectorSize::Size16x8,
2307                 });
2308                 ctx.emit(Inst::VecRRR {
2309                     alu_op: VecALUOp::Smull2,
2310                     rd: r_y,
2311                     rn: r_a,
2312                     rm: r_b,
2313                     size: VectorSize::Size16x8,
2314                 });
2315                 ctx.emit(Inst::VecRRR {
2316                     alu_op: VecALUOp::Addp,
2317                     rd: r_y,
2318                     rn: tmp.to_reg(),
2319                     rm: r_y.to_reg(),
2320                     size: VectorSize::Size32x4,
2321                 });
2322             } else {
2323                 return Err(CodegenError::Unsupported(format!(
2324                     "Opcode::WideningPairwiseDotProductS: unsupported laneage: {:?}",
2325                     ty
2326                 )));
2327             }
2328         }
2329 
2330         Opcode::Fadd | Opcode::Fsub | Opcode::Fmul | Opcode::Fdiv | Opcode::Fmin | Opcode::Fmax => {
2331             let ty = ty.unwrap();
2332             let bits = ty_bits(ty);
2333             let rn = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None);
2334             let rm = put_input_in_reg(ctx, inputs[1], NarrowValueMode::None);
2335             let rd = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
2336             if !ty.is_vector() {
2337                 let fpu_op = match (op, bits) {
2338                     (Opcode::Fadd, 32) => FPUOp2::Add32,
2339                     (Opcode::Fadd, 64) => FPUOp2::Add64,
2340                     (Opcode::Fsub, 32) => FPUOp2::Sub32,
2341                     (Opcode::Fsub, 64) => FPUOp2::Sub64,
2342                     (Opcode::Fmul, 32) => FPUOp2::Mul32,
2343                     (Opcode::Fmul, 64) => FPUOp2::Mul64,
2344                     (Opcode::Fdiv, 32) => FPUOp2::Div32,
2345                     (Opcode::Fdiv, 64) => FPUOp2::Div64,
2346                     (Opcode::Fmin, 32) => FPUOp2::Min32,
2347                     (Opcode::Fmin, 64) => FPUOp2::Min64,
2348                     (Opcode::Fmax, 32) => FPUOp2::Max32,
2349                     (Opcode::Fmax, 64) => FPUOp2::Max64,
2350                     _ => panic!("Unknown op/bits combination"),
2351                 };
2352                 ctx.emit(Inst::FpuRRR { fpu_op, rd, rn, rm });
2353             } else {
2354                 let alu_op = match op {
2355                     Opcode::Fadd => VecALUOp::Fadd,
2356                     Opcode::Fsub => VecALUOp::Fsub,
2357                     Opcode::Fdiv => VecALUOp::Fdiv,
2358                     Opcode::Fmax => VecALUOp::Fmax,
2359                     Opcode::Fmin => VecALUOp::Fmin,
2360                     Opcode::Fmul => VecALUOp::Fmul,
2361                     _ => unreachable!(),
2362                 };
2363 
2364                 ctx.emit(Inst::VecRRR {
2365                     rd,
2366                     rn,
2367                     rm,
2368                     alu_op,
2369                     size: VectorSize::from_ty(ty),
2370                 });
2371             }
2372         }
2373 
2374         Opcode::FminPseudo | Opcode::FmaxPseudo => {
2375             let ty = ctx.input_ty(insn, 0);
2376             if ty == F32X4 || ty == F64X2 {
2377                 // pmin(a,b) => bitsel(b, a, cmpgt(a, b))
2378                 // pmax(a,b) => bitsel(b, a, cmpgt(b, a))
2379                 let r_dst = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
2380                 let r_a = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None);
2381                 let r_b = put_input_in_reg(ctx, inputs[1], NarrowValueMode::None);
2382                 // Since we're going to write the output register `r_dst` anyway, we might as
2383                 // well first use it to hold the comparison result.  This has the slightly unusual
2384                 // effect that we modify the output register in the first instruction (`fcmgt`)
2385                 // but read both the inputs again in the second instruction (`bsl`), which means
2386                 // that the output register can't be either of the input registers.  Regalloc
2387                 // should handle this correctly, nevertheless.
2388                 ctx.emit(Inst::VecRRR {
2389                     alu_op: VecALUOp::Fcmgt,
2390                     rd: r_dst,
2391                     rn: if op == Opcode::FminPseudo { r_a } else { r_b },
2392                     rm: if op == Opcode::FminPseudo { r_b } else { r_a },
2393                     size: if ty == F32X4 {
2394                         VectorSize::Size32x4
2395                     } else {
2396                         VectorSize::Size64x2
2397                     },
2398                 });
2399                 ctx.emit(Inst::VecRRR {
2400                     alu_op: VecALUOp::Bsl,
2401                     rd: r_dst,
2402                     rn: r_b,
2403                     rm: r_a,
2404                     size: VectorSize::Size8x16,
2405                 });
2406             } else {
2407                 panic!("Opcode::FminPseudo | Opcode::FmaxPseudo: unhandled type");
2408             }
2409         }
2410 
2411         Opcode::Sqrt | Opcode::Fneg | Opcode::Fabs | Opcode::Fpromote | Opcode::Fdemote => {
2412             let ty = ty.unwrap();
2413             let bits = ty_bits(ty);
2414             let rn = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None);
2415             let rd = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
2416             if !ty.is_vector() {
2417                 let fpu_op = match (op, bits) {
2418                     (Opcode::Sqrt, 32) => FPUOp1::Sqrt32,
2419                     (Opcode::Sqrt, 64) => FPUOp1::Sqrt64,
2420                     (Opcode::Fneg, 32) => FPUOp1::Neg32,
2421                     (Opcode::Fneg, 64) => FPUOp1::Neg64,
2422                     (Opcode::Fabs, 32) => FPUOp1::Abs32,
2423                     (Opcode::Fabs, 64) => FPUOp1::Abs64,
2424                     (Opcode::Fpromote, 32) => panic!("Cannot promote to 32 bits"),
2425                     (Opcode::Fpromote, 64) => FPUOp1::Cvt32To64,
2426                     (Opcode::Fdemote, 32) => FPUOp1::Cvt64To32,
2427                     (Opcode::Fdemote, 64) => panic!("Cannot demote to 64 bits"),
2428                     _ => panic!("Unknown op/bits combination"),
2429                 };
2430                 ctx.emit(Inst::FpuRR { fpu_op, rd, rn });
2431             } else {
2432                 let op = match op {
2433                     Opcode::Fabs => VecMisc2::Fabs,
2434                     Opcode::Fneg => VecMisc2::Fneg,
2435                     Opcode::Sqrt => VecMisc2::Fsqrt,
2436                     _ => unimplemented!(),
2437                 };
2438 
2439                 ctx.emit(Inst::VecMisc {
2440                     op,
2441                     rd,
2442                     rn,
2443                     size: VectorSize::from_ty(ty),
2444                 });
2445             }
2446         }
2447 
2448         Opcode::Ceil | Opcode::Floor | Opcode::Trunc | Opcode::Nearest => {
2449             let ty = ctx.output_ty(insn, 0);
2450             if !ty.is_vector() {
2451                 let bits = ty_bits(ty);
2452                 let op = match (op, bits) {
2453                     (Opcode::Ceil, 32) => FpuRoundMode::Plus32,
2454                     (Opcode::Ceil, 64) => FpuRoundMode::Plus64,
2455                     (Opcode::Floor, 32) => FpuRoundMode::Minus32,
2456                     (Opcode::Floor, 64) => FpuRoundMode::Minus64,
2457                     (Opcode::Trunc, 32) => FpuRoundMode::Zero32,
2458                     (Opcode::Trunc, 64) => FpuRoundMode::Zero64,
2459                     (Opcode::Nearest, 32) => FpuRoundMode::Nearest32,
2460                     (Opcode::Nearest, 64) => FpuRoundMode::Nearest64,
2461                     _ => panic!("Unknown op/bits combination (scalar)"),
2462                 };
2463                 let rn = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None);
2464                 let rd = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
2465                 ctx.emit(Inst::FpuRound { op, rd, rn });
2466             } else {
2467                 let (op, size) = match (op, ty) {
2468                     (Opcode::Ceil, F32X4) => (VecMisc2::Frintp, VectorSize::Size32x4),
2469                     (Opcode::Ceil, F64X2) => (VecMisc2::Frintp, VectorSize::Size64x2),
2470                     (Opcode::Floor, F32X4) => (VecMisc2::Frintm, VectorSize::Size32x4),
2471                     (Opcode::Floor, F64X2) => (VecMisc2::Frintm, VectorSize::Size64x2),
2472                     (Opcode::Trunc, F32X4) => (VecMisc2::Frintz, VectorSize::Size32x4),
2473                     (Opcode::Trunc, F64X2) => (VecMisc2::Frintz, VectorSize::Size64x2),
2474                     (Opcode::Nearest, F32X4) => (VecMisc2::Frintn, VectorSize::Size32x4),
2475                     (Opcode::Nearest, F64X2) => (VecMisc2::Frintn, VectorSize::Size64x2),
2476                     _ => panic!("Unknown op/ty combination (vector){:?}", ty),
2477                 };
2478                 let rn = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None);
2479                 let rd = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
2480                 ctx.emit(Inst::VecMisc { op, rd, rn, size });
2481             }
2482         }
2483 
2484         Opcode::Fma => {
2485             let bits = ty_bits(ctx.output_ty(insn, 0));
2486             let fpu_op = match bits {
2487                 32 => FPUOp3::MAdd32,
2488                 64 => FPUOp3::MAdd64,
2489                 _ => panic!("Unknown op size"),
2490             };
2491             let rn = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None);
2492             let rm = put_input_in_reg(ctx, inputs[1], NarrowValueMode::None);
2493             let ra = put_input_in_reg(ctx, inputs[2], NarrowValueMode::None);
2494             let rd = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
2495             ctx.emit(Inst::FpuRRRR {
2496                 fpu_op,
2497                 rn,
2498                 rm,
2499                 ra,
2500                 rd,
2501             });
2502         }
2503 
2504         Opcode::Fcopysign => {
2505             // Copy the sign bit from inputs[1] to inputs[0]. We use the following sequence:
2506             //
2507             // This is a scalar Fcopysign.
2508             // This uses scalar NEON operations for 64-bit and vector operations (2S) for 32-bit.
2509             // In the latter case it still sets all bits except the lowest 32 to 0.
2510             //
2511             //  mov vd, vn
2512             //  ushr vtmp, vm, #63 / #31
2513             //  sli vd, vtmp, #63 / #31
2514 
2515             let ty = ctx.output_ty(insn, 0);
2516             let bits = ty_bits(ty) as u8;
2517             assert!(bits == 32 || bits == 64);
2518             let rn = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None);
2519             let rm = put_input_in_reg(ctx, inputs[1], NarrowValueMode::None);
2520             let rd = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
2521             let tmp = ctx.alloc_tmp(F64).only_reg().unwrap();
2522 
2523             // Copy LHS to rd.
2524             ctx.emit(Inst::gen_move(rd, rn, ty));
2525 
2526             // Copy the sign bit to the lowest bit in tmp.
2527             let imm = FPURightShiftImm::maybe_from_u8(bits - 1, bits).unwrap();
2528             ctx.emit(Inst::FpuRRI {
2529                 fpu_op: choose_32_64(ty, FPUOpRI::UShr32(imm), FPUOpRI::UShr64(imm)),
2530                 rd: tmp,
2531                 rn: rm,
2532             });
2533 
2534             // Insert the bit from tmp into the sign bit of rd.
2535             let imm = FPULeftShiftImm::maybe_from_u8(bits - 1, bits).unwrap();
2536             ctx.emit(Inst::FpuRRI {
2537                 fpu_op: choose_32_64(ty, FPUOpRI::Sli32(imm), FPUOpRI::Sli64(imm)),
2538                 rd,
2539                 rn: tmp.to_reg(),
2540             });
2541         }
2542 
2543         Opcode::FcvtToUint | Opcode::FcvtToSint => {
2544             let in_bits = ty_bits(ctx.input_ty(insn, 0));
2545             let out_bits = ty_bits(ctx.output_ty(insn, 0));
2546             let signed = op == Opcode::FcvtToSint;
2547             let op = match (signed, in_bits, out_bits) {
2548                 (false, 32, 8) | (false, 32, 16) | (false, 32, 32) => FpuToIntOp::F32ToU32,
2549                 (true, 32, 8) | (true, 32, 16) | (true, 32, 32) => FpuToIntOp::F32ToI32,
2550                 (false, 32, 64) => FpuToIntOp::F32ToU64,
2551                 (true, 32, 64) => FpuToIntOp::F32ToI64,
2552                 (false, 64, 8) | (false, 64, 16) | (false, 64, 32) => FpuToIntOp::F64ToU32,
2553                 (true, 64, 8) | (true, 64, 16) | (true, 64, 32) => FpuToIntOp::F64ToI32,
2554                 (false, 64, 64) => FpuToIntOp::F64ToU64,
2555                 (true, 64, 64) => FpuToIntOp::F64ToI64,
2556                 _ => panic!("Unknown input/output-bits combination"),
2557             };
2558 
2559             let rn = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None);
2560             let rd = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
2561 
2562             // First, check the output: it's important to carry the NaN conversion before the
2563             // in-bounds conversion, per wasm semantics.
2564 
2565             // Check that the input is not a NaN.
2566             if in_bits == 32 {
2567                 ctx.emit(Inst::FpuCmp32 { rn, rm: rn });
2568             } else {
2569                 ctx.emit(Inst::FpuCmp64 { rn, rm: rn });
2570             }
2571             let trap_code = TrapCode::BadConversionToInteger;
2572             ctx.emit(Inst::TrapIf {
2573                 trap_code,
2574                 kind: CondBrKind::Cond(lower_fp_condcode(FloatCC::Unordered)),
2575             });
2576 
2577             let tmp = ctx.alloc_tmp(I8X16).only_reg().unwrap();
2578 
2579             // Check that the input is in range, with "truncate towards zero" semantics. This means
2580             // we allow values that are slightly out of range:
2581             // - for signed conversions, we allow values strictly greater than INT_MIN-1 (when this
2582             // can be represented), and strictly less than INT_MAX+1 (when this can be
2583             // represented).
2584             // - for unsigned conversions, we allow values strictly greater than -1, and strictly
2585             // less than UINT_MAX+1 (when this can be represented).
2586 
2587             if in_bits == 32 {
2588                 // From float32.
2589                 let (low_bound, low_cond, high_bound) = match (signed, out_bits) {
2590                     (true, 8) => (
2591                         i8::min_value() as f32 - 1.,
2592                         FloatCC::GreaterThan,
2593                         i8::max_value() as f32 + 1.,
2594                     ),
2595                     (true, 16) => (
2596                         i16::min_value() as f32 - 1.,
2597                         FloatCC::GreaterThan,
2598                         i16::max_value() as f32 + 1.,
2599                     ),
2600                     (true, 32) => (
2601                         i32::min_value() as f32, // I32_MIN - 1 isn't precisely representable as a f32.
2602                         FloatCC::GreaterThanOrEqual,
2603                         i32::max_value() as f32 + 1.,
2604                     ),
2605                     (true, 64) => (
2606                         i64::min_value() as f32, // I64_MIN - 1 isn't precisely representable as a f32.
2607                         FloatCC::GreaterThanOrEqual,
2608                         i64::max_value() as f32 + 1.,
2609                     ),
2610                     (false, 8) => (-1., FloatCC::GreaterThan, u8::max_value() as f32 + 1.),
2611                     (false, 16) => (-1., FloatCC::GreaterThan, u16::max_value() as f32 + 1.),
2612                     (false, 32) => (-1., FloatCC::GreaterThan, u32::max_value() as f32 + 1.),
2613                     (false, 64) => (-1., FloatCC::GreaterThan, u64::max_value() as f32 + 1.),
2614                     _ => panic!("Unknown input/output-bits combination"),
2615                 };
2616 
2617                 // >= low_bound
2618                 lower_constant_f32(ctx, tmp, low_bound);
2619                 ctx.emit(Inst::FpuCmp32 {
2620                     rn,
2621                     rm: tmp.to_reg(),
2622                 });
2623                 let trap_code = TrapCode::IntegerOverflow;
2624                 ctx.emit(Inst::TrapIf {
2625                     trap_code,
2626                     kind: CondBrKind::Cond(lower_fp_condcode(low_cond).invert()),
2627                 });
2628 
2629                 // <= high_bound
2630                 lower_constant_f32(ctx, tmp, high_bound);
2631                 ctx.emit(Inst::FpuCmp32 {
2632                     rn,
2633                     rm: tmp.to_reg(),
2634                 });
2635                 let trap_code = TrapCode::IntegerOverflow;
2636                 ctx.emit(Inst::TrapIf {
2637                     trap_code,
2638                     kind: CondBrKind::Cond(lower_fp_condcode(FloatCC::LessThan).invert()),
2639                 });
2640             } else {
2641                 // From float64.
2642                 let (low_bound, low_cond, high_bound) = match (signed, out_bits) {
2643                     (true, 8) => (
2644                         i8::min_value() as f64 - 1.,
2645                         FloatCC::GreaterThan,
2646                         i8::max_value() as f64 + 1.,
2647                     ),
2648                     (true, 16) => (
2649                         i16::min_value() as f64 - 1.,
2650                         FloatCC::GreaterThan,
2651                         i16::max_value() as f64 + 1.,
2652                     ),
2653                     (true, 32) => (
2654                         i32::min_value() as f64 - 1.,
2655                         FloatCC::GreaterThan,
2656                         i32::max_value() as f64 + 1.,
2657                     ),
2658                     (true, 64) => (
2659                         i64::min_value() as f64, // I64_MIN - 1 is not precisely representable as an i64.
2660                         FloatCC::GreaterThanOrEqual,
2661                         i64::max_value() as f64 + 1.,
2662                     ),
2663                     (false, 8) => (-1., FloatCC::GreaterThan, u8::max_value() as f64 + 1.),
2664                     (false, 16) => (-1., FloatCC::GreaterThan, u16::max_value() as f64 + 1.),
2665                     (false, 32) => (-1., FloatCC::GreaterThan, u32::max_value() as f64 + 1.),
2666                     (false, 64) => (-1., FloatCC::GreaterThan, u64::max_value() as f64 + 1.),
2667                     _ => panic!("Unknown input/output-bits combination"),
2668                 };
2669 
2670                 // >= low_bound
2671                 lower_constant_f64(ctx, tmp, low_bound);
2672                 ctx.emit(Inst::FpuCmp64 {
2673                     rn,
2674                     rm: tmp.to_reg(),
2675                 });
2676                 let trap_code = TrapCode::IntegerOverflow;
2677                 ctx.emit(Inst::TrapIf {
2678                     trap_code,
2679                     kind: CondBrKind::Cond(lower_fp_condcode(low_cond).invert()),
2680                 });
2681 
2682                 // <= high_bound
2683                 lower_constant_f64(ctx, tmp, high_bound);
2684                 ctx.emit(Inst::FpuCmp64 {
2685                     rn,
2686                     rm: tmp.to_reg(),
2687                 });
2688                 let trap_code = TrapCode::IntegerOverflow;
2689                 ctx.emit(Inst::TrapIf {
2690                     trap_code,
2691                     kind: CondBrKind::Cond(lower_fp_condcode(FloatCC::LessThan).invert()),
2692                 });
2693             };
2694 
2695             // Do the conversion.
2696             ctx.emit(Inst::FpuToInt { op, rd, rn });
2697         }
2698 
2699         Opcode::FcvtFromUint | Opcode::FcvtFromSint => {
2700             let ty = ty.unwrap();
2701             let signed = op == Opcode::FcvtFromSint;
2702             let rd = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
2703 
2704             if ty.is_vector() {
2705                 let op = if signed {
2706                     VecMisc2::Scvtf
2707                 } else {
2708                     VecMisc2::Ucvtf
2709                 };
2710                 let rn = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None);
2711 
2712                 ctx.emit(Inst::VecMisc {
2713                     op,
2714                     rd,
2715                     rn,
2716                     size: VectorSize::from_ty(ty),
2717                 });
2718             } else {
2719                 let in_bits = ty_bits(ctx.input_ty(insn, 0));
2720                 let out_bits = ty_bits(ty);
2721                 let op = match (signed, in_bits, out_bits) {
2722                     (false, 8, 32) | (false, 16, 32) | (false, 32, 32) => IntToFpuOp::U32ToF32,
2723                     (true, 8, 32) | (true, 16, 32) | (true, 32, 32) => IntToFpuOp::I32ToF32,
2724                     (false, 8, 64) | (false, 16, 64) | (false, 32, 64) => IntToFpuOp::U32ToF64,
2725                     (true, 8, 64) | (true, 16, 64) | (true, 32, 64) => IntToFpuOp::I32ToF64,
2726                     (false, 64, 32) => IntToFpuOp::U64ToF32,
2727                     (true, 64, 32) => IntToFpuOp::I64ToF32,
2728                     (false, 64, 64) => IntToFpuOp::U64ToF64,
2729                     (true, 64, 64) => IntToFpuOp::I64ToF64,
2730                     _ => panic!("Unknown input/output-bits combination"),
2731                 };
2732                 let narrow_mode = match (signed, in_bits) {
2733                     (false, 8) | (false, 16) | (false, 32) => NarrowValueMode::ZeroExtend32,
2734                     (true, 8) | (true, 16) | (true, 32) => NarrowValueMode::SignExtend32,
2735                     (false, 64) => NarrowValueMode::ZeroExtend64,
2736                     (true, 64) => NarrowValueMode::SignExtend64,
2737                     _ => panic!("Unknown input size"),
2738                 };
2739                 let rn = put_input_in_reg(ctx, inputs[0], narrow_mode);
2740                 ctx.emit(Inst::IntToFpu { op, rd, rn });
2741             }
2742         }
2743 
2744         Opcode::FcvtToUintSat | Opcode::FcvtToSintSat => {
2745             let ty = ty.unwrap();
2746             let out_signed = op == Opcode::FcvtToSintSat;
2747             let rn = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None);
2748             let rd = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
2749 
2750             if ty.is_vector() {
2751                 let op = if out_signed {
2752                     VecMisc2::Fcvtzs
2753                 } else {
2754                     VecMisc2::Fcvtzu
2755                 };
2756 
2757                 ctx.emit(Inst::VecMisc {
2758                     op,
2759                     rd,
2760                     rn,
2761                     size: VectorSize::from_ty(ty),
2762                 });
2763             } else {
2764                 let in_ty = ctx.input_ty(insn, 0);
2765                 let in_bits = ty_bits(in_ty);
2766                 let out_bits = ty_bits(ty);
2767                 // FIMM Vtmp1, u32::MAX or u64::MAX or i32::MAX or i64::MAX
2768                 // FMIN Vtmp2, Vin, Vtmp1
2769                 // FIMM Vtmp1, 0 or 0 or i32::MIN or i64::MIN
2770                 // FMAX Vtmp2, Vtmp2, Vtmp1
2771                 // (if signed) FIMM Vtmp1, 0
2772                 // FCMP Vin, Vin
2773                 // FCSEL Vtmp2, Vtmp1, Vtmp2, NE  // on NaN, select 0
2774                 // convert Rout, Vtmp2
2775 
2776                 assert!(in_bits == 32 || in_bits == 64);
2777                 assert!(out_bits == 32 || out_bits == 64);
2778 
2779                 let min: f64 = match (out_bits, out_signed) {
2780                     (32, true) => std::i32::MIN as f64,
2781                     (32, false) => 0.0,
2782                     (64, true) => std::i64::MIN as f64,
2783                     (64, false) => 0.0,
2784                     _ => unreachable!(),
2785                 };
2786 
2787                 let max = match (out_bits, out_signed) {
2788                     (32, true) => std::i32::MAX as f64,
2789                     (32, false) => std::u32::MAX as f64,
2790                     (64, true) => std::i64::MAX as f64,
2791                     (64, false) => std::u64::MAX as f64,
2792                     _ => unreachable!(),
2793                 };
2794 
2795                 let rtmp1 = ctx.alloc_tmp(in_ty).only_reg().unwrap();
2796                 let rtmp2 = ctx.alloc_tmp(in_ty).only_reg().unwrap();
2797 
2798                 if in_bits == 32 {
2799                     lower_constant_f32(ctx, rtmp1, max as f32);
2800                 } else {
2801                     lower_constant_f64(ctx, rtmp1, max);
2802                 }
2803                 ctx.emit(Inst::FpuRRR {
2804                     fpu_op: choose_32_64(in_ty, FPUOp2::Min32, FPUOp2::Min64),
2805                     rd: rtmp2,
2806                     rn: rn,
2807                     rm: rtmp1.to_reg(),
2808                 });
2809                 if in_bits == 32 {
2810                     lower_constant_f32(ctx, rtmp1, min as f32);
2811                 } else {
2812                     lower_constant_f64(ctx, rtmp1, min);
2813                 }
2814                 ctx.emit(Inst::FpuRRR {
2815                     fpu_op: choose_32_64(in_ty, FPUOp2::Max32, FPUOp2::Max64),
2816                     rd: rtmp2,
2817                     rn: rtmp2.to_reg(),
2818                     rm: rtmp1.to_reg(),
2819                 });
2820                 if out_signed {
2821                     if in_bits == 32 {
2822                         lower_constant_f32(ctx, rtmp1, 0.0);
2823                     } else {
2824                         lower_constant_f64(ctx, rtmp1, 0.0);
2825                     }
2826                 }
2827                 if in_bits == 32 {
2828                     ctx.emit(Inst::FpuCmp32 { rn: rn, rm: rn });
2829                     ctx.emit(Inst::FpuCSel32 {
2830                         rd: rtmp2,
2831                         rn: rtmp1.to_reg(),
2832                         rm: rtmp2.to_reg(),
2833                         cond: Cond::Ne,
2834                     });
2835                 } else {
2836                     ctx.emit(Inst::FpuCmp64 { rn: rn, rm: rn });
2837                     ctx.emit(Inst::FpuCSel64 {
2838                         rd: rtmp2,
2839                         rn: rtmp1.to_reg(),
2840                         rm: rtmp2.to_reg(),
2841                         cond: Cond::Ne,
2842                     });
2843                 }
2844 
2845                 let cvt = match (in_bits, out_bits, out_signed) {
2846                     (32, 32, false) => FpuToIntOp::F32ToU32,
2847                     (32, 32, true) => FpuToIntOp::F32ToI32,
2848                     (32, 64, false) => FpuToIntOp::F32ToU64,
2849                     (32, 64, true) => FpuToIntOp::F32ToI64,
2850                     (64, 32, false) => FpuToIntOp::F64ToU32,
2851                     (64, 32, true) => FpuToIntOp::F64ToI32,
2852                     (64, 64, false) => FpuToIntOp::F64ToU64,
2853                     (64, 64, true) => FpuToIntOp::F64ToI64,
2854                     _ => unreachable!(),
2855                 };
2856                 ctx.emit(Inst::FpuToInt {
2857                     op: cvt,
2858                     rd,
2859                     rn: rtmp2.to_reg(),
2860                 });
2861             }
2862         }
2863 
2864         Opcode::IaddIfcout => {
2865             // This is a two-output instruction that is needed for the
2866             // legalizer's explicit heap-check sequence, among possible other
2867             // uses. Its second output is a flags output only ever meant to
2868             // check for overflow using the
2869             // `backend.unsigned_add_overflow_condition()` condition.
2870             //
2871             // Note that the CLIF validation will ensure that no flag-setting
2872             // operation comes between this IaddIfcout and its use (e.g., a
2873             // Trapif). Thus, we can rely on implicit communication through the
2874             // processor flags rather than explicitly generating flags into a
2875             // register. We simply use the variant of the add instruction that
2876             // sets flags (`adds`) here.
2877 
2878             // Note that the second output (the flags) need not be generated,
2879             // because flags are never materialized into a register; the only
2880             // instructions that can use a value of type `iflags` or `fflags`
2881             // will look directly for the flags-producing instruction (which can
2882             // always be found, by construction) and merge it.
2883 
2884             // Now handle the iadd as above, except use an AddS opcode that sets
2885             // flags.
2886             let rd = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
2887             let rn = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None);
2888             let rm = put_input_in_rse_imm12(ctx, inputs[1], NarrowValueMode::None);
2889             let ty = ty.unwrap();
2890             let alu_op = choose_32_64(ty, ALUOp::AddS32, ALUOp::AddS64);
2891             ctx.emit(alu_inst_imm12(alu_op, rd, rn, rm));
2892         }
2893 
2894         Opcode::IaddImm
2895         | Opcode::ImulImm
2896         | Opcode::UdivImm
2897         | Opcode::SdivImm
2898         | Opcode::UremImm
2899         | Opcode::SremImm
2900         | Opcode::IrsubImm
2901         | Opcode::IaddCin
2902         | Opcode::IaddIfcin
2903         | Opcode::IaddCout
2904         | Opcode::IaddCarry
2905         | Opcode::IaddIfcarry
2906         | Opcode::IsubBin
2907         | Opcode::IsubIfbin
2908         | Opcode::IsubBout
2909         | Opcode::IsubIfbout
2910         | Opcode::IsubBorrow
2911         | Opcode::IsubIfborrow
2912         | Opcode::BandImm
2913         | Opcode::BorImm
2914         | Opcode::BxorImm
2915         | Opcode::RotlImm
2916         | Opcode::RotrImm
2917         | Opcode::IshlImm
2918         | Opcode::UshrImm
2919         | Opcode::SshrImm
2920         | Opcode::IcmpImm
2921         | Opcode::IfcmpImm => {
2922             panic!("ALU+imm and ALU+carry ops should not appear here!");
2923         }
2924 
2925         #[cfg(feature = "x86")]
2926         Opcode::X86Udivmodx
2927         | Opcode::X86Sdivmodx
2928         | Opcode::X86Umulx
2929         | Opcode::X86Smulx
2930         | Opcode::X86Cvtt2si
2931         | Opcode::X86Fmin
2932         | Opcode::X86Fmax
2933         | Opcode::X86Push
2934         | Opcode::X86Pop
2935         | Opcode::X86Bsr
2936         | Opcode::X86Bsf
2937         | Opcode::X86Pblendw
2938         | Opcode::X86Pshufd
2939         | Opcode::X86Pshufb
2940         | Opcode::X86Pextr
2941         | Opcode::X86Pinsr
2942         | Opcode::X86Insertps
2943         | Opcode::X86Movsd
2944         | Opcode::X86Movlhps
2945         | Opcode::X86Palignr
2946         | Opcode::X86Psll
2947         | Opcode::X86Psrl
2948         | Opcode::X86Psra
2949         | Opcode::X86Ptest
2950         | Opcode::X86Pmaxs
2951         | Opcode::X86Pmaxu
2952         | Opcode::X86Pmins
2953         | Opcode::X86Pminu
2954         | Opcode::X86Pmullq
2955         | Opcode::X86Pmuludq
2956         | Opcode::X86Punpckh
2957         | Opcode::X86Punpckl
2958         | Opcode::X86Vcvtudq2ps
2959         | Opcode::X86ElfTlsGetAddr
2960         | Opcode::X86MachoTlsGetAddr => {
2961             panic!("x86-specific opcode in supposedly arch-neutral IR!");
2962         }
2963 
2964         Opcode::DummySargT => unreachable!(),
2965 
2966         Opcode::Iabs => {
2967             let rd = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
2968             let rn = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None);
2969             let ty = ty.unwrap();
2970             ctx.emit(Inst::VecMisc {
2971                 op: VecMisc2::Abs,
2972                 rd,
2973                 rn,
2974                 size: VectorSize::from_ty(ty),
2975             });
2976         }
2977         Opcode::AvgRound => {
2978             let rd = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
2979             let rn = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None);
2980             let rm = put_input_in_reg(ctx, inputs[1], NarrowValueMode::None);
2981             let ty = ty.unwrap();
2982             ctx.emit(Inst::VecRRR {
2983                 alu_op: VecALUOp::Urhadd,
2984                 rd,
2985                 rn,
2986                 rm,
2987                 size: VectorSize::from_ty(ty),
2988             });
2989         }
2990 
2991         Opcode::Snarrow | Opcode::Unarrow => {
2992             let op = if op == Opcode::Snarrow {
2993                 VecMiscNarrowOp::Sqxtn
2994             } else {
2995                 VecMiscNarrowOp::Sqxtun
2996             };
2997             let rd = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
2998             let rn = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None);
2999             let rn2 = put_input_in_reg(ctx, inputs[1], NarrowValueMode::None);
3000             let ty = ty.unwrap();
3001 
3002             ctx.emit(Inst::VecMiscNarrow {
3003                 op,
3004                 rd,
3005                 rn,
3006                 size: VectorSize::from_ty(ty),
3007                 high_half: false,
3008             });
3009             ctx.emit(Inst::VecMiscNarrow {
3010                 op,
3011                 rd,
3012                 rn: rn2,
3013                 size: VectorSize::from_ty(ty),
3014                 high_half: true,
3015             });
3016         }
3017 
3018         Opcode::SwidenLow | Opcode::SwidenHigh | Opcode::UwidenLow | Opcode::UwidenHigh => {
3019             let lane_type = ty.unwrap().lane_type();
3020             let rd = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
3021             let rn = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None);
3022             let (t, high_half) = match (lane_type, op) {
3023                 (I16, Opcode::SwidenLow) => (VecExtendOp::Sxtl8, false),
3024                 (I16, Opcode::SwidenHigh) => (VecExtendOp::Sxtl8, true),
3025                 (I16, Opcode::UwidenLow) => (VecExtendOp::Uxtl8, false),
3026                 (I16, Opcode::UwidenHigh) => (VecExtendOp::Uxtl8, true),
3027                 (I32, Opcode::SwidenLow) => (VecExtendOp::Sxtl16, false),
3028                 (I32, Opcode::SwidenHigh) => (VecExtendOp::Sxtl16, true),
3029                 (I32, Opcode::UwidenLow) => (VecExtendOp::Uxtl16, false),
3030                 (I32, Opcode::UwidenHigh) => (VecExtendOp::Uxtl16, true),
3031                 _ => {
3032                     return Err(CodegenError::Unsupported(format!(
3033                         "Unsupported SIMD vector lane type: {:?}",
3034                         lane_type
3035                     )));
3036                 }
3037             };
3038 
3039             ctx.emit(Inst::VecExtend {
3040                 t,
3041                 rd,
3042                 rn,
3043                 high_half,
3044             });
3045         }
3046 
3047         Opcode::TlsValue => unimplemented!("tls_value"),
3048         Opcode::FcvtLowFromSint => unimplemented!("FcvtLowFromSint"),
3049     }
3050 
3051     Ok(())
3052 }
3053 
lower_branch<C: LowerCtx<I = Inst>>( ctx: &mut C, branches: &[IRInst], targets: &[MachLabel], ) -> CodegenResult<()>3054 pub(crate) fn lower_branch<C: LowerCtx<I = Inst>>(
3055     ctx: &mut C,
3056     branches: &[IRInst],
3057     targets: &[MachLabel],
3058 ) -> CodegenResult<()> {
3059     // A block should end with at most two branches. The first may be a
3060     // conditional branch; a conditional branch can be followed only by an
3061     // unconditional branch or fallthrough. Otherwise, if only one branch,
3062     // it may be an unconditional branch, a fallthrough, a return, or a
3063     // trap. These conditions are verified by `is_ebb_basic()` during the
3064     // verifier pass.
3065     assert!(branches.len() <= 2);
3066 
3067     if branches.len() == 2 {
3068         // Must be a conditional branch followed by an unconditional branch.
3069         let op0 = ctx.data(branches[0]).opcode();
3070         let op1 = ctx.data(branches[1]).opcode();
3071 
3072         assert!(op1 == Opcode::Jump || op1 == Opcode::Fallthrough);
3073         let taken = BranchTarget::Label(targets[0]);
3074         // not_taken target is the target of the second branch, even if it is a Fallthrough
3075         // instruction: because we reorder blocks while we lower, the fallthrough in the new
3076         // order is not (necessarily) the same as the fallthrough in CLIF. So we use the
3077         // explicitly-provided target.
3078         let not_taken = BranchTarget::Label(targets[1]);
3079 
3080         match op0 {
3081             Opcode::Brz | Opcode::Brnz => {
3082                 let flag_input = InsnInput {
3083                     insn: branches[0],
3084                     input: 0,
3085                 };
3086                 if let Some(icmp_insn) =
3087                     maybe_input_insn_via_conv(ctx, flag_input, Opcode::Icmp, Opcode::Bint)
3088                 {
3089                     let condcode = ctx.data(icmp_insn).cond_code().unwrap();
3090                     let cond = lower_condcode(condcode);
3091                     let is_signed = condcode_is_signed(condcode);
3092                     let negated = op0 == Opcode::Brz;
3093                     let cond = if negated { cond.invert() } else { cond };
3094 
3095                     lower_icmp_or_ifcmp_to_flags(ctx, icmp_insn, is_signed);
3096                     ctx.emit(Inst::CondBr {
3097                         taken,
3098                         not_taken,
3099                         kind: CondBrKind::Cond(cond),
3100                     });
3101                 } else if let Some(fcmp_insn) =
3102                     maybe_input_insn_via_conv(ctx, flag_input, Opcode::Fcmp, Opcode::Bint)
3103                 {
3104                     let condcode = ctx.data(fcmp_insn).fp_cond_code().unwrap();
3105                     let cond = lower_fp_condcode(condcode);
3106                     let negated = op0 == Opcode::Brz;
3107                     let cond = if negated { cond.invert() } else { cond };
3108 
3109                     lower_fcmp_or_ffcmp_to_flags(ctx, fcmp_insn);
3110                     ctx.emit(Inst::CondBr {
3111                         taken,
3112                         not_taken,
3113                         kind: CondBrKind::Cond(cond),
3114                     });
3115                 } else {
3116                     let rt = put_input_in_reg(
3117                         ctx,
3118                         InsnInput {
3119                             insn: branches[0],
3120                             input: 0,
3121                         },
3122                         NarrowValueMode::ZeroExtend64,
3123                     );
3124                     let kind = match op0 {
3125                         Opcode::Brz => CondBrKind::Zero(rt),
3126                         Opcode::Brnz => CondBrKind::NotZero(rt),
3127                         _ => unreachable!(),
3128                     };
3129                     ctx.emit(Inst::CondBr {
3130                         taken,
3131                         not_taken,
3132                         kind,
3133                     });
3134                 }
3135             }
3136             Opcode::BrIcmp => {
3137                 let condcode = ctx.data(branches[0]).cond_code().unwrap();
3138                 let cond = lower_condcode(condcode);
3139                 let kind = CondBrKind::Cond(cond);
3140 
3141                 let is_signed = condcode_is_signed(condcode);
3142                 let ty = ctx.input_ty(branches[0], 0);
3143                 let bits = ty_bits(ty);
3144                 let narrow_mode = match (bits <= 32, is_signed) {
3145                     (true, true) => NarrowValueMode::SignExtend32,
3146                     (true, false) => NarrowValueMode::ZeroExtend32,
3147                     (false, true) => NarrowValueMode::SignExtend64,
3148                     (false, false) => NarrowValueMode::ZeroExtend64,
3149                 };
3150                 let rn = put_input_in_reg(
3151                     ctx,
3152                     InsnInput {
3153                         insn: branches[0],
3154                         input: 0,
3155                     },
3156                     narrow_mode,
3157                 );
3158                 let rm = put_input_in_rse_imm12(
3159                     ctx,
3160                     InsnInput {
3161                         insn: branches[0],
3162                         input: 1,
3163                     },
3164                     narrow_mode,
3165                 );
3166 
3167                 let alu_op = choose_32_64(ty, ALUOp::SubS32, ALUOp::SubS64);
3168                 let rd = writable_zero_reg();
3169                 ctx.emit(alu_inst_imm12(alu_op, rd, rn, rm));
3170                 ctx.emit(Inst::CondBr {
3171                     taken,
3172                     not_taken,
3173                     kind,
3174                 });
3175             }
3176 
3177             Opcode::Brif => {
3178                 let condcode = ctx.data(branches[0]).cond_code().unwrap();
3179                 let cond = lower_condcode(condcode);
3180                 let kind = CondBrKind::Cond(cond);
3181 
3182                 let is_signed = condcode_is_signed(condcode);
3183                 let flag_input = InsnInput {
3184                     insn: branches[0],
3185                     input: 0,
3186                 };
3187                 if let Some(ifcmp_insn) = maybe_input_insn(ctx, flag_input, Opcode::Ifcmp) {
3188                     lower_icmp_or_ifcmp_to_flags(ctx, ifcmp_insn, is_signed);
3189                     ctx.emit(Inst::CondBr {
3190                         taken,
3191                         not_taken,
3192                         kind,
3193                     });
3194                 } else {
3195                     // If the ifcmp result is actually placed in a
3196                     // register, we need to move it back into the flags.
3197                     let rn = put_input_in_reg(ctx, flag_input, NarrowValueMode::None);
3198                     ctx.emit(Inst::MovToNZCV { rn });
3199                     ctx.emit(Inst::CondBr {
3200                         taken,
3201                         not_taken,
3202                         kind,
3203                     });
3204                 }
3205             }
3206 
3207             Opcode::Brff => {
3208                 let condcode = ctx.data(branches[0]).fp_cond_code().unwrap();
3209                 let cond = lower_fp_condcode(condcode);
3210                 let kind = CondBrKind::Cond(cond);
3211                 let flag_input = InsnInput {
3212                     insn: branches[0],
3213                     input: 0,
3214                 };
3215                 if let Some(ffcmp_insn) = maybe_input_insn(ctx, flag_input, Opcode::Ffcmp) {
3216                     lower_fcmp_or_ffcmp_to_flags(ctx, ffcmp_insn);
3217                     ctx.emit(Inst::CondBr {
3218                         taken,
3219                         not_taken,
3220                         kind,
3221                     });
3222                 } else {
3223                     // If the ffcmp result is actually placed in a
3224                     // register, we need to move it back into the flags.
3225                     let rn = put_input_in_reg(ctx, flag_input, NarrowValueMode::None);
3226                     ctx.emit(Inst::MovToNZCV { rn });
3227                     ctx.emit(Inst::CondBr {
3228                         taken,
3229                         not_taken,
3230                         kind,
3231                     });
3232                 }
3233             }
3234 
3235             _ => unimplemented!(),
3236         }
3237     } else {
3238         // Must be an unconditional branch or an indirect branch.
3239         let op = ctx.data(branches[0]).opcode();
3240         match op {
3241             Opcode::Jump | Opcode::Fallthrough => {
3242                 assert!(branches.len() == 1);
3243                 // In the Fallthrough case, the machine-independent driver
3244                 // fills in `targets[0]` with our fallthrough block, so this
3245                 // is valid for both Jump and Fallthrough.
3246                 ctx.emit(Inst::Jump {
3247                     dest: BranchTarget::Label(targets[0]),
3248                 });
3249             }
3250 
3251             Opcode::BrTable => {
3252                 // Expand `br_table index, default, JT` to:
3253                 //
3254                 //   emit_island  // this forces an island at this point
3255                 //                // if the jumptable would push us past
3256                 //                // the deadline
3257                 //   subs idx, #jt_size
3258                 //   b.hs default
3259                 //   adr vTmp1, PC+16
3260                 //   ldr vTmp2, [vTmp1, idx, lsl #2]
3261                 //   add vTmp2, vTmp2, vTmp1
3262                 //   br vTmp2
3263                 //   [jumptable offsets relative to JT base]
3264                 let jt_size = targets.len() - 1;
3265                 assert!(jt_size <= std::u32::MAX as usize);
3266 
3267                 ctx.emit(Inst::EmitIsland {
3268                     needed_space: 4 * (6 + jt_size) as CodeOffset,
3269                 });
3270 
3271                 let ridx = put_input_in_reg(
3272                     ctx,
3273                     InsnInput {
3274                         insn: branches[0],
3275                         input: 0,
3276                     },
3277                     NarrowValueMode::ZeroExtend32,
3278                 );
3279 
3280                 let rtmp1 = ctx.alloc_tmp(I32).only_reg().unwrap();
3281                 let rtmp2 = ctx.alloc_tmp(I32).only_reg().unwrap();
3282 
3283                 // Bounds-check, leaving condition codes for JTSequence's
3284                 // branch to default target below.
3285                 if let Some(imm12) = Imm12::maybe_from_u64(jt_size as u64) {
3286                     ctx.emit(Inst::AluRRImm12 {
3287                         alu_op: ALUOp::SubS32,
3288                         rd: writable_zero_reg(),
3289                         rn: ridx,
3290                         imm12,
3291                     });
3292                 } else {
3293                     lower_constant_u64(ctx, rtmp1, jt_size as u64);
3294                     ctx.emit(Inst::AluRRR {
3295                         alu_op: ALUOp::SubS32,
3296                         rd: writable_zero_reg(),
3297                         rn: ridx,
3298                         rm: rtmp1.to_reg(),
3299                     });
3300                 }
3301 
3302                 // Emit the compound instruction that does:
3303                 //
3304                 // b.hs default
3305                 // adr rA, jt
3306                 // ldrsw rB, [rA, rIndex, UXTW 2]
3307                 // add rA, rA, rB
3308                 // br rA
3309                 // [jt entries]
3310                 //
3311                 // This must be *one* instruction in the vcode because
3312                 // we cannot allow regalloc to insert any spills/fills
3313                 // in the middle of the sequence; otherwise, the ADR's
3314                 // PC-rel offset to the jumptable would be incorrect.
3315                 // (The alternative is to introduce a relocation pass
3316                 // for inlined jumptables, which is much worse, IMHO.)
3317 
3318                 let jt_targets: Vec<BranchTarget> = targets
3319                     .iter()
3320                     .skip(1)
3321                     .map(|bix| BranchTarget::Label(*bix))
3322                     .collect();
3323                 let default_target = BranchTarget::Label(targets[0]);
3324                 let targets_for_term: Vec<MachLabel> = targets.to_vec();
3325                 ctx.emit(Inst::JTSequence {
3326                     ridx,
3327                     rtmp1,
3328                     rtmp2,
3329                     info: Box::new(JTSequenceInfo {
3330                         targets: jt_targets,
3331                         default_target,
3332                         targets_for_term,
3333                     }),
3334                 });
3335             }
3336 
3337             _ => panic!("Unknown branch type!"),
3338         }
3339     }
3340 
3341     Ok(())
3342 }
3343