1 //! Lowering rules for X64.
2 
3 use crate::data_value::DataValue;
4 use crate::ir::{
5     condcodes::FloatCC, condcodes::IntCC, types, AbiParam, ArgumentPurpose, ExternalName,
6     Inst as IRInst, InstructionData, LibCall, Opcode, Signature, Type,
7 };
8 use crate::isa::x64::abi::*;
9 use crate::isa::x64::inst::args::*;
10 use crate::isa::x64::inst::*;
11 use crate::isa::{x64::settings as x64_settings, x64::X64Backend, CallConv};
12 use crate::machinst::lower::*;
13 use crate::machinst::*;
14 use crate::result::CodegenResult;
15 use crate::settings::{Flags, TlsModel};
16 use alloc::boxed::Box;
17 use alloc::vec::Vec;
18 use cranelift_codegen_shared::condcodes::CondCode;
19 use log::trace;
20 use regalloc::{Reg, RegClass, Writable};
21 use smallvec::{smallvec, SmallVec};
22 use std::convert::TryFrom;
23 use target_lexicon::Triple;
24 
25 //=============================================================================
26 // Helpers for instruction lowering.
27 
is_int_or_ref_ty(ty: Type) -> bool28 fn is_int_or_ref_ty(ty: Type) -> bool {
29     match ty {
30         types::I8 | types::I16 | types::I32 | types::I64 | types::R64 => true,
31         types::B1 | types::B8 | types::B16 | types::B32 | types::B64 => true,
32         types::R32 => panic!("shouldn't have 32-bits refs on x64"),
33         _ => false,
34     }
35 }
36 
is_bool_ty(ty: Type) -> bool37 fn is_bool_ty(ty: Type) -> bool {
38     match ty {
39         types::B1 | types::B8 | types::B16 | types::B32 | types::B64 => true,
40         types::R32 => panic!("shouldn't have 32-bits refs on x64"),
41         _ => false,
42     }
43 }
44 
45 /// This is target-word-size dependent.  And it excludes booleans and reftypes.
is_valid_atomic_transaction_ty(ty: Type) -> bool46 fn is_valid_atomic_transaction_ty(ty: Type) -> bool {
47     match ty {
48         types::I8 | types::I16 | types::I32 | types::I64 => true,
49         _ => false,
50     }
51 }
52 
53 /// Returns whether the given specified `input` is a result produced by an instruction with Opcode
54 /// `op`.
55 // TODO investigate failures with checking against the result index.
matches_input<C: LowerCtx<I = Inst>>( ctx: &mut C, input: InsnInput, op: Opcode, ) -> Option<IRInst>56 fn matches_input<C: LowerCtx<I = Inst>>(
57     ctx: &mut C,
58     input: InsnInput,
59     op: Opcode,
60 ) -> Option<IRInst> {
61     let inputs = ctx.get_input_as_source_or_const(input.insn, input.input);
62     inputs.inst.and_then(|(src_inst, _)| {
63         let data = ctx.data(src_inst);
64         if data.opcode() == op {
65             return Some(src_inst);
66         }
67         None
68     })
69 }
70 
71 /// Returns whether the given specified `input` is a result produced by an instruction with any of
72 /// the opcodes specified in `ops`.
matches_input_any<C: LowerCtx<I = Inst>>( ctx: &mut C, input: InsnInput, ops: &[Opcode], ) -> Option<IRInst>73 fn matches_input_any<C: LowerCtx<I = Inst>>(
74     ctx: &mut C,
75     input: InsnInput,
76     ops: &[Opcode],
77 ) -> Option<IRInst> {
78     let inputs = ctx.get_input_as_source_or_const(input.insn, input.input);
79     inputs.inst.and_then(|(src_inst, _)| {
80         let data = ctx.data(src_inst);
81         for &op in ops {
82             if data.opcode() == op {
83                 return Some(src_inst);
84             }
85         }
86         None
87     })
88 }
89 
90 /// Emits instruction(s) to generate the given 64-bit constant value into a newly-allocated
91 /// temporary register, returning that register.
generate_constant<C: LowerCtx<I = Inst>>(ctx: &mut C, ty: Type, c: u64) -> ValueRegs<Reg>92 fn generate_constant<C: LowerCtx<I = Inst>>(ctx: &mut C, ty: Type, c: u64) -> ValueRegs<Reg> {
93     let from_bits = ty_bits(ty);
94     let masked = if from_bits < 64 {
95         c & ((1u64 << from_bits) - 1)
96     } else {
97         c
98     };
99 
100     let cst_copy = ctx.alloc_tmp(ty);
101     for inst in Inst::gen_constant(cst_copy, masked as u128, ty, |ty| {
102         ctx.alloc_tmp(ty).only_reg().unwrap()
103     })
104     .into_iter()
105     {
106         ctx.emit(inst);
107     }
108     non_writable_value_regs(cst_copy)
109 }
110 
111 /// Put the given input into possibly multiple registers, and mark it as used (side-effect).
put_input_in_regs<C: LowerCtx<I = Inst>>(ctx: &mut C, spec: InsnInput) -> ValueRegs<Reg>112 fn put_input_in_regs<C: LowerCtx<I = Inst>>(ctx: &mut C, spec: InsnInput) -> ValueRegs<Reg> {
113     let ty = ctx.input_ty(spec.insn, spec.input);
114     let input = ctx.get_input_as_source_or_const(spec.insn, spec.input);
115 
116     if let Some(c) = input.constant {
117         // Generate constants fresh at each use to minimize long-range register pressure.
118         generate_constant(ctx, ty, c)
119     } else {
120         ctx.put_input_in_regs(spec.insn, spec.input)
121     }
122 }
123 
124 /// Put the given input into a register, and mark it as used (side-effect).
put_input_in_reg<C: LowerCtx<I = Inst>>(ctx: &mut C, spec: InsnInput) -> Reg125 fn put_input_in_reg<C: LowerCtx<I = Inst>>(ctx: &mut C, spec: InsnInput) -> Reg {
126     put_input_in_regs(ctx, spec)
127         .only_reg()
128         .expect("Multi-register value not expected")
129 }
130 
131 /// Determines whether a load operation (indicated by `src_insn`) can be merged
132 /// into the current lowering point. If so, returns the address-base source (as
133 /// an `InsnInput`) and an offset from that address from which to perform the
134 /// load.
is_mergeable_load<C: LowerCtx<I = Inst>>( ctx: &mut C, src_insn: IRInst, ) -> Option<(InsnInput, i32)>135 fn is_mergeable_load<C: LowerCtx<I = Inst>>(
136     ctx: &mut C,
137     src_insn: IRInst,
138 ) -> Option<(InsnInput, i32)> {
139     let insn_data = ctx.data(src_insn);
140     let inputs = ctx.num_inputs(src_insn);
141     if inputs != 1 {
142         return None;
143     }
144 
145     let load_ty = ctx.output_ty(src_insn, 0);
146     if ty_bits(load_ty) < 32 {
147         // Narrower values are handled by ALU insts that are at least 32 bits
148         // wide, which is normally OK as we ignore upper buts; but, if we
149         // generate, e.g., a direct-from-memory 32-bit add for a byte value and
150         // the byte is the last byte in a page, the extra data that we load is
151         // incorrectly accessed. So we only allow loads to merge for
152         // 32-bit-and-above widths.
153         return None;
154     }
155 
156     // Just testing the opcode is enough, because the width will always match if
157     // the type does (and the type should match if the CLIF is properly
158     // constructed).
159     if insn_data.opcode() == Opcode::Load {
160         let offset = insn_data
161             .load_store_offset()
162             .expect("load should have offset");
163         Some((
164             InsnInput {
165                 insn: src_insn,
166                 input: 0,
167             },
168             offset,
169         ))
170     } else {
171         None
172     }
173 }
174 
175 /// Put the given input into a register or a memory operand.
176 /// Effectful: may mark the given input as used, when returning the register form.
input_to_reg_mem<C: LowerCtx<I = Inst>>(ctx: &mut C, spec: InsnInput) -> RegMem177 fn input_to_reg_mem<C: LowerCtx<I = Inst>>(ctx: &mut C, spec: InsnInput) -> RegMem {
178     let inputs = ctx.get_input_as_source_or_const(spec.insn, spec.input);
179 
180     if let Some(c) = inputs.constant {
181         // Generate constants fresh at each use to minimize long-range register pressure.
182         let ty = ctx.input_ty(spec.insn, spec.input);
183         return RegMem::reg(generate_constant(ctx, ty, c).only_reg().unwrap());
184     }
185 
186     if let Some((src_insn, 0)) = inputs.inst {
187         if let Some((addr_input, offset)) = is_mergeable_load(ctx, src_insn) {
188             ctx.sink_inst(src_insn);
189             let amode = lower_to_amode(ctx, addr_input, offset);
190             return RegMem::mem(amode);
191         }
192     }
193 
194     RegMem::reg(
195         ctx.put_input_in_regs(spec.insn, spec.input)
196             .only_reg()
197             .unwrap(),
198     )
199 }
200 
201 /// An extension specification for `extend_input_to_reg`.
202 #[derive(Clone, Copy)]
203 enum ExtSpec {
204     ZeroExtendTo32,
205     ZeroExtendTo64,
206     SignExtendTo32,
207     #[allow(dead_code)] // not used just yet but may be used in the future!
208     SignExtendTo64,
209 }
210 
211 /// Put the given input into a register, marking it as used, and do a zero- or signed- extension if
212 /// required. (This obviously causes side-effects.)
extend_input_to_reg<C: LowerCtx<I = Inst>>( ctx: &mut C, spec: InsnInput, ext_spec: ExtSpec, ) -> Reg213 fn extend_input_to_reg<C: LowerCtx<I = Inst>>(
214     ctx: &mut C,
215     spec: InsnInput,
216     ext_spec: ExtSpec,
217 ) -> Reg {
218     let requested_size = match ext_spec {
219         ExtSpec::ZeroExtendTo32 | ExtSpec::SignExtendTo32 => 32,
220         ExtSpec::ZeroExtendTo64 | ExtSpec::SignExtendTo64 => 64,
221     };
222     let input_size = ctx.input_ty(spec.insn, spec.input).bits();
223 
224     let requested_ty = if requested_size == 32 {
225         types::I32
226     } else {
227         types::I64
228     };
229 
230     let ext_mode = match (input_size, requested_size) {
231         (a, b) if a == b => return put_input_in_reg(ctx, spec),
232         (1, 8) => return put_input_in_reg(ctx, spec),
233         (a, b) => ExtMode::new(a, b).expect(&format!("invalid extension: {} -> {}", a, b)),
234     };
235 
236     let src = input_to_reg_mem(ctx, spec);
237     let dst = ctx.alloc_tmp(requested_ty).only_reg().unwrap();
238     match ext_spec {
239         ExtSpec::ZeroExtendTo32 | ExtSpec::ZeroExtendTo64 => {
240             ctx.emit(Inst::movzx_rm_r(ext_mode, src, dst))
241         }
242         ExtSpec::SignExtendTo32 | ExtSpec::SignExtendTo64 => {
243             ctx.emit(Inst::movsx_rm_r(ext_mode, src, dst))
244         }
245     }
246     dst.to_reg()
247 }
248 
249 /// Returns whether the given input is an immediate that can be properly sign-extended, without any
250 /// possible side-effect.
non_reg_input_to_sext_imm(input: NonRegInput, input_ty: Type) -> Option<u32>251 fn non_reg_input_to_sext_imm(input: NonRegInput, input_ty: Type) -> Option<u32> {
252     input.constant.and_then(|x| {
253         // For i64 instructions (prefixed with REX.W), require that the immediate will sign-extend
254         // to 64 bits. For other sizes, it doesn't matter and we can just use the plain
255         // constant.
256         if input_ty.bytes() != 8 || low32_will_sign_extend_to_64(x) {
257             Some(x as u32)
258         } else {
259             None
260         }
261     })
262 }
263 
input_to_imm<C: LowerCtx<I = Inst>>(ctx: &mut C, spec: InsnInput) -> Option<u64>264 fn input_to_imm<C: LowerCtx<I = Inst>>(ctx: &mut C, spec: InsnInput) -> Option<u64> {
265     ctx.get_input_as_source_or_const(spec.insn, spec.input)
266         .constant
267 }
268 
269 /// Put the given input into an immediate, a register or a memory operand.
270 /// Effectful: may mark the given input as used, when returning the register form.
input_to_reg_mem_imm<C: LowerCtx<I = Inst>>(ctx: &mut C, spec: InsnInput) -> RegMemImm271 fn input_to_reg_mem_imm<C: LowerCtx<I = Inst>>(ctx: &mut C, spec: InsnInput) -> RegMemImm {
272     let input = ctx.get_input_as_source_or_const(spec.insn, spec.input);
273     let input_ty = ctx.input_ty(spec.insn, spec.input);
274     match non_reg_input_to_sext_imm(input, input_ty) {
275         Some(x) => RegMemImm::imm(x),
276         None => match input_to_reg_mem(ctx, spec) {
277             RegMem::Reg { reg } => RegMemImm::reg(reg),
278             RegMem::Mem { addr } => RegMemImm::mem(addr),
279         },
280     }
281 }
282 
283 /// Emit an instruction to insert a value `src` into a lane of `dst`.
emit_insert_lane<C: LowerCtx<I = Inst>>( ctx: &mut C, src: RegMem, dst: Writable<Reg>, lane: u8, ty: Type, )284 fn emit_insert_lane<C: LowerCtx<I = Inst>>(
285     ctx: &mut C,
286     src: RegMem,
287     dst: Writable<Reg>,
288     lane: u8,
289     ty: Type,
290 ) {
291     if !ty.is_float() {
292         let (sse_op, size) = match ty.lane_bits() {
293             8 => (SseOpcode::Pinsrb, OperandSize::Size32),
294             16 => (SseOpcode::Pinsrw, OperandSize::Size32),
295             32 => (SseOpcode::Pinsrd, OperandSize::Size32),
296             64 => (SseOpcode::Pinsrd, OperandSize::Size64),
297             _ => panic!("Unable to insertlane for lane size: {}", ty.lane_bits()),
298         };
299         ctx.emit(Inst::xmm_rm_r_imm(sse_op, src, dst, lane, size));
300     } else if ty == types::F32 {
301         let sse_op = SseOpcode::Insertps;
302         // Insert 32-bits from replacement (at index 00, bits 7:8) to vector (lane
303         // shifted into bits 5:6).
304         let lane = 0b00_00_00_00 | lane << 4;
305         ctx.emit(Inst::xmm_rm_r_imm(
306             sse_op,
307             src,
308             dst,
309             lane,
310             OperandSize::Size32,
311         ));
312     } else if ty == types::F64 {
313         let sse_op = match lane {
314             // Move the lowest quadword in replacement to vector without changing
315             // the upper bits.
316             0 => SseOpcode::Movsd,
317             // Move the low 64 bits of replacement vector to the high 64 bits of the
318             // vector.
319             1 => SseOpcode::Movlhps,
320             _ => unreachable!(),
321         };
322         // Here we use the `xmm_rm_r` encoding because it correctly tells the register
323         // allocator how we are using `dst`: we are using `dst` as a `mod` whereas other
324         // encoding formats like `xmm_unary_rm_r` treat it as a `def`.
325         ctx.emit(Inst::xmm_rm_r(sse_op, src, dst));
326     } else {
327         panic!("unable to emit insertlane for type: {}", ty)
328     }
329 }
330 
331 /// Emit an instruction to extract a lane of `src` into `dst`.
emit_extract_lane<C: LowerCtx<I = Inst>>( ctx: &mut C, src: Reg, dst: Writable<Reg>, lane: u8, ty: Type, )332 fn emit_extract_lane<C: LowerCtx<I = Inst>>(
333     ctx: &mut C,
334     src: Reg,
335     dst: Writable<Reg>,
336     lane: u8,
337     ty: Type,
338 ) {
339     if !ty.is_float() {
340         let (sse_op, size) = match ty.lane_bits() {
341             8 => (SseOpcode::Pextrb, OperandSize::Size32),
342             16 => (SseOpcode::Pextrw, OperandSize::Size32),
343             32 => (SseOpcode::Pextrd, OperandSize::Size32),
344             64 => (SseOpcode::Pextrd, OperandSize::Size64),
345             _ => panic!("Unable to extractlane for lane size: {}", ty.lane_bits()),
346         };
347         let src = RegMem::reg(src);
348         ctx.emit(Inst::xmm_rm_r_imm(sse_op, src, dst, lane, size));
349     } else if ty == types::F32 || ty == types::F64 {
350         if lane == 0 {
351             // Remove the extractlane instruction, leaving the float where it is. The upper
352             // bits will remain unchanged; for correctness, this relies on Cranelift type
353             // checking to avoid using those bits.
354             ctx.emit(Inst::gen_move(dst, src, ty));
355         } else {
356             // Otherwise, shuffle the bits in `lane` to the lowest lane.
357             let sse_op = SseOpcode::Pshufd;
358             let mask = match ty {
359                 // Move the value at `lane` to lane 0, copying existing value at lane 0 to
360                 // other lanes. Again, this relies on Cranelift type checking to avoid
361                 // using those bits.
362                 types::F32 => {
363                     assert!(lane > 0 && lane < 4);
364                     0b00_00_00_00 | lane
365                 }
366                 // Move the value at `lane` 1 (we know it must be 1 because of the `if`
367                 // statement above) to lane 0 and leave lane 1 unchanged. The Cranelift type
368                 // checking assumption also applies here.
369                 types::F64 => {
370                     assert!(lane == 1);
371                     0b11_10_11_10
372                 }
373                 _ => unreachable!(),
374             };
375             let src = RegMem::reg(src);
376             ctx.emit(Inst::xmm_rm_r_imm(
377                 sse_op,
378                 src,
379                 dst,
380                 mask,
381                 OperandSize::Size32,
382             ));
383         }
384     } else {
385         panic!("unable to emit extractlane for type: {}", ty)
386     }
387 }
388 
389 /// Emits an int comparison instruction.
390 ///
391 /// Note: make sure that there are no instructions modifying the flags between a call to this
392 /// function and the use of the flags!
393 ///
394 /// Takes the condition code that will be tested, and returns
395 /// the condition code that should be used. This allows us to
396 /// synthesize comparisons out of multiple instructions for
397 /// special cases (e.g., 128-bit integers).
emit_cmp<C: LowerCtx<I = Inst>>(ctx: &mut C, insn: IRInst, cc: IntCC) -> IntCC398 fn emit_cmp<C: LowerCtx<I = Inst>>(ctx: &mut C, insn: IRInst, cc: IntCC) -> IntCC {
399     let ty = ctx.input_ty(insn, 0);
400 
401     let inputs = [InsnInput { insn, input: 0 }, InsnInput { insn, input: 1 }];
402 
403     if ty == types::I128 {
404         // We need to compare both halves and combine the results appropriately.
405         let cmp1 = ctx.alloc_tmp(types::I64).only_reg().unwrap();
406         let cmp2 = ctx.alloc_tmp(types::I64).only_reg().unwrap();
407         let lhs = put_input_in_regs(ctx, inputs[0]);
408         let lhs_lo = lhs.regs()[0];
409         let lhs_hi = lhs.regs()[1];
410         let rhs = put_input_in_regs(ctx, inputs[1]);
411         let rhs_lo = RegMemImm::reg(rhs.regs()[0]);
412         let rhs_hi = RegMemImm::reg(rhs.regs()[1]);
413         match cc {
414             IntCC::Equal => {
415                 ctx.emit(Inst::cmp_rmi_r(OperandSize::Size64, rhs_hi, lhs_hi));
416                 ctx.emit(Inst::setcc(CC::Z, cmp1));
417                 ctx.emit(Inst::cmp_rmi_r(OperandSize::Size64, rhs_lo, lhs_lo));
418                 ctx.emit(Inst::setcc(CC::Z, cmp2));
419                 ctx.emit(Inst::alu_rmi_r(
420                     OperandSize::Size64,
421                     AluRmiROpcode::And,
422                     RegMemImm::reg(cmp1.to_reg()),
423                     cmp2,
424                 ));
425                 ctx.emit(Inst::alu_rmi_r(
426                     OperandSize::Size64,
427                     AluRmiROpcode::And,
428                     RegMemImm::imm(1),
429                     cmp2,
430                 ));
431                 IntCC::NotEqual
432             }
433             IntCC::NotEqual => {
434                 ctx.emit(Inst::cmp_rmi_r(OperandSize::Size64, rhs_hi, lhs_hi));
435                 ctx.emit(Inst::setcc(CC::NZ, cmp1));
436                 ctx.emit(Inst::cmp_rmi_r(OperandSize::Size64, rhs_lo, lhs_lo));
437                 ctx.emit(Inst::setcc(CC::NZ, cmp2));
438                 ctx.emit(Inst::alu_rmi_r(
439                     OperandSize::Size64,
440                     AluRmiROpcode::Or,
441                     RegMemImm::reg(cmp1.to_reg()),
442                     cmp2,
443                 ));
444                 ctx.emit(Inst::alu_rmi_r(
445                     OperandSize::Size64,
446                     AluRmiROpcode::And,
447                     RegMemImm::imm(1),
448                     cmp2,
449                 ));
450                 IntCC::NotEqual
451             }
452             IntCC::SignedLessThan
453             | IntCC::SignedLessThanOrEqual
454             | IntCC::SignedGreaterThan
455             | IntCC::SignedGreaterThanOrEqual
456             | IntCC::UnsignedLessThan
457             | IntCC::UnsignedLessThanOrEqual
458             | IntCC::UnsignedGreaterThan
459             | IntCC::UnsignedGreaterThanOrEqual => {
460                 // Result = (lhs_hi <> rhs_hi) ||
461                 //          (lhs_hi == rhs_hi && lhs_lo <> rhs_lo)
462                 let cmp3 = ctx.alloc_tmp(types::I64).only_reg().unwrap();
463                 ctx.emit(Inst::cmp_rmi_r(OperandSize::Size64, rhs_hi, lhs_hi));
464                 ctx.emit(Inst::setcc(CC::from_intcc(cc.without_equal()), cmp1));
465                 ctx.emit(Inst::setcc(CC::Z, cmp2));
466                 ctx.emit(Inst::cmp_rmi_r(OperandSize::Size64, rhs_lo, lhs_lo));
467                 ctx.emit(Inst::setcc(CC::from_intcc(cc.unsigned()), cmp3));
468                 ctx.emit(Inst::alu_rmi_r(
469                     OperandSize::Size64,
470                     AluRmiROpcode::And,
471                     RegMemImm::reg(cmp2.to_reg()),
472                     cmp3,
473                 ));
474                 ctx.emit(Inst::alu_rmi_r(
475                     OperandSize::Size64,
476                     AluRmiROpcode::Or,
477                     RegMemImm::reg(cmp1.to_reg()),
478                     cmp3,
479                 ));
480                 ctx.emit(Inst::alu_rmi_r(
481                     OperandSize::Size64,
482                     AluRmiROpcode::And,
483                     RegMemImm::imm(1),
484                     cmp3,
485                 ));
486                 IntCC::NotEqual
487             }
488             _ => panic!("Unhandled IntCC in I128 comparison: {:?}", cc),
489         }
490     } else {
491         // TODO Try to commute the operands (and invert the condition) if one is an immediate.
492         let lhs = put_input_in_reg(ctx, inputs[0]);
493         // We force the RHS into a register, and disallow load-op fusion, because we
494         // do not have a transitive guarantee that this cmp-site will be the sole
495         // user of the value. Consider: the icmp might be the only user of a load,
496         // but there may be multiple users of the icmp (e.g.  select or bint
497         // instructions) that each invoke `emit_cmp()`. If we were to allow a load
498         // to sink to the *latest* one, but other sites did not permit sinking, then
499         // we would be missing the load for other cmp-sites.
500         let rhs = put_input_in_reg(ctx, inputs[1]);
501 
502         // Cranelift's icmp semantics want to compare lhs - rhs, while Intel gives
503         // us dst - src at the machine instruction level, so invert operands.
504         ctx.emit(Inst::cmp_rmi_r(
505             OperandSize::from_ty(ty),
506             RegMemImm::reg(rhs),
507             lhs,
508         ));
509         cc
510     }
511 }
512 
513 /// A specification for a fcmp emission.
514 enum FcmpSpec {
515     /// Normal flow.
516     Normal,
517 
518     /// Avoid emitting Equal at all costs by inverting it to NotEqual, and indicate when that
519     /// happens with `InvertedEqualOrConditions`.
520     ///
521     /// This is useful in contexts where it is hard/inefficient to produce a single instruction (or
522     /// sequence of instructions) that check for an "AND" combination of condition codes; see for
523     /// instance lowering of Select.
524     InvertEqual,
525 }
526 
527 /// This explains how to interpret the results of an fcmp instruction.
528 enum FcmpCondResult {
529     /// The given condition code must be set.
530     Condition(CC),
531 
532     /// Both condition codes must be set.
533     AndConditions(CC, CC),
534 
535     /// Either of the conditions codes must be set.
536     OrConditions(CC, CC),
537 
538     /// The associated spec was set to `FcmpSpec::InvertEqual` and Equal has been inverted. Either
539     /// of the condition codes must be set, and the user must invert meaning of analyzing the
540     /// condition code results. When the spec is set to `FcmpSpec::Normal`, then this case can't be
541     /// reached.
542     InvertedEqualOrConditions(CC, CC),
543 }
544 
545 /// Emits a float comparison instruction.
546 ///
547 /// Note: make sure that there are no instructions modifying the flags between a call to this
548 /// function and the use of the flags!
emit_fcmp<C: LowerCtx<I = Inst>>( ctx: &mut C, insn: IRInst, mut cond_code: FloatCC, spec: FcmpSpec, ) -> FcmpCondResult549 fn emit_fcmp<C: LowerCtx<I = Inst>>(
550     ctx: &mut C,
551     insn: IRInst,
552     mut cond_code: FloatCC,
553     spec: FcmpSpec,
554 ) -> FcmpCondResult {
555     let (flip_operands, inverted_equal) = match cond_code {
556         FloatCC::LessThan
557         | FloatCC::LessThanOrEqual
558         | FloatCC::UnorderedOrGreaterThan
559         | FloatCC::UnorderedOrGreaterThanOrEqual => {
560             cond_code = cond_code.reverse();
561             (true, false)
562         }
563         FloatCC::Equal => {
564             let inverted_equal = match spec {
565                 FcmpSpec::Normal => false,
566                 FcmpSpec::InvertEqual => {
567                     cond_code = FloatCC::NotEqual; // same as .inverse()
568                     true
569                 }
570             };
571             (false, inverted_equal)
572         }
573         _ => (false, false),
574     };
575 
576     // The only valid CC constructed with `from_floatcc` can be put in the flag
577     // register with a direct float comparison; do this here.
578     let op = match ctx.input_ty(insn, 0) {
579         types::F32 => SseOpcode::Ucomiss,
580         types::F64 => SseOpcode::Ucomisd,
581         _ => panic!("Bad input type to Fcmp"),
582     };
583 
584     let inputs = &[InsnInput { insn, input: 0 }, InsnInput { insn, input: 1 }];
585     let (lhs_input, rhs_input) = if flip_operands {
586         (inputs[1], inputs[0])
587     } else {
588         (inputs[0], inputs[1])
589     };
590     let lhs = put_input_in_reg(ctx, lhs_input);
591     // See above in `emit_cmp()`. We must only use the reg/reg form of the
592     // comparison in order to avoid issues with merged loads.
593     let rhs = put_input_in_reg(ctx, rhs_input);
594     ctx.emit(Inst::xmm_cmp_rm_r(op, RegMem::reg(rhs), lhs));
595 
596     let cond_result = match cond_code {
597         FloatCC::Equal => FcmpCondResult::AndConditions(CC::NP, CC::Z),
598         FloatCC::NotEqual if inverted_equal => {
599             FcmpCondResult::InvertedEqualOrConditions(CC::P, CC::NZ)
600         }
601         FloatCC::NotEqual if !inverted_equal => FcmpCondResult::OrConditions(CC::P, CC::NZ),
602         _ => FcmpCondResult::Condition(CC::from_floatcc(cond_code)),
603     };
604 
605     cond_result
606 }
607 
emit_bitrev<C: LowerCtx<I = Inst>>(ctx: &mut C, src: Reg, dst: Writable<Reg>, ty: Type)608 fn emit_bitrev<C: LowerCtx<I = Inst>>(ctx: &mut C, src: Reg, dst: Writable<Reg>, ty: Type) {
609     let bits = ty.bits();
610     let const_mask = if bits == 64 {
611         0xffff_ffff_ffff_ffff
612     } else {
613         (1u64 << bits) - 1
614     };
615     let tmp0 = ctx.alloc_tmp(types::I64).only_reg().unwrap();
616     let tmp1 = ctx.alloc_tmp(types::I64).only_reg().unwrap();
617     let tmp2 = ctx.alloc_tmp(types::I64).only_reg().unwrap();
618 
619     ctx.emit(Inst::gen_move(tmp0, src, types::I64));
620 
621     // Swap 1-bit units.
622     // tmp1 = src
623     ctx.emit(Inst::gen_move(tmp1, tmp0.to_reg(), types::I64));
624     // tmp2 = 0b0101..
625     ctx.emit(Inst::imm(
626         OperandSize::Size64,
627         0x5555_5555_5555_5555 & const_mask,
628         tmp2,
629     ));
630     // tmp1 = src >> 1
631     ctx.emit(Inst::shift_r(
632         OperandSize::Size64,
633         ShiftKind::ShiftRightLogical,
634         Some(1),
635         tmp1,
636     ));
637     // tmp1 = (src >> 1) & 0b0101..
638     ctx.emit(Inst::alu_rmi_r(
639         OperandSize::Size64,
640         AluRmiROpcode::And,
641         RegMemImm::reg(tmp2.to_reg()),
642         tmp1,
643     ));
644     // tmp2 = src & 0b0101..
645     ctx.emit(Inst::alu_rmi_r(
646         OperandSize::Size64,
647         AluRmiROpcode::And,
648         RegMemImm::reg(tmp0.to_reg()),
649         tmp2,
650     ));
651     // tmp2 = (src & 0b0101..) << 1
652     ctx.emit(Inst::shift_r(
653         OperandSize::Size64,
654         ShiftKind::ShiftLeft,
655         Some(1),
656         tmp2,
657     ));
658     // tmp0 = (src >> 1) & 0b0101.. | (src & 0b0101..) << 1
659     ctx.emit(Inst::gen_move(tmp0, tmp2.to_reg(), types::I64));
660     ctx.emit(Inst::alu_rmi_r(
661         OperandSize::Size64,
662         AluRmiROpcode::Or,
663         RegMemImm::reg(tmp1.to_reg()),
664         tmp0,
665     ));
666 
667     // Swap 2-bit units.
668     ctx.emit(Inst::gen_move(tmp1, tmp0.to_reg(), types::I64));
669     ctx.emit(Inst::imm(
670         OperandSize::Size64,
671         0x3333_3333_3333_3333 & const_mask,
672         tmp2,
673     ));
674     ctx.emit(Inst::shift_r(
675         OperandSize::Size64,
676         ShiftKind::ShiftRightLogical,
677         Some(2),
678         tmp1,
679     ));
680     ctx.emit(Inst::alu_rmi_r(
681         OperandSize::Size64,
682         AluRmiROpcode::And,
683         RegMemImm::reg(tmp2.to_reg()),
684         tmp1,
685     ));
686     ctx.emit(Inst::alu_rmi_r(
687         OperandSize::Size64,
688         AluRmiROpcode::And,
689         RegMemImm::reg(tmp0.to_reg()),
690         tmp2,
691     ));
692     ctx.emit(Inst::shift_r(
693         OperandSize::Size64,
694         ShiftKind::ShiftLeft,
695         Some(2),
696         tmp2,
697     ));
698     ctx.emit(Inst::gen_move(tmp0, tmp2.to_reg(), types::I64));
699     ctx.emit(Inst::alu_rmi_r(
700         OperandSize::Size64,
701         AluRmiROpcode::Or,
702         RegMemImm::reg(tmp1.to_reg()),
703         tmp0,
704     ));
705 
706     // Swap 4-bit units.
707     ctx.emit(Inst::gen_move(tmp1, tmp0.to_reg(), types::I64));
708     ctx.emit(Inst::imm(
709         OperandSize::Size64,
710         0x0f0f_0f0f_0f0f_0f0f & const_mask,
711         tmp2,
712     ));
713     ctx.emit(Inst::shift_r(
714         OperandSize::Size64,
715         ShiftKind::ShiftRightLogical,
716         Some(4),
717         tmp1,
718     ));
719     ctx.emit(Inst::alu_rmi_r(
720         OperandSize::Size64,
721         AluRmiROpcode::And,
722         RegMemImm::reg(tmp2.to_reg()),
723         tmp1,
724     ));
725     ctx.emit(Inst::alu_rmi_r(
726         OperandSize::Size64,
727         AluRmiROpcode::And,
728         RegMemImm::reg(tmp0.to_reg()),
729         tmp2,
730     ));
731     ctx.emit(Inst::shift_r(
732         OperandSize::Size64,
733         ShiftKind::ShiftLeft,
734         Some(4),
735         tmp2,
736     ));
737     ctx.emit(Inst::gen_move(tmp0, tmp2.to_reg(), types::I64));
738     ctx.emit(Inst::alu_rmi_r(
739         OperandSize::Size64,
740         AluRmiROpcode::Or,
741         RegMemImm::reg(tmp1.to_reg()),
742         tmp0,
743     ));
744 
745     if bits > 8 {
746         // Swap 8-bit units.
747         ctx.emit(Inst::gen_move(tmp1, tmp0.to_reg(), types::I64));
748         ctx.emit(Inst::imm(
749             OperandSize::Size64,
750             0x00ff_00ff_00ff_00ff & const_mask,
751             tmp2,
752         ));
753         ctx.emit(Inst::shift_r(
754             OperandSize::Size64,
755             ShiftKind::ShiftRightLogical,
756             Some(8),
757             tmp1,
758         ));
759         ctx.emit(Inst::alu_rmi_r(
760             OperandSize::Size64,
761             AluRmiROpcode::And,
762             RegMemImm::reg(tmp2.to_reg()),
763             tmp1,
764         ));
765         ctx.emit(Inst::alu_rmi_r(
766             OperandSize::Size64,
767             AluRmiROpcode::And,
768             RegMemImm::reg(tmp0.to_reg()),
769             tmp2,
770         ));
771         ctx.emit(Inst::shift_r(
772             OperandSize::Size64,
773             ShiftKind::ShiftLeft,
774             Some(8),
775             tmp2,
776         ));
777         ctx.emit(Inst::gen_move(tmp0, tmp2.to_reg(), types::I64));
778         ctx.emit(Inst::alu_rmi_r(
779             OperandSize::Size64,
780             AluRmiROpcode::Or,
781             RegMemImm::reg(tmp1.to_reg()),
782             tmp0,
783         ));
784     }
785 
786     if bits > 16 {
787         // Swap 16-bit units.
788         ctx.emit(Inst::gen_move(tmp1, tmp0.to_reg(), types::I64));
789         ctx.emit(Inst::imm(
790             OperandSize::Size64,
791             0x0000_ffff_0000_ffff & const_mask,
792             tmp2,
793         ));
794         ctx.emit(Inst::shift_r(
795             OperandSize::Size64,
796             ShiftKind::ShiftRightLogical,
797             Some(16),
798             tmp1,
799         ));
800         ctx.emit(Inst::alu_rmi_r(
801             OperandSize::Size64,
802             AluRmiROpcode::And,
803             RegMemImm::reg(tmp2.to_reg()),
804             tmp1,
805         ));
806         ctx.emit(Inst::alu_rmi_r(
807             OperandSize::Size64,
808             AluRmiROpcode::And,
809             RegMemImm::reg(tmp0.to_reg()),
810             tmp2,
811         ));
812         ctx.emit(Inst::shift_r(
813             OperandSize::Size64,
814             ShiftKind::ShiftLeft,
815             Some(16),
816             tmp2,
817         ));
818         ctx.emit(Inst::gen_move(tmp0, tmp2.to_reg(), types::I64));
819         ctx.emit(Inst::alu_rmi_r(
820             OperandSize::Size64,
821             AluRmiROpcode::Or,
822             RegMemImm::reg(tmp1.to_reg()),
823             tmp0,
824         ));
825     }
826 
827     if bits > 32 {
828         // Swap 32-bit units.
829         ctx.emit(Inst::gen_move(tmp1, tmp0.to_reg(), types::I64));
830         ctx.emit(Inst::imm(
831             OperandSize::Size64,
832             0x0000_0000_ffff_ffff & const_mask,
833             tmp2,
834         ));
835         ctx.emit(Inst::shift_r(
836             OperandSize::Size64,
837             ShiftKind::ShiftRightLogical,
838             Some(32),
839             tmp1,
840         ));
841         ctx.emit(Inst::alu_rmi_r(
842             OperandSize::Size64,
843             AluRmiROpcode::And,
844             RegMemImm::reg(tmp2.to_reg()),
845             tmp1,
846         ));
847         ctx.emit(Inst::alu_rmi_r(
848             OperandSize::Size64,
849             AluRmiROpcode::And,
850             RegMemImm::reg(tmp0.to_reg()),
851             tmp2,
852         ));
853         ctx.emit(Inst::shift_r(
854             OperandSize::Size64,
855             ShiftKind::ShiftLeft,
856             Some(32),
857             tmp2,
858         ));
859         ctx.emit(Inst::gen_move(tmp0, tmp2.to_reg(), types::I64));
860         ctx.emit(Inst::alu_rmi_r(
861             OperandSize::Size64,
862             AluRmiROpcode::Or,
863             RegMemImm::reg(tmp1.to_reg()),
864             tmp0,
865         ));
866     }
867 
868     ctx.emit(Inst::gen_move(dst, tmp0.to_reg(), types::I64));
869 }
870 
emit_shl_i128<C: LowerCtx<I = Inst>>( ctx: &mut C, src: ValueRegs<Reg>, dst: ValueRegs<Writable<Reg>>, amt_src: Reg, )871 fn emit_shl_i128<C: LowerCtx<I = Inst>>(
872     ctx: &mut C,
873     src: ValueRegs<Reg>,
874     dst: ValueRegs<Writable<Reg>>,
875     amt_src: Reg,
876 ) {
877     let src_lo = src.regs()[0];
878     let src_hi = src.regs()[1];
879     let dst_lo = dst.regs()[0];
880     let dst_hi = dst.regs()[1];
881 
882     // mov tmp1, src_lo
883     // shl tmp1, amt_src
884     // mov tmp2, src_hi
885     // shl tmp2, amt_src
886     // mov amt, 64
887     // sub amt, amt_src
888     // mov tmp3, src_lo
889     // shr tmp3, amt
890     // xor dst_lo, dst_lo
891     // test amt_src, 127
892     // cmovz tmp3, dst_lo
893     // or tmp3, tmp2
894     // mov amt, amt_src
895     // and amt, 64
896     // cmovz dst_hi, tmp3
897     // cmovz dst_lo, tmp1
898     // cmovnz dst_hi, tmp1
899 
900     let tmp1 = ctx.alloc_tmp(types::I64).only_reg().unwrap();
901     let tmp2 = ctx.alloc_tmp(types::I64).only_reg().unwrap();
902     let tmp3 = ctx.alloc_tmp(types::I64).only_reg().unwrap();
903     let amt = ctx.alloc_tmp(types::I64).only_reg().unwrap();
904 
905     ctx.emit(Inst::gen_move(tmp1, src_lo, types::I64));
906     ctx.emit(Inst::gen_move(
907         Writable::from_reg(regs::rcx()),
908         amt_src,
909         types::I64,
910     ));
911     ctx.emit(Inst::shift_r(
912         OperandSize::Size64,
913         ShiftKind::ShiftLeft,
914         None,
915         tmp1,
916     ));
917 
918     ctx.emit(Inst::gen_move(tmp2, src_hi, types::I64));
919     ctx.emit(Inst::gen_move(
920         Writable::from_reg(regs::rcx()),
921         amt_src,
922         types::I64,
923     ));
924     ctx.emit(Inst::shift_r(
925         OperandSize::Size64,
926         ShiftKind::ShiftLeft,
927         None,
928         tmp2,
929     ));
930 
931     ctx.emit(Inst::imm(OperandSize::Size64, 64, amt));
932     ctx.emit(Inst::alu_rmi_r(
933         OperandSize::Size64,
934         AluRmiROpcode::Sub,
935         RegMemImm::reg(amt_src),
936         amt,
937     ));
938 
939     ctx.emit(Inst::gen_move(tmp3, src_lo, types::I64));
940     ctx.emit(Inst::gen_move(
941         Writable::from_reg(regs::rcx()),
942         amt.to_reg(),
943         types::I64,
944     ));
945     ctx.emit(Inst::shift_r(
946         OperandSize::Size64,
947         ShiftKind::ShiftRightLogical,
948         None,
949         tmp3,
950     ));
951     ctx.emit(Inst::alu_rmi_r(
952         OperandSize::Size64,
953         AluRmiROpcode::Xor,
954         RegMemImm::reg(dst_lo.to_reg()),
955         dst_lo,
956     ));
957 
958     ctx.emit(Inst::test_rmi_r(
959         OperandSize::Size64,
960         RegMemImm::imm(127),
961         amt_src,
962     ));
963     ctx.emit(Inst::cmove(
964         OperandSize::Size64,
965         CC::Z,
966         RegMem::reg(dst_lo.to_reg()),
967         tmp3,
968     ));
969 
970     ctx.emit(Inst::alu_rmi_r(
971         OperandSize::Size64,
972         AluRmiROpcode::Or,
973         RegMemImm::reg(tmp2.to_reg()),
974         tmp3,
975     ));
976 
977     // This isn't semantically necessary, but it keeps the
978     // register allocator happy, because it cannot otherwise
979     // infer that cmovz + cmovnz always defines dst_hi.
980     ctx.emit(Inst::alu_rmi_r(
981         OperandSize::Size64,
982         AluRmiROpcode::Xor,
983         RegMemImm::reg(dst_hi.to_reg()),
984         dst_hi,
985     ));
986 
987     ctx.emit(Inst::gen_move(amt, amt_src, types::I64));
988     ctx.emit(Inst::alu_rmi_r(
989         OperandSize::Size64,
990         AluRmiROpcode::And,
991         RegMemImm::imm(64),
992         amt,
993     ));
994     ctx.emit(Inst::cmove(
995         OperandSize::Size64,
996         CC::Z,
997         RegMem::reg(tmp3.to_reg()),
998         dst_hi,
999     ));
1000     ctx.emit(Inst::cmove(
1001         OperandSize::Size64,
1002         CC::Z,
1003         RegMem::reg(tmp1.to_reg()),
1004         dst_lo,
1005     ));
1006     ctx.emit(Inst::cmove(
1007         OperandSize::Size64,
1008         CC::NZ,
1009         RegMem::reg(tmp1.to_reg()),
1010         dst_hi,
1011     ));
1012 }
1013 
emit_shr_i128<C: LowerCtx<I = Inst>>( ctx: &mut C, src: ValueRegs<Reg>, dst: ValueRegs<Writable<Reg>>, amt_src: Reg, is_signed: bool, )1014 fn emit_shr_i128<C: LowerCtx<I = Inst>>(
1015     ctx: &mut C,
1016     src: ValueRegs<Reg>,
1017     dst: ValueRegs<Writable<Reg>>,
1018     amt_src: Reg,
1019     is_signed: bool,
1020 ) {
1021     let src_lo = src.regs()[0];
1022     let src_hi = src.regs()[1];
1023     let dst_lo = dst.regs()[0];
1024     let dst_hi = dst.regs()[1];
1025 
1026     // mov tmp1, src_hi
1027     // {u,s}shr tmp1, amt_src
1028     // mov tmp2, src_lo
1029     // ushr tmp2, amt_src
1030     // mov amt, 64
1031     // sub amt, amt_src
1032     // mov tmp3, src_hi
1033     // shl tmp3, amt
1034     // xor dst_lo, dst_lo
1035     // test amt_src, 127
1036     // cmovz tmp3, dst_lo
1037     // or tmp3, tmp2
1038     // if is_signed:
1039     //   mov dst_hi, src_hi
1040     //   sshr dst_hi, 63  // get the sign bit
1041     // else:
1042     //   xor dst_hi, dst_hi
1043     // mov amt, amt_src
1044     // and amt, 64
1045     // cmovz dst_hi, tmp1
1046     // cmovz dst_lo, tmp3
1047     // cmovnz dst_lo, tmp1
1048 
1049     let tmp1 = ctx.alloc_tmp(types::I64).only_reg().unwrap();
1050     let tmp2 = ctx.alloc_tmp(types::I64).only_reg().unwrap();
1051     let tmp3 = ctx.alloc_tmp(types::I64).only_reg().unwrap();
1052     let amt = ctx.alloc_tmp(types::I64).only_reg().unwrap();
1053 
1054     let shift_kind = if is_signed {
1055         ShiftKind::ShiftRightArithmetic
1056     } else {
1057         ShiftKind::ShiftRightLogical
1058     };
1059 
1060     ctx.emit(Inst::gen_move(tmp1, src_hi, types::I64));
1061     ctx.emit(Inst::gen_move(
1062         Writable::from_reg(regs::rcx()),
1063         amt_src,
1064         types::I64,
1065     ));
1066     ctx.emit(Inst::shift_r(OperandSize::Size64, shift_kind, None, tmp1));
1067 
1068     ctx.emit(Inst::gen_move(tmp2, src_lo, types::I64));
1069     ctx.emit(Inst::gen_move(
1070         Writable::from_reg(regs::rcx()),
1071         amt_src,
1072         types::I64,
1073     ));
1074     // N.B.: right-shift of *lower* half is *always* unsigned (its MSB is not a sign bit).
1075     ctx.emit(Inst::shift_r(
1076         OperandSize::Size64,
1077         ShiftKind::ShiftRightLogical,
1078         None,
1079         tmp2,
1080     ));
1081 
1082     ctx.emit(Inst::imm(OperandSize::Size64, 64, amt));
1083     ctx.emit(Inst::alu_rmi_r(
1084         OperandSize::Size64,
1085         AluRmiROpcode::Sub,
1086         RegMemImm::reg(amt_src),
1087         amt,
1088     ));
1089 
1090     ctx.emit(Inst::gen_move(tmp3, src_hi, types::I64));
1091     ctx.emit(Inst::gen_move(
1092         Writable::from_reg(regs::rcx()),
1093         amt.to_reg(),
1094         types::I64,
1095     ));
1096     ctx.emit(Inst::shift_r(
1097         OperandSize::Size64,
1098         ShiftKind::ShiftLeft,
1099         None,
1100         tmp3,
1101     ));
1102 
1103     ctx.emit(Inst::alu_rmi_r(
1104         OperandSize::Size64,
1105         AluRmiROpcode::Xor,
1106         RegMemImm::reg(dst_lo.to_reg()),
1107         dst_lo,
1108     ));
1109     ctx.emit(Inst::test_rmi_r(
1110         OperandSize::Size64,
1111         RegMemImm::imm(127),
1112         amt_src,
1113     ));
1114     ctx.emit(Inst::cmove(
1115         OperandSize::Size64,
1116         CC::Z,
1117         RegMem::reg(dst_lo.to_reg()),
1118         tmp3,
1119     ));
1120 
1121     ctx.emit(Inst::alu_rmi_r(
1122         OperandSize::Size64,
1123         AluRmiROpcode::Or,
1124         RegMemImm::reg(tmp2.to_reg()),
1125         tmp3,
1126     ));
1127 
1128     if is_signed {
1129         ctx.emit(Inst::gen_move(dst_hi, src_hi, types::I64));
1130         ctx.emit(Inst::shift_r(
1131             OperandSize::Size64,
1132             ShiftKind::ShiftRightArithmetic,
1133             Some(63),
1134             dst_hi,
1135         ));
1136     } else {
1137         ctx.emit(Inst::alu_rmi_r(
1138             OperandSize::Size64,
1139             AluRmiROpcode::Xor,
1140             RegMemImm::reg(dst_hi.to_reg()),
1141             dst_hi,
1142         ));
1143     }
1144     // This isn't semantically necessary, but it keeps the
1145     // register allocator happy, because it cannot otherwise
1146     // infer that cmovz + cmovnz always defines dst_lo.
1147     ctx.emit(Inst::alu_rmi_r(
1148         OperandSize::Size64,
1149         AluRmiROpcode::Xor,
1150         RegMemImm::reg(dst_lo.to_reg()),
1151         dst_lo,
1152     ));
1153 
1154     ctx.emit(Inst::gen_move(amt, amt_src, types::I64));
1155     ctx.emit(Inst::alu_rmi_r(
1156         OperandSize::Size64,
1157         AluRmiROpcode::And,
1158         RegMemImm::imm(64),
1159         amt,
1160     ));
1161     ctx.emit(Inst::cmove(
1162         OperandSize::Size64,
1163         CC::Z,
1164         RegMem::reg(tmp1.to_reg()),
1165         dst_hi,
1166     ));
1167     ctx.emit(Inst::cmove(
1168         OperandSize::Size64,
1169         CC::Z,
1170         RegMem::reg(tmp3.to_reg()),
1171         dst_lo,
1172     ));
1173     ctx.emit(Inst::cmove(
1174         OperandSize::Size64,
1175         CC::NZ,
1176         RegMem::reg(tmp1.to_reg()),
1177         dst_lo,
1178     ));
1179 }
1180 
make_libcall_sig<C: LowerCtx<I = Inst>>( ctx: &mut C, insn: IRInst, call_conv: CallConv, ptr_ty: Type, ) -> Signature1181 fn make_libcall_sig<C: LowerCtx<I = Inst>>(
1182     ctx: &mut C,
1183     insn: IRInst,
1184     call_conv: CallConv,
1185     ptr_ty: Type,
1186 ) -> Signature {
1187     let mut sig = Signature::new(call_conv);
1188     for i in 0..ctx.num_inputs(insn) {
1189         sig.params.push(AbiParam::new(ctx.input_ty(insn, i)));
1190     }
1191     for i in 0..ctx.num_outputs(insn) {
1192         sig.returns.push(AbiParam::new(ctx.output_ty(insn, i)));
1193     }
1194     if call_conv.extends_baldrdash() {
1195         // Adds the special VMContext parameter to the signature.
1196         sig.params
1197             .push(AbiParam::special(ptr_ty, ArgumentPurpose::VMContext));
1198     }
1199     sig
1200 }
1201 
emit_vm_call<C: LowerCtx<I = Inst>>( ctx: &mut C, flags: &Flags, triple: &Triple, libcall: LibCall, insn: IRInst, inputs: SmallVec<[InsnInput; 4]>, outputs: SmallVec<[InsnOutput; 2]>, ) -> CodegenResult<()>1202 fn emit_vm_call<C: LowerCtx<I = Inst>>(
1203     ctx: &mut C,
1204     flags: &Flags,
1205     triple: &Triple,
1206     libcall: LibCall,
1207     insn: IRInst,
1208     inputs: SmallVec<[InsnInput; 4]>,
1209     outputs: SmallVec<[InsnOutput; 2]>,
1210 ) -> CodegenResult<()> {
1211     let extname = ExternalName::LibCall(libcall);
1212 
1213     let dist = if flags.use_colocated_libcalls() {
1214         RelocDistance::Near
1215     } else {
1216         RelocDistance::Far
1217     };
1218 
1219     // TODO avoid recreating signatures for every single Libcall function.
1220     let call_conv = CallConv::for_libcall(flags, CallConv::triple_default(triple));
1221     let sig = make_libcall_sig(ctx, insn, call_conv, types::I64);
1222     let caller_conv = ctx.abi().call_conv();
1223 
1224     let mut abi = X64ABICaller::from_func(&sig, &extname, dist, caller_conv, flags)?;
1225 
1226     abi.emit_stack_pre_adjust(ctx);
1227 
1228     let vm_context = if call_conv.extends_baldrdash() { 1 } else { 0 };
1229     assert_eq!(inputs.len() + vm_context, abi.num_args());
1230 
1231     for (i, input) in inputs.iter().enumerate() {
1232         let arg_reg = put_input_in_reg(ctx, *input);
1233         abi.emit_copy_regs_to_arg(ctx, i, ValueRegs::one(arg_reg));
1234     }
1235     if call_conv.extends_baldrdash() {
1236         let vm_context_vreg = ctx
1237             .get_vm_context()
1238             .expect("should have a VMContext to pass to libcall funcs");
1239         abi.emit_copy_regs_to_arg(ctx, inputs.len(), ValueRegs::one(vm_context_vreg));
1240     }
1241 
1242     abi.emit_call(ctx);
1243     for (i, output) in outputs.iter().enumerate() {
1244         let retval_reg = get_output_reg(ctx, *output).only_reg().unwrap();
1245         abi.emit_copy_retval_to_regs(ctx, i, ValueRegs::one(retval_reg));
1246     }
1247     abi.emit_stack_post_adjust(ctx);
1248 
1249     Ok(())
1250 }
1251 
1252 /// Returns whether the given input is a shift by a constant value less or equal than 3.
1253 /// The goal is to embed it within an address mode.
matches_small_constant_shift<C: LowerCtx<I = Inst>>( ctx: &mut C, spec: InsnInput, ) -> Option<(InsnInput, u8)>1254 fn matches_small_constant_shift<C: LowerCtx<I = Inst>>(
1255     ctx: &mut C,
1256     spec: InsnInput,
1257 ) -> Option<(InsnInput, u8)> {
1258     matches_input(ctx, spec, Opcode::Ishl).and_then(|shift| {
1259         match input_to_imm(
1260             ctx,
1261             InsnInput {
1262                 insn: shift,
1263                 input: 1,
1264             },
1265         ) {
1266             Some(shift_amt) if shift_amt <= 3 => Some((
1267                 InsnInput {
1268                     insn: shift,
1269                     input: 0,
1270                 },
1271                 shift_amt as u8,
1272             )),
1273             _ => None,
1274         }
1275     })
1276 }
1277 
1278 /// Lowers an instruction to one of the x86 addressing modes.
1279 ///
1280 /// Note: the 32-bit offset in Cranelift has to be sign-extended, which maps x86's behavior.
lower_to_amode<C: LowerCtx<I = Inst>>(ctx: &mut C, spec: InsnInput, offset: i32) -> Amode1281 fn lower_to_amode<C: LowerCtx<I = Inst>>(ctx: &mut C, spec: InsnInput, offset: i32) -> Amode {
1282     let flags = ctx
1283         .memflags(spec.insn)
1284         .expect("Instruction with amode should have memflags");
1285 
1286     // We now either have an add that we must materialize, or some other input; as well as the
1287     // final offset.
1288     if let Some(add) = matches_input(ctx, spec, Opcode::Iadd) {
1289         debug_assert_eq!(ctx.output_ty(add, 0), types::I64);
1290         let add_inputs = &[
1291             InsnInput {
1292                 insn: add,
1293                 input: 0,
1294             },
1295             InsnInput {
1296                 insn: add,
1297                 input: 1,
1298             },
1299         ];
1300 
1301         // TODO heap_addr legalization generates a uext64 *after* the shift, so these optimizations
1302         // aren't happening in the wasm case. We could do better, given some range analysis.
1303         let (base, index, shift) = if let Some((shift_input, shift_amt)) =
1304             matches_small_constant_shift(ctx, add_inputs[0])
1305         {
1306             (
1307                 put_input_in_reg(ctx, add_inputs[1]),
1308                 put_input_in_reg(ctx, shift_input),
1309                 shift_amt,
1310             )
1311         } else if let Some((shift_input, shift_amt)) =
1312             matches_small_constant_shift(ctx, add_inputs[1])
1313         {
1314             (
1315                 put_input_in_reg(ctx, add_inputs[0]),
1316                 put_input_in_reg(ctx, shift_input),
1317                 shift_amt,
1318             )
1319         } else {
1320             for i in 0..=1 {
1321                 // Try to pierce through uextend.
1322                 if let Some(uextend) = matches_input(
1323                     ctx,
1324                     InsnInput {
1325                         insn: add,
1326                         input: i,
1327                     },
1328                     Opcode::Uextend,
1329                 ) {
1330                     if let Some(cst) = ctx.get_input_as_source_or_const(uextend, 0).constant {
1331                         // Zero the upper bits.
1332                         let input_size = ctx.input_ty(uextend, 0).bits() as u64;
1333                         let shift: u64 = 64 - input_size;
1334                         let uext_cst: u64 = (cst << shift) >> shift;
1335 
1336                         let final_offset = (offset as i64).wrapping_add(uext_cst as i64);
1337                         if low32_will_sign_extend_to_64(final_offset as u64) {
1338                             let base = put_input_in_reg(ctx, add_inputs[1 - i]);
1339                             return Amode::imm_reg(final_offset as u32, base).with_flags(flags);
1340                         }
1341                     }
1342                 }
1343 
1344                 // If it's a constant, add it directly!
1345                 if let Some(cst) = ctx.get_input_as_source_or_const(add, i).constant {
1346                     let final_offset = (offset as i64).wrapping_add(cst as i64);
1347                     if low32_will_sign_extend_to_64(final_offset as u64) {
1348                         let base = put_input_in_reg(ctx, add_inputs[1 - i]);
1349                         return Amode::imm_reg(final_offset as u32, base).with_flags(flags);
1350                     }
1351                 }
1352             }
1353 
1354             (
1355                 put_input_in_reg(ctx, add_inputs[0]),
1356                 put_input_in_reg(ctx, add_inputs[1]),
1357                 0,
1358             )
1359         };
1360 
1361         return Amode::imm_reg_reg_shift(offset as u32, base, index, shift).with_flags(flags);
1362     }
1363 
1364     let input = put_input_in_reg(ctx, spec);
1365     Amode::imm_reg(offset as u32, input).with_flags(flags)
1366 }
1367 
emit_moves<C: LowerCtx<I = Inst>>( ctx: &mut C, dst: ValueRegs<Writable<Reg>>, src: ValueRegs<Reg>, ty: Type, )1368 fn emit_moves<C: LowerCtx<I = Inst>>(
1369     ctx: &mut C,
1370     dst: ValueRegs<Writable<Reg>>,
1371     src: ValueRegs<Reg>,
1372     ty: Type,
1373 ) {
1374     let (_, tys) = Inst::rc_for_type(ty).unwrap();
1375     for ((dst, src), ty) in dst.regs().iter().zip(src.regs().iter()).zip(tys.iter()) {
1376         ctx.emit(Inst::gen_move(*dst, *src, *ty));
1377     }
1378 }
1379 
emit_cmoves<C: LowerCtx<I = Inst>>( ctx: &mut C, size: u8, cc: CC, src: ValueRegs<Reg>, dst: ValueRegs<Writable<Reg>>, )1380 fn emit_cmoves<C: LowerCtx<I = Inst>>(
1381     ctx: &mut C,
1382     size: u8,
1383     cc: CC,
1384     src: ValueRegs<Reg>,
1385     dst: ValueRegs<Writable<Reg>>,
1386 ) {
1387     let size = size / src.len() as u8;
1388     let size = u8::max(size, 4); // at least 32 bits
1389     for (dst, src) in dst.regs().iter().zip(src.regs().iter()) {
1390         ctx.emit(Inst::cmove(
1391             OperandSize::from_bytes(size.into()),
1392             cc,
1393             RegMem::reg(*src),
1394             *dst,
1395         ));
1396     }
1397 }
1398 
emit_clz<C: LowerCtx<I = Inst>>( ctx: &mut C, orig_ty: Type, ty: Type, src: Reg, dst: Writable<Reg>, )1399 fn emit_clz<C: LowerCtx<I = Inst>>(
1400     ctx: &mut C,
1401     orig_ty: Type,
1402     ty: Type,
1403     src: Reg,
1404     dst: Writable<Reg>,
1405 ) {
1406     let src = RegMem::reg(src);
1407     let tmp = ctx.alloc_tmp(ty).only_reg().unwrap();
1408     ctx.emit(Inst::imm(OperandSize::from_ty(ty), u64::max_value(), dst));
1409 
1410     ctx.emit(Inst::unary_rm_r(
1411         OperandSize::from_ty(ty),
1412         UnaryRmROpcode::Bsr,
1413         src,
1414         tmp,
1415     ));
1416 
1417     ctx.emit(Inst::cmove(
1418         OperandSize::from_ty(ty),
1419         CC::Z,
1420         RegMem::reg(dst.to_reg()),
1421         tmp,
1422     ));
1423 
1424     ctx.emit(Inst::imm(
1425         OperandSize::from_ty(ty),
1426         orig_ty.bits() as u64 - 1,
1427         dst,
1428     ));
1429 
1430     ctx.emit(Inst::alu_rmi_r(
1431         if ty == types::I64 {
1432             OperandSize::Size64
1433         } else {
1434             OperandSize::Size32
1435         },
1436         AluRmiROpcode::Sub,
1437         RegMemImm::reg(tmp.to_reg()),
1438         dst,
1439     ));
1440 }
1441 
emit_ctz<C: LowerCtx<I = Inst>>( ctx: &mut C, orig_ty: Type, ty: Type, src: Reg, dst: Writable<Reg>, )1442 fn emit_ctz<C: LowerCtx<I = Inst>>(
1443     ctx: &mut C,
1444     orig_ty: Type,
1445     ty: Type,
1446     src: Reg,
1447     dst: Writable<Reg>,
1448 ) {
1449     let src = RegMem::reg(src);
1450     let tmp = ctx.alloc_tmp(ty).only_reg().unwrap();
1451     ctx.emit(Inst::imm(OperandSize::Size32, orig_ty.bits() as u64, tmp));
1452 
1453     ctx.emit(Inst::unary_rm_r(
1454         OperandSize::from_ty(ty),
1455         UnaryRmROpcode::Bsf,
1456         src,
1457         dst,
1458     ));
1459 
1460     ctx.emit(Inst::cmove(
1461         OperandSize::from_ty(ty),
1462         CC::Z,
1463         RegMem::reg(tmp.to_reg()),
1464         dst,
1465     ));
1466 }
1467 
1468 //=============================================================================
1469 // Top-level instruction lowering entry point, for one instruction.
1470 
1471 /// Actually codegen an instruction's results into registers.
lower_insn_to_regs<C: LowerCtx<I = Inst>>( ctx: &mut C, insn: IRInst, flags: &Flags, isa_flags: &x64_settings::Flags, triple: &Triple, ) -> CodegenResult<()>1472 fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
1473     ctx: &mut C,
1474     insn: IRInst,
1475     flags: &Flags,
1476     isa_flags: &x64_settings::Flags,
1477     triple: &Triple,
1478 ) -> CodegenResult<()> {
1479     let op = ctx.data(insn).opcode();
1480 
1481     let inputs: SmallVec<[InsnInput; 4]> = (0..ctx.num_inputs(insn))
1482         .map(|i| InsnInput { insn, input: i })
1483         .collect();
1484     let outputs: SmallVec<[InsnOutput; 2]> = (0..ctx.num_outputs(insn))
1485         .map(|i| InsnOutput { insn, output: i })
1486         .collect();
1487 
1488     let ty = if outputs.len() > 0 {
1489         Some(ctx.output_ty(insn, 0))
1490     } else {
1491         None
1492     };
1493 
1494     match op {
1495         Opcode::Iconst | Opcode::Bconst | Opcode::Null => {
1496             let value = ctx
1497                 .get_constant(insn)
1498                 .expect("constant value for iconst et al");
1499             let dst = get_output_reg(ctx, outputs[0]);
1500             for inst in Inst::gen_constant(dst, value as u128, ty.unwrap(), |ty| {
1501                 ctx.alloc_tmp(ty).only_reg().unwrap()
1502             }) {
1503                 ctx.emit(inst);
1504             }
1505         }
1506 
1507         Opcode::Iadd
1508         | Opcode::IaddIfcout
1509         | Opcode::SaddSat
1510         | Opcode::UaddSat
1511         | Opcode::Isub
1512         | Opcode::SsubSat
1513         | Opcode::UsubSat
1514         | Opcode::AvgRound
1515         | Opcode::Band
1516         | Opcode::Bor
1517         | Opcode::Bxor => {
1518             let ty = ty.unwrap();
1519             if ty.lane_count() > 1 {
1520                 let sse_op = match op {
1521                     Opcode::Iadd => match ty {
1522                         types::I8X16 => SseOpcode::Paddb,
1523                         types::I16X8 => SseOpcode::Paddw,
1524                         types::I32X4 => SseOpcode::Paddd,
1525                         types::I64X2 => SseOpcode::Paddq,
1526                         _ => panic!("Unsupported type for packed iadd instruction: {}", ty),
1527                     },
1528                     Opcode::SaddSat => match ty {
1529                         types::I8X16 => SseOpcode::Paddsb,
1530                         types::I16X8 => SseOpcode::Paddsw,
1531                         _ => panic!("Unsupported type for packed sadd_sat instruction: {}", ty),
1532                     },
1533                     Opcode::UaddSat => match ty {
1534                         types::I8X16 => SseOpcode::Paddusb,
1535                         types::I16X8 => SseOpcode::Paddusw,
1536                         _ => panic!("Unsupported type for packed uadd_sat instruction: {}", ty),
1537                     },
1538                     Opcode::Isub => match ty {
1539                         types::I8X16 => SseOpcode::Psubb,
1540                         types::I16X8 => SseOpcode::Psubw,
1541                         types::I32X4 => SseOpcode::Psubd,
1542                         types::I64X2 => SseOpcode::Psubq,
1543                         _ => panic!("Unsupported type for packed isub instruction: {}", ty),
1544                     },
1545                     Opcode::SsubSat => match ty {
1546                         types::I8X16 => SseOpcode::Psubsb,
1547                         types::I16X8 => SseOpcode::Psubsw,
1548                         _ => panic!("Unsupported type for packed ssub_sat instruction: {}", ty),
1549                     },
1550                     Opcode::UsubSat => match ty {
1551                         types::I8X16 => SseOpcode::Psubusb,
1552                         types::I16X8 => SseOpcode::Psubusw,
1553                         _ => panic!("Unsupported type for packed usub_sat instruction: {}", ty),
1554                     },
1555                     Opcode::AvgRound => match ty {
1556                         types::I8X16 => SseOpcode::Pavgb,
1557                         types::I16X8 => SseOpcode::Pavgw,
1558                         _ => panic!("Unsupported type for packed avg_round instruction: {}", ty),
1559                     },
1560                     Opcode::Band => match ty {
1561                         types::F32X4 => SseOpcode::Andps,
1562                         types::F64X2 => SseOpcode::Andpd,
1563                         _ => SseOpcode::Pand,
1564                     },
1565                     Opcode::Bor => match ty {
1566                         types::F32X4 => SseOpcode::Orps,
1567                         types::F64X2 => SseOpcode::Orpd,
1568                         _ => SseOpcode::Por,
1569                     },
1570                     Opcode::Bxor => match ty {
1571                         types::F32X4 => SseOpcode::Xorps,
1572                         types::F64X2 => SseOpcode::Xorpd,
1573                         _ => SseOpcode::Pxor,
1574                     },
1575                     _ => panic!("Unsupported packed instruction: {}", op),
1576                 };
1577                 let lhs = put_input_in_reg(ctx, inputs[0]);
1578                 let rhs = input_to_reg_mem(ctx, inputs[1]);
1579                 let dst = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
1580 
1581                 // Move the `lhs` to the same register as `dst`.
1582                 ctx.emit(Inst::gen_move(dst, lhs, ty));
1583                 ctx.emit(Inst::xmm_rm_r(sse_op, rhs, dst));
1584             } else if ty == types::I128 || ty == types::B128 {
1585                 let alu_ops = match op {
1586                     Opcode::Iadd => (AluRmiROpcode::Add, AluRmiROpcode::Adc),
1587                     Opcode::Isub => (AluRmiROpcode::Sub, AluRmiROpcode::Sbb),
1588                     Opcode::Band => (AluRmiROpcode::And, AluRmiROpcode::And),
1589                     Opcode::Bor => (AluRmiROpcode::Or, AluRmiROpcode::Or),
1590                     Opcode::Bxor => (AluRmiROpcode::Xor, AluRmiROpcode::Xor),
1591                     _ => panic!("Unsupported opcode with 128-bit integers: {:?}", op),
1592                 };
1593                 let lhs = put_input_in_regs(ctx, inputs[0]);
1594                 let rhs = put_input_in_regs(ctx, inputs[1]);
1595                 let dst = get_output_reg(ctx, outputs[0]);
1596                 assert_eq!(lhs.len(), 2);
1597                 assert_eq!(rhs.len(), 2);
1598                 assert_eq!(dst.len(), 2);
1599 
1600                 // For add, sub, and, or, xor: just do ops on lower then upper
1601                 // half. Carry-flag propagation is implicit (add/adc, sub/sbb).
1602                 ctx.emit(Inst::gen_move(dst.regs()[0], lhs.regs()[0], types::I64));
1603                 ctx.emit(Inst::gen_move(dst.regs()[1], lhs.regs()[1], types::I64));
1604                 ctx.emit(Inst::alu_rmi_r(
1605                     OperandSize::Size64,
1606                     alu_ops.0,
1607                     RegMemImm::reg(rhs.regs()[0]),
1608                     dst.regs()[0],
1609                 ));
1610                 ctx.emit(Inst::alu_rmi_r(
1611                     OperandSize::Size64,
1612                     alu_ops.1,
1613                     RegMemImm::reg(rhs.regs()[1]),
1614                     dst.regs()[1],
1615                 ));
1616             } else {
1617                 let size = if ty == types::I64 {
1618                     OperandSize::Size64
1619                 } else {
1620                     OperandSize::Size32
1621                 };
1622                 let alu_op = match op {
1623                     Opcode::Iadd | Opcode::IaddIfcout => AluRmiROpcode::Add,
1624                     Opcode::Isub => AluRmiROpcode::Sub,
1625                     Opcode::Band => AluRmiROpcode::And,
1626                     Opcode::Bor => AluRmiROpcode::Or,
1627                     Opcode::Bxor => AluRmiROpcode::Xor,
1628                     _ => unreachable!(),
1629                 };
1630 
1631                 let (lhs, rhs) = match op {
1632                     Opcode::Iadd
1633                     | Opcode::IaddIfcout
1634                     | Opcode::Band
1635                     | Opcode::Bor
1636                     | Opcode::Bxor => {
1637                         // For commutative operations, try to commute operands if one is an
1638                         // immediate or direct memory reference. Do so by converting LHS to RMI; if
1639                         // reg, then always convert RHS to RMI; else, use LHS as RMI and convert
1640                         // RHS to reg.
1641                         let lhs = input_to_reg_mem_imm(ctx, inputs[0]);
1642                         if let RegMemImm::Reg { reg: lhs_reg } = lhs {
1643                             let rhs = input_to_reg_mem_imm(ctx, inputs[1]);
1644                             (lhs_reg, rhs)
1645                         } else {
1646                             let rhs_reg = put_input_in_reg(ctx, inputs[1]);
1647                             (rhs_reg, lhs)
1648                         }
1649                     }
1650                     Opcode::Isub => (
1651                         put_input_in_reg(ctx, inputs[0]),
1652                         input_to_reg_mem_imm(ctx, inputs[1]),
1653                     ),
1654                     _ => unreachable!(),
1655                 };
1656 
1657                 let dst = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
1658                 ctx.emit(Inst::mov_r_r(OperandSize::Size64, lhs, dst));
1659                 ctx.emit(Inst::alu_rmi_r(size, alu_op, rhs, dst));
1660             }
1661         }
1662 
1663         Opcode::Imul => {
1664             let ty = ty.unwrap();
1665             if ty == types::I64X2 {
1666                 // Eventually one of these should be `input_to_reg_mem` (TODO).
1667                 let lhs = put_input_in_reg(ctx, inputs[0]);
1668                 let rhs = put_input_in_reg(ctx, inputs[1]);
1669                 let dst = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
1670 
1671                 if isa_flags.use_avx512f_simd() || isa_flags.use_avx512vl_simd() {
1672                     // With the right AVX512 features (VL, DQ) this operation
1673                     // can lower to a single operation.
1674                     ctx.emit(Inst::xmm_rm_r_evex(
1675                         Avx512Opcode::Vpmullq,
1676                         RegMem::reg(rhs),
1677                         lhs,
1678                         dst,
1679                     ));
1680                 } else {
1681                     // Otherwise, for I64X2 multiplication we describe a lane A as being
1682                     // composed of a 32-bit upper half "Ah" and a 32-bit lower half
1683                     // "Al". The 32-bit long hand multiplication can then be written
1684                     // as:
1685                     //    Ah Al
1686                     // *  Bh Bl
1687                     //    -----
1688                     //    Al * Bl
1689                     // + (Ah * Bl) << 32
1690                     // + (Al * Bh) << 32
1691                     //
1692                     // So for each lane we will compute:
1693                     //   A * B  = (Al * Bl) + ((Ah * Bl) + (Al * Bh)) << 32
1694                     //
1695                     // Note, the algorithm will use pmuldq which operates directly
1696                     // on the lower 32-bit (Al or Bl) of a lane and writes the
1697                     // result to the full 64-bits of the lane of the destination.
1698                     // For this reason we don't need shifts to isolate the lower
1699                     // 32-bits, however, we will need to use shifts to isolate the
1700                     // high 32-bits when doing calculations, i.e., Ah == A >> 32.
1701                     //
1702                     // The full sequence then is as follows:
1703                     //   A' = A
1704                     //   A' = A' >> 32
1705                     //   A' = Ah' * Bl
1706                     //   B' = B
1707                     //   B' = B' >> 32
1708                     //   B' = Bh' * Al
1709                     //   B' = B' + A'
1710                     //   B' = B' << 32
1711                     //   A' = A
1712                     //   A' = Al' * Bl
1713                     //   A' = A' + B'
1714                     //   dst = A'
1715 
1716                     // A' = A
1717                     let rhs_1 = ctx.alloc_tmp(types::I64X2).only_reg().unwrap();
1718                     ctx.emit(Inst::gen_move(rhs_1, rhs, ty));
1719 
1720                     // A' = A' >> 32
1721                     // A' = Ah' * Bl
1722                     ctx.emit(Inst::xmm_rmi_reg(
1723                         SseOpcode::Psrlq,
1724                         RegMemImm::imm(32),
1725                         rhs_1,
1726                     ));
1727                     ctx.emit(Inst::xmm_rm_r(
1728                         SseOpcode::Pmuludq,
1729                         RegMem::reg(lhs.clone()),
1730                         rhs_1,
1731                     ));
1732 
1733                     // B' = B
1734                     let lhs_1 = ctx.alloc_tmp(types::I64X2).only_reg().unwrap();
1735                     ctx.emit(Inst::gen_move(lhs_1, lhs, ty));
1736 
1737                     // B' = B' >> 32
1738                     // B' = Bh' * Al
1739                     ctx.emit(Inst::xmm_rmi_reg(
1740                         SseOpcode::Psrlq,
1741                         RegMemImm::imm(32),
1742                         lhs_1,
1743                     ));
1744                     ctx.emit(Inst::xmm_rm_r(SseOpcode::Pmuludq, RegMem::reg(rhs), lhs_1));
1745 
1746                     // B' = B' + A'
1747                     // B' = B' << 32
1748                     ctx.emit(Inst::xmm_rm_r(
1749                         SseOpcode::Paddq,
1750                         RegMem::reg(rhs_1.to_reg()),
1751                         lhs_1,
1752                     ));
1753                     ctx.emit(Inst::xmm_rmi_reg(
1754                         SseOpcode::Psllq,
1755                         RegMemImm::imm(32),
1756                         lhs_1,
1757                     ));
1758 
1759                     // A' = A
1760                     // A' = Al' * Bl
1761                     // A' = A' + B'
1762                     // dst = A'
1763                     ctx.emit(Inst::gen_move(rhs_1, rhs, ty));
1764                     ctx.emit(Inst::xmm_rm_r(
1765                         SseOpcode::Pmuludq,
1766                         RegMem::reg(lhs.clone()),
1767                         rhs_1,
1768                     ));
1769                     ctx.emit(Inst::xmm_rm_r(
1770                         SseOpcode::Paddq,
1771                         RegMem::reg(lhs_1.to_reg()),
1772                         rhs_1,
1773                     ));
1774                     ctx.emit(Inst::gen_move(dst, rhs_1.to_reg(), ty));
1775                 }
1776             } else if ty.lane_count() > 1 {
1777                 // Emit single instruction lowerings for the remaining vector
1778                 // multiplications.
1779                 let sse_op = match ty {
1780                     types::I16X8 => SseOpcode::Pmullw,
1781                     types::I32X4 => SseOpcode::Pmulld,
1782                     _ => panic!("Unsupported type for packed imul instruction: {}", ty),
1783                 };
1784                 let lhs = put_input_in_reg(ctx, inputs[0]);
1785                 let rhs = input_to_reg_mem(ctx, inputs[1]);
1786                 let dst = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
1787 
1788                 // Move the `lhs` to the same register as `dst`.
1789                 ctx.emit(Inst::gen_move(dst, lhs, ty));
1790                 ctx.emit(Inst::xmm_rm_r(sse_op, rhs, dst));
1791             } else if ty == types::I128 || ty == types::B128 {
1792                 // Handle 128-bit multiplications.
1793                 let lhs = put_input_in_regs(ctx, inputs[0]);
1794                 let rhs = put_input_in_regs(ctx, inputs[1]);
1795                 let dst = get_output_reg(ctx, outputs[0]);
1796                 assert_eq!(lhs.len(), 2);
1797                 assert_eq!(rhs.len(), 2);
1798                 assert_eq!(dst.len(), 2);
1799 
1800                 // mul:
1801                 //   dst_lo = lhs_lo * rhs_lo
1802                 //   dst_hi = umulhi(lhs_lo, rhs_lo) + lhs_lo * rhs_hi + lhs_hi * rhs_lo
1803                 //
1804                 // so we emit:
1805                 //   mov dst_lo, lhs_lo
1806                 //   mul dst_lo, rhs_lo
1807                 //   mov dst_hi, lhs_lo
1808                 //   mul dst_hi, rhs_hi
1809                 //   mov tmp, lhs_hi
1810                 //   mul tmp, rhs_lo
1811                 //   add dst_hi, tmp
1812                 //   mov rax, lhs_lo
1813                 //   umulhi rhs_lo  // implicit rax arg/dst
1814                 //   add dst_hi, rax
1815                 let tmp = ctx.alloc_tmp(types::I64).only_reg().unwrap();
1816                 ctx.emit(Inst::gen_move(dst.regs()[0], lhs.regs()[0], types::I64));
1817                 ctx.emit(Inst::alu_rmi_r(
1818                     OperandSize::Size64,
1819                     AluRmiROpcode::Mul,
1820                     RegMemImm::reg(rhs.regs()[0]),
1821                     dst.regs()[0],
1822                 ));
1823                 ctx.emit(Inst::gen_move(dst.regs()[1], lhs.regs()[0], types::I64));
1824                 ctx.emit(Inst::alu_rmi_r(
1825                     OperandSize::Size64,
1826                     AluRmiROpcode::Mul,
1827                     RegMemImm::reg(rhs.regs()[1]),
1828                     dst.regs()[1],
1829                 ));
1830                 ctx.emit(Inst::gen_move(tmp, lhs.regs()[1], types::I64));
1831                 ctx.emit(Inst::alu_rmi_r(
1832                     OperandSize::Size64,
1833                     AluRmiROpcode::Mul,
1834                     RegMemImm::reg(rhs.regs()[0]),
1835                     tmp,
1836                 ));
1837                 ctx.emit(Inst::alu_rmi_r(
1838                     OperandSize::Size64,
1839                     AluRmiROpcode::Add,
1840                     RegMemImm::reg(tmp.to_reg()),
1841                     dst.regs()[1],
1842                 ));
1843                 ctx.emit(Inst::gen_move(
1844                     Writable::from_reg(regs::rax()),
1845                     lhs.regs()[0],
1846                     types::I64,
1847                 ));
1848                 ctx.emit(Inst::mul_hi(
1849                     OperandSize::Size64,
1850                     /* signed = */ false,
1851                     RegMem::reg(rhs.regs()[0]),
1852                 ));
1853                 ctx.emit(Inst::alu_rmi_r(
1854                     OperandSize::Size64,
1855                     AluRmiROpcode::Add,
1856                     RegMemImm::reg(regs::rdx()),
1857                     dst.regs()[1],
1858                 ));
1859             } else {
1860                 let size = if ty == types::I64 {
1861                     OperandSize::Size64
1862                 } else {
1863                     OperandSize::Size32
1864                 };
1865                 let alu_op = AluRmiROpcode::Mul;
1866 
1867                 // For commutative operations, try to commute operands if one is
1868                 // an immediate or direct memory reference. Do so by converting
1869                 // LHS to RMI; if reg, then always convert RHS to RMI; else, use
1870                 // LHS as RMI and convert RHS to reg.
1871                 let lhs = input_to_reg_mem_imm(ctx, inputs[0]);
1872                 let (lhs, rhs) = if let RegMemImm::Reg { reg: lhs_reg } = lhs {
1873                     let rhs = input_to_reg_mem_imm(ctx, inputs[1]);
1874                     (lhs_reg, rhs)
1875                 } else {
1876                     let rhs_reg = put_input_in_reg(ctx, inputs[1]);
1877                     (rhs_reg, lhs)
1878                 };
1879 
1880                 let dst = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
1881                 ctx.emit(Inst::mov_r_r(OperandSize::Size64, lhs, dst));
1882                 ctx.emit(Inst::alu_rmi_r(size, alu_op, rhs, dst));
1883             }
1884         }
1885 
1886         Opcode::BandNot => {
1887             let ty = ty.unwrap();
1888             debug_assert!(ty.is_vector() && ty.bytes() == 16);
1889             let lhs = input_to_reg_mem(ctx, inputs[0]);
1890             let rhs = put_input_in_reg(ctx, inputs[1]);
1891             let dst = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
1892             let sse_op = match ty {
1893                 types::F32X4 => SseOpcode::Andnps,
1894                 types::F64X2 => SseOpcode::Andnpd,
1895                 _ => SseOpcode::Pandn,
1896             };
1897             // Note the flipping of operands: the `rhs` operand is used as the destination instead
1898             // of the `lhs` as in the other bit operations above (e.g. `band`).
1899             ctx.emit(Inst::gen_move(dst, rhs, ty));
1900             ctx.emit(Inst::xmm_rm_r(sse_op, lhs, dst));
1901         }
1902 
1903         Opcode::Iabs => {
1904             let src = input_to_reg_mem(ctx, inputs[0]);
1905             let dst = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
1906             let ty = ty.unwrap();
1907             if ty == types::I64X2 {
1908                 if isa_flags.use_avx512f_simd() || isa_flags.use_avx512vl_simd() {
1909                     ctx.emit(Inst::xmm_unary_rm_r_evex(Avx512Opcode::Vpabsq, src, dst));
1910                 } else {
1911                     // If `VPABSQ` from AVX512 is unavailable, we use a separate register, `tmp`, to
1912                     // contain the results of `0 - src` and then blend in those results with
1913                     // `BLENDVPD` if the MSB of `tmp` was set to 1 (i.e. if `tmp` was negative or,
1914                     // conversely, if `src` was originally positive).
1915 
1916                     // Emit all 0s into the `tmp` register.
1917                     let tmp = ctx.alloc_tmp(ty).only_reg().unwrap();
1918                     ctx.emit(Inst::xmm_rm_r(SseOpcode::Pxor, RegMem::from(tmp), tmp));
1919                     // Subtract the lanes from 0 and set up `dst`.
1920                     ctx.emit(Inst::xmm_rm_r(SseOpcode::Psubq, src.clone(), tmp));
1921                     ctx.emit(Inst::gen_move(dst, tmp.to_reg(), ty));
1922                     // Choose the subtracted lanes when `tmp` has an MSB of 1. BLENDVPD's semantics
1923                     // require the "choice" mask to be in XMM0.
1924                     ctx.emit(Inst::gen_move(
1925                         Writable::from_reg(regs::xmm0()),
1926                         tmp.to_reg(),
1927                         ty,
1928                     ));
1929                     ctx.emit(Inst::xmm_rm_r(SseOpcode::Blendvpd, src, dst));
1930                 }
1931             } else if ty.is_vector() {
1932                 let opcode = match ty {
1933                     types::I8X16 => SseOpcode::Pabsb,
1934                     types::I16X8 => SseOpcode::Pabsw,
1935                     types::I32X4 => SseOpcode::Pabsd,
1936                     _ => panic!("Unsupported type for packed iabs instruction: {}", ty),
1937                 };
1938                 ctx.emit(Inst::xmm_unary_rm_r(opcode, src, dst));
1939             } else {
1940                 unimplemented!("iabs is unimplemented for non-vector type: {}", ty);
1941             }
1942         }
1943 
1944         Opcode::Imax | Opcode::Umax | Opcode::Imin | Opcode::Umin => {
1945             let lhs = put_input_in_reg(ctx, inputs[0]);
1946             let rhs = input_to_reg_mem(ctx, inputs[1]);
1947             let dst = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
1948             let ty = ty.unwrap();
1949             if ty.is_vector() {
1950                 let sse_op = match op {
1951                     Opcode::Imax => match ty {
1952                         types::I8X16 => SseOpcode::Pmaxsb,
1953                         types::I16X8 => SseOpcode::Pmaxsw,
1954                         types::I32X4 => SseOpcode::Pmaxsd,
1955                         _ => panic!("Unsupported type for packed {} instruction: {}", op, ty),
1956                     },
1957                     Opcode::Umax => match ty {
1958                         types::I8X16 => SseOpcode::Pmaxub,
1959                         types::I16X8 => SseOpcode::Pmaxuw,
1960                         types::I32X4 => SseOpcode::Pmaxud,
1961                         _ => panic!("Unsupported type for packed {} instruction: {}", op, ty),
1962                     },
1963                     Opcode::Imin => match ty {
1964                         types::I8X16 => SseOpcode::Pminsb,
1965                         types::I16X8 => SseOpcode::Pminsw,
1966                         types::I32X4 => SseOpcode::Pminsd,
1967                         _ => panic!("Unsupported type for packed {} instruction: {}", op, ty),
1968                     },
1969                     Opcode::Umin => match ty {
1970                         types::I8X16 => SseOpcode::Pminub,
1971                         types::I16X8 => SseOpcode::Pminuw,
1972                         types::I32X4 => SseOpcode::Pminud,
1973                         _ => panic!("Unsupported type for packed {} instruction: {}", op, ty),
1974                     },
1975                     _ => unreachable!("This is a bug: the external and internal `match op` should be over the same opcodes."),
1976                 };
1977 
1978                 // Move the `lhs` to the same register as `dst`.
1979                 ctx.emit(Inst::gen_move(dst, lhs, ty));
1980                 ctx.emit(Inst::xmm_rm_r(sse_op, rhs, dst));
1981             } else {
1982                 panic!("Unsupported type for {} instruction: {}", op, ty);
1983             }
1984         }
1985 
1986         Opcode::Bnot => {
1987             let ty = ty.unwrap();
1988 
1989             if ty.is_vector() {
1990                 let src = put_input_in_reg(ctx, inputs[0]);
1991                 let dst = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
1992                 ctx.emit(Inst::gen_move(dst, src, ty));
1993                 let tmp = ctx.alloc_tmp(ty).only_reg().unwrap();
1994                 ctx.emit(Inst::equals(ty, RegMem::from(tmp), tmp));
1995                 ctx.emit(Inst::xor(ty, RegMem::from(tmp), dst));
1996             } else if ty == types::I128 || ty == types::B128 {
1997                 let src = put_input_in_regs(ctx, inputs[0]);
1998                 let dst = get_output_reg(ctx, outputs[0]);
1999                 ctx.emit(Inst::gen_move(dst.regs()[0], src.regs()[0], types::I64));
2000                 ctx.emit(Inst::not(OperandSize::Size64, dst.regs()[0]));
2001                 ctx.emit(Inst::gen_move(dst.regs()[1], src.regs()[1], types::I64));
2002                 ctx.emit(Inst::not(OperandSize::Size64, dst.regs()[1]));
2003             } else if ty.is_bool() {
2004                 unimplemented!("bool bnot")
2005             } else {
2006                 let src = put_input_in_reg(ctx, inputs[0]);
2007                 let dst = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
2008                 ctx.emit(Inst::gen_move(dst, src, ty));
2009                 ctx.emit(Inst::not(OperandSize::from_ty(ty), dst));
2010             }
2011         }
2012 
2013         Opcode::Bitselect => {
2014             let ty = ty.unwrap();
2015             let condition = put_input_in_reg(ctx, inputs[0]);
2016             let if_true = put_input_in_reg(ctx, inputs[1]);
2017             let if_false = input_to_reg_mem(ctx, inputs[2]);
2018             let dst = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
2019 
2020             if ty.is_vector() {
2021                 let tmp1 = ctx.alloc_tmp(ty).only_reg().unwrap();
2022                 ctx.emit(Inst::gen_move(tmp1, if_true, ty));
2023                 ctx.emit(Inst::and(ty, RegMem::reg(condition.clone()), tmp1));
2024 
2025                 let tmp2 = ctx.alloc_tmp(ty).only_reg().unwrap();
2026                 ctx.emit(Inst::gen_move(tmp2, condition, ty));
2027                 ctx.emit(Inst::and_not(ty, if_false, tmp2));
2028 
2029                 ctx.emit(Inst::gen_move(dst, tmp2.to_reg(), ty));
2030                 ctx.emit(Inst::or(ty, RegMem::from(tmp1), dst));
2031             } else {
2032                 unimplemented!("no lowering for scalar bitselect instruction")
2033             }
2034         }
2035 
2036         Opcode::Vselect => {
2037             let ty = ty.unwrap();
2038             let condition = put_input_in_reg(ctx, inputs[0]);
2039             let condition_ty = ctx.input_ty(insn, 0);
2040             let if_true = input_to_reg_mem(ctx, inputs[1]);
2041             let if_false = put_input_in_reg(ctx, inputs[2]);
2042             let dst = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
2043 
2044             if ty.is_vector() {
2045                 // `vselect` relies on the bit representation of the condition:
2046                 // vector boolean types are defined in Cranelift to be all 1s or
2047                 // all 0s. This lowering relies on that fact to use x86's
2048                 // variable blend instructions, which look at the _high_bit_ of
2049                 // the condition mask. All the bits of vector booleans will
2050                 // match (all 1s or all 0s), so we can just use the high bit.
2051                 assert!(condition_ty.lane_type().is_bool());
2052 
2053                 // Variable blend instructions expect the condition mask to be
2054                 // in XMM0.
2055                 let xmm0 = Writable::from_reg(regs::xmm0());
2056                 ctx.emit(Inst::gen_move(xmm0, condition, ty));
2057 
2058                 // Match up the source and destination registers for regalloc.
2059                 ctx.emit(Inst::gen_move(dst, if_false, ty));
2060 
2061                 // Technically PBLENDVB would work in all cases (since the bytes
2062                 // inside the mask will be all 1s or 0s we can blend
2063                 // byte-by-byte instead of word-by-word, e.g.) but
2064                 // type-specialized versions are included here for clarity when
2065                 // troubleshooting and due to slight improvements in
2066                 // latency/throughput on certain processor families.
2067                 let opcode = match condition_ty {
2068                     types::B64X2 => SseOpcode::Blendvpd,
2069                     types::B32X4 => SseOpcode::Blendvps,
2070                     types::B16X8 | types::B8X16 => SseOpcode::Pblendvb,
2071                     _ => unimplemented!("unable lower vselect for type: {}", condition_ty),
2072                 };
2073                 ctx.emit(Inst::xmm_rm_r(opcode, if_true, dst));
2074             } else {
2075                 unimplemented!("no lowering for scalar vselect instruction")
2076             }
2077         }
2078 
2079         Opcode::Ishl | Opcode::Ushr | Opcode::Sshr | Opcode::Rotl | Opcode::Rotr => {
2080             let dst_ty = ctx.output_ty(insn, 0);
2081             debug_assert_eq!(ctx.input_ty(insn, 0), dst_ty);
2082 
2083             if !dst_ty.is_vector() && dst_ty.bits() <= 64 {
2084                 // Scalar shifts on x86 have various encodings:
2085                 // - shift by one bit, e.g. `SAL r/m8, 1` (not used here)
2086                 // - shift by an immediate amount, e.g. `SAL r/m8, imm8`
2087                 // - shift by a dynamic amount but only from the CL register, e.g. `SAL r/m8, CL`.
2088                 // This implementation uses the last two encoding methods.
2089                 let (size, lhs) = match dst_ty {
2090                     types::I8 | types::I16 => match op {
2091                         Opcode::Ishl => (OperandSize::Size32, put_input_in_reg(ctx, inputs[0])),
2092                         Opcode::Ushr => (
2093                             OperandSize::Size32,
2094                             extend_input_to_reg(ctx, inputs[0], ExtSpec::ZeroExtendTo32),
2095                         ),
2096                         Opcode::Sshr => (
2097                             OperandSize::Size32,
2098                             extend_input_to_reg(ctx, inputs[0], ExtSpec::SignExtendTo32),
2099                         ),
2100                         Opcode::Rotl | Opcode::Rotr => (
2101                             OperandSize::from_ty(dst_ty),
2102                             put_input_in_reg(ctx, inputs[0]),
2103                         ),
2104                         _ => unreachable!(),
2105                     },
2106                     types::I32 | types::I64 => (
2107                         OperandSize::from_ty(dst_ty),
2108                         put_input_in_reg(ctx, inputs[0]),
2109                     ),
2110                     _ => unreachable!("unhandled output type for shift/rotates: {}", dst_ty),
2111                 };
2112 
2113                 let (count, rhs) =
2114                     if let Some(cst) = ctx.get_input_as_source_or_const(insn, 1).constant {
2115                         // Mask count, according to Cranelift's semantics.
2116                         let cst = (cst as u8) & (dst_ty.bits() as u8 - 1);
2117                         (Some(cst), None)
2118                     } else {
2119                         // We can ignore upper registers if shift amount is multi-reg, because we
2120                         // are taking the shift amount mod 2^(lhs_width) anyway.
2121                         (None, Some(put_input_in_regs(ctx, inputs[1]).regs()[0]))
2122                     };
2123 
2124                 let dst = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
2125 
2126                 let shift_kind = match op {
2127                     Opcode::Ishl => ShiftKind::ShiftLeft,
2128                     Opcode::Ushr => ShiftKind::ShiftRightLogical,
2129                     Opcode::Sshr => ShiftKind::ShiftRightArithmetic,
2130                     Opcode::Rotl => ShiftKind::RotateLeft,
2131                     Opcode::Rotr => ShiftKind::RotateRight,
2132                     _ => unreachable!(),
2133                 };
2134 
2135                 let w_rcx = Writable::from_reg(regs::rcx());
2136                 ctx.emit(Inst::mov_r_r(OperandSize::Size64, lhs, dst));
2137                 if count.is_none() {
2138                     ctx.emit(Inst::mov_r_r(OperandSize::Size64, rhs.unwrap(), w_rcx));
2139                 }
2140                 ctx.emit(Inst::shift_r(size, shift_kind, count, dst));
2141             } else if dst_ty == types::I128 {
2142                 let amt_src = put_input_in_regs(ctx, inputs[1]).regs()[0];
2143                 let src = put_input_in_regs(ctx, inputs[0]);
2144                 let dst = get_output_reg(ctx, outputs[0]);
2145 
2146                 match op {
2147                     Opcode::Ishl => {
2148                         emit_shl_i128(ctx, src, dst, amt_src);
2149                     }
2150                     Opcode::Ushr => {
2151                         emit_shr_i128(ctx, src, dst, amt_src, /* is_signed = */ false);
2152                     }
2153                     Opcode::Sshr => {
2154                         emit_shr_i128(ctx, src, dst, amt_src, /* is_signed = */ true);
2155                     }
2156                     Opcode::Rotl => {
2157                         // (mov tmp, src)
2158                         // (shl.i128 tmp, amt)
2159                         // (mov dst, src)
2160                         // (ushr.i128 dst, 128-amt)
2161                         // (or dst, tmp)
2162                         let tmp = ctx.alloc_tmp(types::I128);
2163                         emit_shl_i128(ctx, src, tmp, amt_src);
2164                         let inv_amt = ctx.alloc_tmp(types::I64).only_reg().unwrap();
2165                         ctx.emit(Inst::imm(OperandSize::Size64, 128, inv_amt));
2166                         ctx.emit(Inst::alu_rmi_r(
2167                             OperandSize::Size64,
2168                             AluRmiROpcode::Sub,
2169                             RegMemImm::reg(amt_src),
2170                             inv_amt,
2171                         ));
2172                         emit_shr_i128(
2173                             ctx,
2174                             src,
2175                             dst,
2176                             inv_amt.to_reg(),
2177                             /* is_signed = */ false,
2178                         );
2179                         ctx.emit(Inst::alu_rmi_r(
2180                             OperandSize::Size64,
2181                             AluRmiROpcode::Or,
2182                             RegMemImm::reg(tmp.regs()[0].to_reg()),
2183                             dst.regs()[0],
2184                         ));
2185                         ctx.emit(Inst::alu_rmi_r(
2186                             OperandSize::Size64,
2187                             AluRmiROpcode::Or,
2188                             RegMemImm::reg(tmp.regs()[1].to_reg()),
2189                             dst.regs()[1],
2190                         ));
2191                     }
2192                     Opcode::Rotr => {
2193                         // (mov tmp, src)
2194                         // (ushr.i128 tmp, amt)
2195                         // (mov dst, src)
2196                         // (shl.i128 dst, 128-amt)
2197                         // (or dst, tmp)
2198                         let tmp = ctx.alloc_tmp(types::I128);
2199                         emit_shr_i128(ctx, src, tmp, amt_src, /* is_signed = */ false);
2200                         let inv_amt = ctx.alloc_tmp(types::I64).only_reg().unwrap();
2201                         ctx.emit(Inst::imm(OperandSize::Size64, 128, inv_amt));
2202                         ctx.emit(Inst::alu_rmi_r(
2203                             OperandSize::Size64,
2204                             AluRmiROpcode::Sub,
2205                             RegMemImm::reg(amt_src),
2206                             inv_amt,
2207                         ));
2208                         emit_shl_i128(ctx, src, dst, inv_amt.to_reg());
2209                         ctx.emit(Inst::alu_rmi_r(
2210                             OperandSize::Size64,
2211                             AluRmiROpcode::Or,
2212                             RegMemImm::reg(tmp.regs()[0].to_reg()),
2213                             dst.regs()[0],
2214                         ));
2215                         ctx.emit(Inst::alu_rmi_r(
2216                             OperandSize::Size64,
2217                             AluRmiROpcode::Or,
2218                             RegMemImm::reg(tmp.regs()[1].to_reg()),
2219                             dst.regs()[1],
2220                         ));
2221                     }
2222                     _ => unreachable!(),
2223                 }
2224             } else if dst_ty == types::I8X16 && (op == Opcode::Ishl || op == Opcode::Ushr) {
2225                 // Since the x86 instruction set does not have any 8x16 shift instructions (even in higher feature sets
2226                 // like AVX), we lower the `ishl.i8x16` and `ushr.i8x16` to a sequence of instructions. The basic idea,
2227                 // whether the `shift_by` amount is an immediate or not, is to use a 16x8 shift and then mask off the
2228                 // incorrect bits to 0s (see below for handling signs in `sshr.i8x16`).
2229                 let src = put_input_in_reg(ctx, inputs[0]);
2230                 let shift_by = input_to_reg_mem_imm(ctx, inputs[1]);
2231                 let dst = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
2232 
2233                 // If necessary, move the shift index into the lowest bits of a vector register.
2234                 let shift_by_moved = match &shift_by {
2235                     RegMemImm::Imm { .. } => shift_by.clone(),
2236                     RegMemImm::Reg { reg } => {
2237                         let tmp_shift_by = ctx.alloc_tmp(dst_ty).only_reg().unwrap();
2238                         ctx.emit(Inst::gpr_to_xmm(
2239                             SseOpcode::Movd,
2240                             RegMem::reg(*reg),
2241                             OperandSize::Size32,
2242                             tmp_shift_by,
2243                         ));
2244                         RegMemImm::reg(tmp_shift_by.to_reg())
2245                     }
2246                     RegMemImm::Mem { .. } => unimplemented!("load shift amount to XMM register"),
2247                 };
2248 
2249                 // Shift `src` using 16x8. Unfortunately, a 16x8 shift will only be correct for half of the lanes;
2250                 // the others must be fixed up with the mask below.
2251                 let shift_opcode = match op {
2252                     Opcode::Ishl => SseOpcode::Psllw,
2253                     Opcode::Ushr => SseOpcode::Psrlw,
2254                     _ => unimplemented!("{} is not implemented for type {}", op, dst_ty),
2255                 };
2256                 ctx.emit(Inst::gen_move(dst, src, dst_ty));
2257                 ctx.emit(Inst::xmm_rmi_reg(shift_opcode, shift_by_moved, dst));
2258 
2259                 // Choose which mask to use to fixup the shifted lanes. Since we must use a 16x8 shift, we need to fix
2260                 // up the bits that migrate from one half of the lane to the other. Each 16-byte mask (which rustfmt
2261                 // forces to multiple lines) is indexed by the shift amount: e.g. if we shift right by 0 (no movement),
2262                 // we want to retain all the bits so we mask with `0xff`; if we shift right by 1, we want to retain all
2263                 // bits except the MSB so we mask with `0x7f`; etc.
2264                 const USHR_MASKS: [u8; 128] = [
2265                     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
2266                     0xff, 0xff, 0xff, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f,
2267                     0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f,
2268                     0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x1f, 0x1f, 0x1f, 0x1f,
2269                     0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x0f,
2270                     0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f,
2271                     0x0f, 0x0f, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07,
2272                     0x07, 0x07, 0x07, 0x07, 0x07, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03,
2273                     0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x01, 0x01, 0x01, 0x01, 0x01,
2274                     0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01,
2275                 ];
2276                 const SHL_MASKS: [u8; 128] = [
2277                     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
2278                     0xff, 0xff, 0xff, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe,
2279                     0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfc, 0xfc, 0xfc, 0xfc, 0xfc, 0xfc, 0xfc,
2280                     0xfc, 0xfc, 0xfc, 0xfc, 0xfc, 0xfc, 0xfc, 0xfc, 0xfc, 0xf8, 0xf8, 0xf8, 0xf8,
2281                     0xf8, 0xf8, 0xf8, 0xf8, 0xf8, 0xf8, 0xf8, 0xf8, 0xf8, 0xf8, 0xf8, 0xf8, 0xf0,
2282                     0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0,
2283                     0xf0, 0xf0, 0xe0, 0xe0, 0xe0, 0xe0, 0xe0, 0xe0, 0xe0, 0xe0, 0xe0, 0xe0, 0xe0,
2284                     0xe0, 0xe0, 0xe0, 0xe0, 0xe0, 0xc0, 0xc0, 0xc0, 0xc0, 0xc0, 0xc0, 0xc0, 0xc0,
2285                     0xc0, 0xc0, 0xc0, 0xc0, 0xc0, 0xc0, 0xc0, 0xc0, 0x80, 0x80, 0x80, 0x80, 0x80,
2286                     0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
2287                 ];
2288                 let mask = match op {
2289                     Opcode::Ishl => &SHL_MASKS,
2290                     Opcode::Ushr => &USHR_MASKS,
2291                     _ => unimplemented!("{} is not implemented for type {}", op, dst_ty),
2292                 };
2293 
2294                 // Figure out the address of the shift mask.
2295                 let mask_address = match shift_by {
2296                     RegMemImm::Imm { simm32 } => {
2297                         // When the shift amount is known, we can statically (i.e. at compile time) determine the mask to
2298                         // use and only emit that.
2299                         debug_assert!(simm32 < 8);
2300                         let mask_offset = simm32 as usize * 16;
2301                         let mask_constant = ctx.use_constant(VCodeConstantData::WellKnown(
2302                             &mask[mask_offset..mask_offset + 16],
2303                         ));
2304                         SyntheticAmode::ConstantOffset(mask_constant)
2305                     }
2306                     RegMemImm::Reg { reg } => {
2307                         // Otherwise, we must emit the entire mask table and dynamically (i.e. at run time) find the correct
2308                         // mask offset in the table. We do this use LEA to find the base address of the mask table and then
2309                         // complex addressing to offset to the right mask: `base_address + shift_by * 4`
2310                         let base_mask_address = ctx.alloc_tmp(types::I64).only_reg().unwrap();
2311                         let mask_offset = ctx.alloc_tmp(types::I64).only_reg().unwrap();
2312                         let mask_constant = ctx.use_constant(VCodeConstantData::WellKnown(mask));
2313                         ctx.emit(Inst::lea(
2314                             SyntheticAmode::ConstantOffset(mask_constant),
2315                             base_mask_address,
2316                         ));
2317                         ctx.emit(Inst::gen_move(mask_offset, reg, types::I64));
2318                         ctx.emit(Inst::shift_r(
2319                             OperandSize::Size64,
2320                             ShiftKind::ShiftLeft,
2321                             Some(4),
2322                             mask_offset,
2323                         ));
2324                         Amode::imm_reg_reg_shift(
2325                             0,
2326                             base_mask_address.to_reg(),
2327                             mask_offset.to_reg(),
2328                             0,
2329                         )
2330                         .into()
2331                     }
2332                     RegMemImm::Mem { addr: _ } => unimplemented!("load mask address"),
2333                 };
2334 
2335                 // Load the mask into a temporary register, `mask_value`.
2336                 let mask_value = ctx.alloc_tmp(dst_ty).only_reg().unwrap();
2337                 ctx.emit(Inst::load(dst_ty, mask_address, mask_value, ExtKind::None));
2338 
2339                 // Remove the bits that would have disappeared in a true 8x16 shift. TODO in the future,
2340                 // this AND instruction could be coalesced with the load above.
2341                 let sse_op = match dst_ty {
2342                     types::F32X4 => SseOpcode::Andps,
2343                     types::F64X2 => SseOpcode::Andpd,
2344                     _ => SseOpcode::Pand,
2345                 };
2346                 ctx.emit(Inst::xmm_rm_r(sse_op, RegMem::from(mask_value), dst));
2347             } else if dst_ty == types::I8X16 && op == Opcode::Sshr {
2348                 // Since the x86 instruction set does not have an 8x16 shift instruction and the approach used for
2349                 // `ishl` and `ushr` cannot be easily used (the masks do not preserve the sign), we use a different
2350                 // approach here: separate the low and high lanes, shift them separately, and merge them into the final
2351                 // result. Visually, this looks like the following, where `src.i8x16 = [s0, s1, ..., s15]:
2352                 //   low.i16x8 = [(s0, s0), (s1, s1), ..., (s7, s7)]
2353                 //   shifted_low.i16x8 = shift each lane of `low`
2354                 //   high.i16x8 = [(s8, s8), (s9, s9), ..., (s15, s15)]
2355                 //   shifted_high.i16x8 = shift each lane of `high`
2356                 //   dst.i8x16 = [s0'', s1'', ..., s15'']
2357                 let src = put_input_in_reg(ctx, inputs[0]);
2358                 let shift_by = input_to_reg_mem_imm(ctx, inputs[1]);
2359                 let shift_by_ty = ctx.input_ty(insn, 1);
2360                 let dst = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
2361 
2362                 // In order for PACKSSWB later to only use the high byte of each 16x8 lane, we shift right an extra 8
2363                 // bits, relying on PSRAW to fill in the upper bits appropriately.
2364                 let bigger_shift_by = match shift_by {
2365                     // When we know the shift amount at compile time, we add the extra shift amount statically.
2366                     RegMemImm::Imm { simm32 } => RegMemImm::imm(simm32 + 8),
2367                     // Otherwise we add instructions to add the extra shift amount and move the value into an XMM
2368                     // register.
2369                     RegMemImm::Reg { reg } => {
2370                         let bigger_shift_by_gpr = ctx.alloc_tmp(shift_by_ty).only_reg().unwrap();
2371                         ctx.emit(Inst::mov_r_r(OperandSize::Size64, reg, bigger_shift_by_gpr));
2372 
2373                         let size = if shift_by_ty == types::I64 {
2374                             OperandSize::Size64
2375                         } else {
2376                             OperandSize::Size32
2377                         };
2378                         let imm = RegMemImm::imm(8);
2379                         ctx.emit(Inst::alu_rmi_r(
2380                             size,
2381                             AluRmiROpcode::Add,
2382                             imm,
2383                             bigger_shift_by_gpr,
2384                         ));
2385 
2386                         let bigger_shift_by_xmm = ctx.alloc_tmp(dst_ty).only_reg().unwrap();
2387                         ctx.emit(Inst::gpr_to_xmm(
2388                             SseOpcode::Movd,
2389                             RegMem::from(bigger_shift_by_gpr),
2390                             OperandSize::Size32,
2391                             bigger_shift_by_xmm,
2392                         ));
2393                         RegMemImm::reg(bigger_shift_by_xmm.to_reg())
2394                     }
2395                     RegMemImm::Mem { .. } => unimplemented!("load shift amount to XMM register"),
2396                 };
2397 
2398                 // Unpack and shift the lower lanes of `src` into the `dst` register.
2399                 ctx.emit(Inst::gen_move(dst, src, dst_ty));
2400                 ctx.emit(Inst::xmm_rm_r(SseOpcode::Punpcklbw, RegMem::from(dst), dst));
2401                 ctx.emit(Inst::xmm_rmi_reg(
2402                     SseOpcode::Psraw,
2403                     bigger_shift_by.clone(),
2404                     dst,
2405                 ));
2406 
2407                 // Unpack and shift the upper lanes of `src` into a temporary register, `upper_lanes`.
2408                 let upper_lanes = ctx.alloc_tmp(dst_ty).only_reg().unwrap();
2409                 ctx.emit(Inst::gen_move(upper_lanes, src, dst_ty));
2410                 ctx.emit(Inst::xmm_rm_r(
2411                     SseOpcode::Punpckhbw,
2412                     RegMem::from(upper_lanes),
2413                     upper_lanes,
2414                 ));
2415                 ctx.emit(Inst::xmm_rmi_reg(
2416                     SseOpcode::Psraw,
2417                     bigger_shift_by,
2418                     upper_lanes,
2419                 ));
2420 
2421                 // Merge the upper and lower shifted lanes into `dst`.
2422                 ctx.emit(Inst::xmm_rm_r(
2423                     SseOpcode::Packsswb,
2424                     RegMem::from(upper_lanes),
2425                     dst,
2426                 ));
2427             } else if dst_ty == types::I64X2 && op == Opcode::Sshr {
2428                 // The `sshr.i8x16` CLIF instruction has no single x86 instruction in the older feature sets; newer ones
2429                 // like AVX512VL and AVX512F include VPSRAQ, a 128-bit instruction that would fit here, but this backend
2430                 // does not currently have support for EVEX encodings (TODO when EVEX support is available, add an
2431                 // alternate lowering here). To remedy this, we extract each 64-bit lane to a GPR, shift each using a
2432                 // scalar instruction, and insert the shifted values back in the `dst` XMM register.
2433                 let src = put_input_in_reg(ctx, inputs[0]);
2434                 let dst = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
2435                 ctx.emit(Inst::gen_move(dst, src, dst_ty));
2436 
2437                 // Extract the upper and lower lanes into temporary GPRs.
2438                 let lower_lane = ctx.alloc_tmp(types::I64).only_reg().unwrap();
2439                 emit_extract_lane(ctx, src, lower_lane, 0, types::I64);
2440                 let upper_lane = ctx.alloc_tmp(types::I64).only_reg().unwrap();
2441                 emit_extract_lane(ctx, src, upper_lane, 1, types::I64);
2442 
2443                 // Shift each value.
2444                 let mut shift = |reg: Writable<Reg>| {
2445                     let kind = ShiftKind::ShiftRightArithmetic;
2446                     if let Some(shift_by) = ctx.get_input_as_source_or_const(insn, 1).constant {
2447                         // Mask the shift amount according to Cranelift's semantics.
2448                         let shift_by = (shift_by as u8) & (types::I64.bits() as u8 - 1);
2449                         ctx.emit(Inst::shift_r(
2450                             OperandSize::Size64,
2451                             kind,
2452                             Some(shift_by),
2453                             reg,
2454                         ));
2455                     } else {
2456                         let dynamic_shift_by = put_input_in_reg(ctx, inputs[1]);
2457                         let w_rcx = Writable::from_reg(regs::rcx());
2458                         ctx.emit(Inst::mov_r_r(OperandSize::Size64, dynamic_shift_by, w_rcx));
2459                         ctx.emit(Inst::shift_r(OperandSize::Size64, kind, None, reg));
2460                     };
2461                 };
2462                 shift(lower_lane);
2463                 shift(upper_lane);
2464 
2465                 // Insert the scalar values back into the `dst` vector.
2466                 emit_insert_lane(ctx, RegMem::from(lower_lane), dst, 0, types::I64);
2467                 emit_insert_lane(ctx, RegMem::from(upper_lane), dst, 1, types::I64);
2468             } else {
2469                 // For the remaining packed shifts not covered above, x86 has implementations that can either:
2470                 // - shift using an immediate
2471                 // - shift using a dynamic value given in the lower bits of another XMM register.
2472                 let src = put_input_in_reg(ctx, inputs[0]);
2473                 let shift_by = input_to_reg_mem_imm(ctx, inputs[1]);
2474                 let dst = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
2475                 let sse_op = match dst_ty {
2476                     types::I16X8 => match op {
2477                         Opcode::Ishl => SseOpcode::Psllw,
2478                         Opcode::Ushr => SseOpcode::Psrlw,
2479                         Opcode::Sshr => SseOpcode::Psraw,
2480                         _ => unimplemented!("{} is not implemented for type {}", op, dst_ty),
2481                     },
2482                     types::I32X4 => match op {
2483                         Opcode::Ishl => SseOpcode::Pslld,
2484                         Opcode::Ushr => SseOpcode::Psrld,
2485                         Opcode::Sshr => SseOpcode::Psrad,
2486                         _ => unimplemented!("{} is not implemented for type {}", op, dst_ty),
2487                     },
2488                     types::I64X2 => match op {
2489                         Opcode::Ishl => SseOpcode::Psllq,
2490                         Opcode::Ushr => SseOpcode::Psrlq,
2491                         _ => unimplemented!("{} is not implemented for type {}", op, dst_ty),
2492                     },
2493                     _ => unreachable!(),
2494                 };
2495 
2496                 // If necessary, move the shift index into the lowest bits of a vector register.
2497                 let shift_by = match shift_by {
2498                     RegMemImm::Imm { .. } => shift_by,
2499                     RegMemImm::Reg { reg } => {
2500                         let tmp_shift_by = ctx.alloc_tmp(dst_ty).only_reg().unwrap();
2501                         ctx.emit(Inst::gpr_to_xmm(
2502                             SseOpcode::Movd,
2503                             RegMem::reg(reg),
2504                             OperandSize::Size32,
2505                             tmp_shift_by,
2506                         ));
2507                         RegMemImm::reg(tmp_shift_by.to_reg())
2508                     }
2509                     RegMemImm::Mem { .. } => unimplemented!("load shift amount to XMM register"),
2510                 };
2511 
2512                 // Move the `src` to the same register as `dst`.
2513                 ctx.emit(Inst::gen_move(dst, src, dst_ty));
2514 
2515                 ctx.emit(Inst::xmm_rmi_reg(sse_op, shift_by, dst));
2516             }
2517         }
2518 
2519         Opcode::Ineg => {
2520             let dst = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
2521             let ty = ty.unwrap();
2522 
2523             if ty.is_vector() {
2524                 // Zero's out a register and then does a packed subtraction
2525                 // of the input from the register.
2526 
2527                 let src = input_to_reg_mem(ctx, inputs[0]);
2528                 let tmp = ctx.alloc_tmp(types::I32X4).only_reg().unwrap();
2529 
2530                 let subtract_opcode = match ty {
2531                     types::I8X16 => SseOpcode::Psubb,
2532                     types::I16X8 => SseOpcode::Psubw,
2533                     types::I32X4 => SseOpcode::Psubd,
2534                     types::I64X2 => SseOpcode::Psubq,
2535                     _ => panic!("Unsupported type for Ineg instruction, found {}", ty),
2536                 };
2537 
2538                 // Note we must zero out a tmp instead of using the destination register since
2539                 // the desitnation could be an alias for the source input register
2540                 ctx.emit(Inst::xmm_rm_r(
2541                     SseOpcode::Pxor,
2542                     RegMem::reg(tmp.to_reg()),
2543                     tmp,
2544                 ));
2545                 ctx.emit(Inst::xmm_rm_r(subtract_opcode, src, tmp));
2546                 ctx.emit(Inst::xmm_unary_rm_r(
2547                     SseOpcode::Movapd,
2548                     RegMem::reg(tmp.to_reg()),
2549                     dst,
2550                 ));
2551             } else {
2552                 let src = put_input_in_reg(ctx, inputs[0]);
2553                 ctx.emit(Inst::gen_move(dst, src, ty));
2554                 ctx.emit(Inst::neg(OperandSize::from_ty(ty), dst));
2555             }
2556         }
2557 
2558         Opcode::Clz => {
2559             let orig_ty = ty.unwrap();
2560 
2561             if isa_flags.use_lzcnt() && (orig_ty == types::I32 || orig_ty == types::I64) {
2562                 // We can use a plain lzcnt instruction here. Note no special handling is required
2563                 // for zero inputs, because the machine instruction does what the CLIF expects for
2564                 // zero, i.e. it returns zero.
2565                 let src = input_to_reg_mem(ctx, inputs[0]);
2566                 let dst = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
2567                 ctx.emit(Inst::unary_rm_r(
2568                     OperandSize::from_ty(orig_ty),
2569                     UnaryRmROpcode::Lzcnt,
2570                     src,
2571                     dst,
2572                 ));
2573                 return Ok(());
2574             }
2575 
2576             // General formula using bit-scan reverse (BSR):
2577             // mov -1, %dst
2578             // bsr %src, %tmp
2579             // cmovz %dst, %tmp
2580             // mov $(size_bits - 1), %dst
2581             // sub %tmp, %dst
2582 
2583             if orig_ty == types::I128 {
2584                 // clz upper, tmp1
2585                 // clz lower, dst
2586                 // add dst, 64
2587                 // cmp tmp1, 64
2588                 // cmovnz tmp1, dst
2589                 let dsts = get_output_reg(ctx, outputs[0]);
2590                 let dst = dsts.regs()[0];
2591                 let tmp1 = ctx.alloc_tmp(types::I64).only_reg().unwrap();
2592                 let srcs = put_input_in_regs(ctx, inputs[0]);
2593                 let src_lo = srcs.regs()[0];
2594                 let src_hi = srcs.regs()[1];
2595                 emit_clz(ctx, types::I64, types::I64, src_hi, tmp1);
2596                 emit_clz(ctx, types::I64, types::I64, src_lo, dst);
2597                 ctx.emit(Inst::alu_rmi_r(
2598                     OperandSize::Size64,
2599                     AluRmiROpcode::Add,
2600                     RegMemImm::imm(64),
2601                     dst,
2602                 ));
2603                 ctx.emit(Inst::cmp_rmi_r(
2604                     OperandSize::Size64,
2605                     RegMemImm::imm(64),
2606                     tmp1.to_reg(),
2607                 ));
2608                 ctx.emit(Inst::cmove(
2609                     OperandSize::Size64,
2610                     CC::NZ,
2611                     RegMem::reg(tmp1.to_reg()),
2612                     dst,
2613                 ));
2614                 ctx.emit(Inst::alu_rmi_r(
2615                     OperandSize::Size64,
2616                     AluRmiROpcode::Xor,
2617                     RegMemImm::reg(dsts.regs()[1].to_reg()),
2618                     dsts.regs()[1],
2619                 ));
2620             } else {
2621                 let (ext_spec, ty) = match orig_ty {
2622                     types::I8 | types::I16 => (Some(ExtSpec::ZeroExtendTo32), types::I32),
2623                     a if a == types::I32 || a == types::I64 => (None, a),
2624                     _ => unreachable!(),
2625                 };
2626                 let src = if let Some(ext_spec) = ext_spec {
2627                     extend_input_to_reg(ctx, inputs[0], ext_spec)
2628                 } else {
2629                     put_input_in_reg(ctx, inputs[0])
2630                 };
2631 
2632                 let dst = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
2633                 emit_clz(ctx, orig_ty, ty, src, dst);
2634             }
2635         }
2636 
2637         Opcode::Ctz => {
2638             let orig_ty = ctx.input_ty(insn, 0);
2639 
2640             if isa_flags.use_bmi1() && (orig_ty == types::I32 || orig_ty == types::I64) {
2641                 // We can use a plain tzcnt instruction here. Note no special handling is required
2642                 // for zero inputs, because the machine instruction does what the CLIF expects for
2643                 // zero, i.e. it returns zero.
2644                 let src = input_to_reg_mem(ctx, inputs[0]);
2645                 let dst = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
2646                 ctx.emit(Inst::unary_rm_r(
2647                     OperandSize::from_ty(orig_ty),
2648                     UnaryRmROpcode::Tzcnt,
2649                     src,
2650                     dst,
2651                 ));
2652                 return Ok(());
2653             }
2654 
2655             // General formula using bit-scan forward (BSF):
2656             // bsf %src, %dst
2657             // mov $(size_bits), %tmp
2658             // cmovz %tmp, %dst
2659             if orig_ty == types::I128 {
2660                 // ctz src_lo, dst
2661                 // ctz src_hi, tmp1
2662                 // add tmp1, 64
2663                 // cmp dst, 64
2664                 // cmovz tmp1, dst
2665                 let dsts = get_output_reg(ctx, outputs[0]);
2666                 let dst = dsts.regs()[0];
2667                 let tmp1 = ctx.alloc_tmp(types::I64).only_reg().unwrap();
2668                 let srcs = put_input_in_regs(ctx, inputs[0]);
2669                 let src_lo = srcs.regs()[0];
2670                 let src_hi = srcs.regs()[1];
2671                 emit_ctz(ctx, types::I64, types::I64, src_lo, dst);
2672                 emit_ctz(ctx, types::I64, types::I64, src_hi, tmp1);
2673                 ctx.emit(Inst::alu_rmi_r(
2674                     OperandSize::Size64,
2675                     AluRmiROpcode::Add,
2676                     RegMemImm::imm(64),
2677                     tmp1,
2678                 ));
2679                 ctx.emit(Inst::cmp_rmi_r(
2680                     OperandSize::Size64,
2681                     RegMemImm::imm(64),
2682                     dst.to_reg(),
2683                 ));
2684                 ctx.emit(Inst::cmove(
2685                     OperandSize::Size64,
2686                     CC::Z,
2687                     RegMem::reg(tmp1.to_reg()),
2688                     dst,
2689                 ));
2690                 ctx.emit(Inst::alu_rmi_r(
2691                     OperandSize::Size64,
2692                     AluRmiROpcode::Xor,
2693                     RegMemImm::reg(dsts.regs()[1].to_reg()),
2694                     dsts.regs()[1],
2695                 ));
2696             } else {
2697                 let ty = if orig_ty.bits() < 32 {
2698                     types::I32
2699                 } else {
2700                     orig_ty
2701                 };
2702                 debug_assert!(ty == types::I32 || ty == types::I64);
2703 
2704                 let src = put_input_in_reg(ctx, inputs[0]);
2705                 let dst = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
2706                 emit_ctz(ctx, orig_ty, ty, src, dst);
2707             }
2708         }
2709 
2710         Opcode::Popcnt => {
2711             let (ext_spec, ty) = match ctx.input_ty(insn, 0) {
2712                 types::I8 | types::I16 => (Some(ExtSpec::ZeroExtendTo32), types::I32),
2713                 a if a == types::I32 || a == types::I64 || a == types::I128 => (None, a),
2714                 _ => unreachable!(),
2715             };
2716 
2717             if isa_flags.use_popcnt() {
2718                 match ty {
2719                     types::I32 | types::I64 => {
2720                         let src = input_to_reg_mem(ctx, inputs[0]);
2721                         let dst = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
2722                         ctx.emit(Inst::unary_rm_r(
2723                             OperandSize::from_ty(ty),
2724                             UnaryRmROpcode::Popcnt,
2725                             src,
2726                             dst,
2727                         ));
2728                         return Ok(());
2729                     }
2730 
2731                     types::I128 => {
2732                         // The number of ones in a 128-bits value is the plain sum of the number of
2733                         // ones in its low and high parts. No risk of overflow here.
2734                         let dsts = get_output_reg(ctx, outputs[0]);
2735                         let dst = dsts.regs()[0];
2736                         let tmp = ctx.alloc_tmp(types::I64).only_reg().unwrap();
2737                         let srcs = put_input_in_regs(ctx, inputs[0]);
2738                         let src_lo = srcs.regs()[0];
2739                         let src_hi = srcs.regs()[1];
2740 
2741                         ctx.emit(Inst::unary_rm_r(
2742                             OperandSize::Size64,
2743                             UnaryRmROpcode::Popcnt,
2744                             RegMem::reg(src_lo),
2745                             dst,
2746                         ));
2747                         ctx.emit(Inst::unary_rm_r(
2748                             OperandSize::Size64,
2749                             UnaryRmROpcode::Popcnt,
2750                             RegMem::reg(src_hi),
2751                             tmp,
2752                         ));
2753                         ctx.emit(Inst::alu_rmi_r(
2754                             OperandSize::Size64,
2755                             AluRmiROpcode::Add,
2756                             RegMemImm::reg(tmp.to_reg()),
2757                             dst,
2758                         ));
2759 
2760                         // Zero the result's high component.
2761                         ctx.emit(Inst::alu_rmi_r(
2762                             OperandSize::Size64,
2763                             AluRmiROpcode::Xor,
2764                             RegMemImm::reg(dsts.regs()[1].to_reg()),
2765                             dsts.regs()[1],
2766                         ));
2767 
2768                         return Ok(());
2769                     }
2770                     _ => {}
2771                 }
2772             }
2773 
2774             let (srcs, ty): (SmallVec<[RegMem; 2]>, Type) = if let Some(ext_spec) = ext_spec {
2775                 (
2776                     smallvec![RegMem::reg(extend_input_to_reg(ctx, inputs[0], ext_spec))],
2777                     ty,
2778                 )
2779             } else if ty == types::I128 {
2780                 let regs = put_input_in_regs(ctx, inputs[0]);
2781                 (
2782                     smallvec![RegMem::reg(regs.regs()[0]), RegMem::reg(regs.regs()[1])],
2783                     types::I64,
2784                 )
2785             } else {
2786                 // N.B.: explicitly put input in a reg here because the width of the instruction
2787                 // into which this RM op goes may not match the width of the input type (in fact,
2788                 // it won't for i32.popcnt), and we don't want a larger than necessary load.
2789                 (smallvec![RegMem::reg(put_input_in_reg(ctx, inputs[0]))], ty)
2790             };
2791 
2792             let mut dsts: SmallVec<[Reg; 2]> = smallvec![];
2793             for src in srcs {
2794                 let dst = ctx.alloc_tmp(types::I64).only_reg().unwrap();
2795                 dsts.push(dst.to_reg());
2796                 if ty == types::I64 {
2797                     let tmp1 = ctx.alloc_tmp(types::I64).only_reg().unwrap();
2798                     let tmp2 = ctx.alloc_tmp(types::I64).only_reg().unwrap();
2799                     let cst = ctx.alloc_tmp(types::I64).only_reg().unwrap();
2800 
2801                     // mov src, tmp1
2802                     ctx.emit(Inst::mov64_rm_r(src.clone(), tmp1));
2803 
2804                     // shr $1, tmp1
2805                     ctx.emit(Inst::shift_r(
2806                         OperandSize::Size64,
2807                         ShiftKind::ShiftRightLogical,
2808                         Some(1),
2809                         tmp1,
2810                     ));
2811 
2812                     // mov 0x7777_7777_7777_7777, cst
2813                     ctx.emit(Inst::imm(OperandSize::Size64, 0x7777777777777777, cst));
2814 
2815                     // andq cst, tmp1
2816                     ctx.emit(Inst::alu_rmi_r(
2817                         OperandSize::Size64,
2818                         AluRmiROpcode::And,
2819                         RegMemImm::reg(cst.to_reg()),
2820                         tmp1,
2821                     ));
2822 
2823                     // mov src, tmp2
2824                     ctx.emit(Inst::mov64_rm_r(src, tmp2));
2825 
2826                     // sub tmp1, tmp2
2827                     ctx.emit(Inst::alu_rmi_r(
2828                         OperandSize::Size64,
2829                         AluRmiROpcode::Sub,
2830                         RegMemImm::reg(tmp1.to_reg()),
2831                         tmp2,
2832                     ));
2833 
2834                     // shr $1, tmp1
2835                     ctx.emit(Inst::shift_r(
2836                         OperandSize::Size64,
2837                         ShiftKind::ShiftRightLogical,
2838                         Some(1),
2839                         tmp1,
2840                     ));
2841 
2842                     // and cst, tmp1
2843                     ctx.emit(Inst::alu_rmi_r(
2844                         OperandSize::Size64,
2845                         AluRmiROpcode::And,
2846                         RegMemImm::reg(cst.to_reg()),
2847                         tmp1,
2848                     ));
2849 
2850                     // sub tmp1, tmp2
2851                     ctx.emit(Inst::alu_rmi_r(
2852                         OperandSize::Size64,
2853                         AluRmiROpcode::Sub,
2854                         RegMemImm::reg(tmp1.to_reg()),
2855                         tmp2,
2856                     ));
2857 
2858                     // shr $1, tmp1
2859                     ctx.emit(Inst::shift_r(
2860                         OperandSize::Size64,
2861                         ShiftKind::ShiftRightLogical,
2862                         Some(1),
2863                         tmp1,
2864                     ));
2865 
2866                     // and cst, tmp1
2867                     ctx.emit(Inst::alu_rmi_r(
2868                         OperandSize::Size64,
2869                         AluRmiROpcode::And,
2870                         RegMemImm::reg(cst.to_reg()),
2871                         tmp1,
2872                     ));
2873 
2874                     // sub tmp1, tmp2
2875                     ctx.emit(Inst::alu_rmi_r(
2876                         OperandSize::Size64,
2877                         AluRmiROpcode::Sub,
2878                         RegMemImm::reg(tmp1.to_reg()),
2879                         tmp2,
2880                     ));
2881 
2882                     // mov tmp2, dst
2883                     ctx.emit(Inst::mov64_rm_r(RegMem::reg(tmp2.to_reg()), dst));
2884 
2885                     // shr $4, dst
2886                     ctx.emit(Inst::shift_r(
2887                         OperandSize::Size64,
2888                         ShiftKind::ShiftRightLogical,
2889                         Some(4),
2890                         dst,
2891                     ));
2892 
2893                     // add tmp2, dst
2894                     ctx.emit(Inst::alu_rmi_r(
2895                         OperandSize::Size64,
2896                         AluRmiROpcode::Add,
2897                         RegMemImm::reg(tmp2.to_reg()),
2898                         dst,
2899                     ));
2900 
2901                     // mov $0x0F0F_0F0F_0F0F_0F0F, cst
2902                     ctx.emit(Inst::imm(OperandSize::Size64, 0x0F0F0F0F0F0F0F0F, cst));
2903 
2904                     // and cst, dst
2905                     ctx.emit(Inst::alu_rmi_r(
2906                         OperandSize::Size64,
2907                         AluRmiROpcode::And,
2908                         RegMemImm::reg(cst.to_reg()),
2909                         dst,
2910                     ));
2911 
2912                     // mov $0x0101_0101_0101_0101, cst
2913                     ctx.emit(Inst::imm(OperandSize::Size64, 0x0101010101010101, cst));
2914 
2915                     // mul cst, dst
2916                     ctx.emit(Inst::alu_rmi_r(
2917                         OperandSize::Size64,
2918                         AluRmiROpcode::Mul,
2919                         RegMemImm::reg(cst.to_reg()),
2920                         dst,
2921                     ));
2922 
2923                     // shr $56, dst
2924                     ctx.emit(Inst::shift_r(
2925                         OperandSize::Size64,
2926                         ShiftKind::ShiftRightLogical,
2927                         Some(56),
2928                         dst,
2929                     ));
2930                 } else {
2931                     assert_eq!(ty, types::I32);
2932 
2933                     let tmp1 = ctx.alloc_tmp(types::I64).only_reg().unwrap();
2934                     let tmp2 = ctx.alloc_tmp(types::I64).only_reg().unwrap();
2935 
2936                     // mov src, tmp1
2937                     ctx.emit(Inst::mov64_rm_r(src.clone(), tmp1));
2938 
2939                     // shr $1, tmp1
2940                     ctx.emit(Inst::shift_r(
2941                         OperandSize::Size32,
2942                         ShiftKind::ShiftRightLogical,
2943                         Some(1),
2944                         tmp1,
2945                     ));
2946 
2947                     // andq $0x7777_7777, tmp1
2948                     ctx.emit(Inst::alu_rmi_r(
2949                         OperandSize::Size32,
2950                         AluRmiROpcode::And,
2951                         RegMemImm::imm(0x77777777),
2952                         tmp1,
2953                     ));
2954 
2955                     // mov src, tmp2
2956                     ctx.emit(Inst::mov64_rm_r(src, tmp2));
2957 
2958                     // sub tmp1, tmp2
2959                     ctx.emit(Inst::alu_rmi_r(
2960                         OperandSize::Size32,
2961                         AluRmiROpcode::Sub,
2962                         RegMemImm::reg(tmp1.to_reg()),
2963                         tmp2,
2964                     ));
2965 
2966                     // shr $1, tmp1
2967                     ctx.emit(Inst::shift_r(
2968                         OperandSize::Size32,
2969                         ShiftKind::ShiftRightLogical,
2970                         Some(1),
2971                         tmp1,
2972                     ));
2973 
2974                     // and 0x7777_7777, tmp1
2975                     ctx.emit(Inst::alu_rmi_r(
2976                         OperandSize::Size32,
2977                         AluRmiROpcode::And,
2978                         RegMemImm::imm(0x77777777),
2979                         tmp1,
2980                     ));
2981 
2982                     // sub tmp1, tmp2
2983                     ctx.emit(Inst::alu_rmi_r(
2984                         OperandSize::Size32,
2985                         AluRmiROpcode::Sub,
2986                         RegMemImm::reg(tmp1.to_reg()),
2987                         tmp2,
2988                     ));
2989 
2990                     // shr $1, tmp1
2991                     ctx.emit(Inst::shift_r(
2992                         OperandSize::Size32,
2993                         ShiftKind::ShiftRightLogical,
2994                         Some(1),
2995                         tmp1,
2996                     ));
2997 
2998                     // and $0x7777_7777, tmp1
2999                     ctx.emit(Inst::alu_rmi_r(
3000                         OperandSize::Size32,
3001                         AluRmiROpcode::And,
3002                         RegMemImm::imm(0x77777777),
3003                         tmp1,
3004                     ));
3005 
3006                     // sub tmp1, tmp2
3007                     ctx.emit(Inst::alu_rmi_r(
3008                         OperandSize::Size32,
3009                         AluRmiROpcode::Sub,
3010                         RegMemImm::reg(tmp1.to_reg()),
3011                         tmp2,
3012                     ));
3013 
3014                     // mov tmp2, dst
3015                     ctx.emit(Inst::mov64_rm_r(RegMem::reg(tmp2.to_reg()), dst));
3016 
3017                     // shr $4, dst
3018                     ctx.emit(Inst::shift_r(
3019                         OperandSize::Size32,
3020                         ShiftKind::ShiftRightLogical,
3021                         Some(4),
3022                         dst,
3023                     ));
3024 
3025                     // add tmp2, dst
3026                     ctx.emit(Inst::alu_rmi_r(
3027                         OperandSize::Size32,
3028                         AluRmiROpcode::Add,
3029                         RegMemImm::reg(tmp2.to_reg()),
3030                         dst,
3031                     ));
3032 
3033                     // and $0x0F0F_0F0F, dst
3034                     ctx.emit(Inst::alu_rmi_r(
3035                         OperandSize::Size32,
3036                         AluRmiROpcode::And,
3037                         RegMemImm::imm(0x0F0F0F0F),
3038                         dst,
3039                     ));
3040 
3041                     // mul $0x0101_0101, dst
3042                     ctx.emit(Inst::alu_rmi_r(
3043                         OperandSize::Size32,
3044                         AluRmiROpcode::Mul,
3045                         RegMemImm::imm(0x01010101),
3046                         dst,
3047                     ));
3048 
3049                     // shr $24, dst
3050                     ctx.emit(Inst::shift_r(
3051                         OperandSize::Size32,
3052                         ShiftKind::ShiftRightLogical,
3053                         Some(24),
3054                         dst,
3055                     ));
3056                 }
3057             }
3058 
3059             if dsts.len() == 1 {
3060                 let final_dst = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
3061                 ctx.emit(Inst::gen_move(final_dst, dsts[0], types::I64));
3062             } else {
3063                 assert!(dsts.len() == 2);
3064                 let final_dst = get_output_reg(ctx, outputs[0]);
3065                 ctx.emit(Inst::gen_move(final_dst.regs()[0], dsts[0], types::I64));
3066                 ctx.emit(Inst::alu_rmi_r(
3067                     OperandSize::Size64,
3068                     AluRmiROpcode::Add,
3069                     RegMemImm::reg(dsts[1]),
3070                     final_dst.regs()[0],
3071                 ));
3072                 ctx.emit(Inst::alu_rmi_r(
3073                     OperandSize::Size64,
3074                     AluRmiROpcode::Xor,
3075                     RegMemImm::reg(final_dst.regs()[1].to_reg()),
3076                     final_dst.regs()[1],
3077                 ));
3078             }
3079         }
3080 
3081         Opcode::Bitrev => {
3082             let ty = ctx.input_ty(insn, 0);
3083             assert!(
3084                 ty == types::I8
3085                     || ty == types::I16
3086                     || ty == types::I32
3087                     || ty == types::I64
3088                     || ty == types::I128
3089             );
3090 
3091             if ty == types::I128 {
3092                 let src = put_input_in_regs(ctx, inputs[0]);
3093                 let dst = get_output_reg(ctx, outputs[0]);
3094                 emit_bitrev(ctx, src.regs()[0], dst.regs()[1], types::I64);
3095                 emit_bitrev(ctx, src.regs()[1], dst.regs()[0], types::I64);
3096             } else {
3097                 let src = put_input_in_reg(ctx, inputs[0]);
3098                 let dst = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
3099                 emit_bitrev(ctx, src, dst, ty);
3100             }
3101         }
3102 
3103         Opcode::IsNull | Opcode::IsInvalid => {
3104             // Null references are represented by the constant value 0; invalid references are
3105             // represented by the constant value -1. See `define_reftypes()` in
3106             // `meta/src/isa/x86/encodings.rs` to confirm.
3107             let src = put_input_in_reg(ctx, inputs[0]);
3108             let dst = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
3109             let ty = ctx.input_ty(insn, 0);
3110             let imm = match op {
3111                 Opcode::IsNull => {
3112                     // TODO could use tst src, src for IsNull
3113                     0
3114                 }
3115                 Opcode::IsInvalid => {
3116                     // We can do a 32-bit comparison even in 64-bits mode, as the constant is then
3117                     // sign-extended.
3118                     0xffffffff
3119                 }
3120                 _ => unreachable!(),
3121             };
3122             ctx.emit(Inst::cmp_rmi_r(
3123                 OperandSize::from_ty(ty),
3124                 RegMemImm::imm(imm),
3125                 src,
3126             ));
3127             ctx.emit(Inst::setcc(CC::Z, dst));
3128         }
3129 
3130         Opcode::Uextend
3131         | Opcode::Sextend
3132         | Opcode::Bint
3133         | Opcode::Breduce
3134         | Opcode::Bextend
3135         | Opcode::Ireduce => {
3136             let src_ty = ctx.input_ty(insn, 0);
3137             let dst_ty = ctx.output_ty(insn, 0);
3138 
3139             if src_ty == types::I128 {
3140                 assert!(dst_ty.bits() <= 64);
3141                 assert!(op == Opcode::Ireduce);
3142                 let src = put_input_in_regs(ctx, inputs[0]);
3143                 let dst = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
3144                 ctx.emit(Inst::gen_move(dst, src.regs()[0], types::I64));
3145             } else if dst_ty == types::I128 {
3146                 assert!(src_ty.bits() <= 64);
3147                 let src = put_input_in_reg(ctx, inputs[0]);
3148                 let dst = get_output_reg(ctx, outputs[0]);
3149                 assert!(op == Opcode::Uextend || op == Opcode::Sextend || op == Opcode::Bint);
3150                 // Extend to 64 bits first.
3151 
3152                 let ext_mode = ExtMode::new(src_ty.bits(), /* dst bits = */ 64);
3153                 if let Some(ext_mode) = ext_mode {
3154                     if op == Opcode::Sextend {
3155                         ctx.emit(Inst::movsx_rm_r(ext_mode, RegMem::reg(src), dst.regs()[0]));
3156                     } else {
3157                         ctx.emit(Inst::movzx_rm_r(ext_mode, RegMem::reg(src), dst.regs()[0]));
3158                     }
3159                 } else {
3160                     ctx.emit(Inst::mov64_rm_r(RegMem::reg(src), dst.regs()[0]));
3161                 }
3162 
3163                 // Now generate the top 64 bits.
3164                 if op == Opcode::Sextend {
3165                     // Sign-extend: move dst[0] into dst[1] and arithmetic-shift right by 63 bits
3166                     // to spread the sign bit across all bits.
3167                     ctx.emit(Inst::gen_move(
3168                         dst.regs()[1],
3169                         dst.regs()[0].to_reg(),
3170                         types::I64,
3171                     ));
3172                     ctx.emit(Inst::shift_r(
3173                         OperandSize::Size64,
3174                         ShiftKind::ShiftRightArithmetic,
3175                         Some(63),
3176                         dst.regs()[1],
3177                     ));
3178                 } else {
3179                     // Zero-extend: just zero the top word.
3180                     ctx.emit(Inst::alu_rmi_r(
3181                         OperandSize::Size64,
3182                         AluRmiROpcode::Xor,
3183                         RegMemImm::reg(dst.regs()[1].to_reg()),
3184                         dst.regs()[1],
3185                     ));
3186                 }
3187             } else {
3188                 // Sextend requires a sign-extended move, but all the other opcodes are simply a move
3189                 // from a zero-extended source. Here is why this works, in each case:
3190                 //
3191                 // - Bint: Bool-to-int. We always represent a bool as a 0 or 1, so we merely need to
3192                 // zero-extend here.
3193                 //
3194                 // - Breduce, Bextend: changing width of a boolean. We represent a bool as a 0 or 1, so
3195                 // again, this is a zero-extend / no-op.
3196                 //
3197                 // - Ireduce: changing width of an integer. Smaller ints are stored with undefined
3198                 // high-order bits, so we can simply do a copy.
3199                 if src_ty == types::I32 && dst_ty == types::I64 && op != Opcode::Sextend {
3200                     // As a particular x64 extra-pattern matching opportunity, all the ALU opcodes on
3201                     // 32-bits will zero-extend the upper 32-bits, so we can even not generate a
3202                     // zero-extended move in this case.
3203                     // TODO add loads and shifts here.
3204                     if let Some(_) = matches_input_any(
3205                         ctx,
3206                         inputs[0],
3207                         &[
3208                             Opcode::Iadd,
3209                             Opcode::IaddIfcout,
3210                             Opcode::Isub,
3211                             Opcode::Imul,
3212                             Opcode::Band,
3213                             Opcode::Bor,
3214                             Opcode::Bxor,
3215                         ],
3216                     ) {
3217                         let src = put_input_in_reg(ctx, inputs[0]);
3218                         let dst = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
3219                         ctx.emit(Inst::gen_move(dst, src, types::I64));
3220                         return Ok(());
3221                     }
3222                 }
3223 
3224                 let src = input_to_reg_mem(ctx, inputs[0]);
3225                 let dst = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
3226 
3227                 let ext_mode = ExtMode::new(src_ty.bits(), dst_ty.bits());
3228                 assert_eq!(
3229                     src_ty.bits() < dst_ty.bits(),
3230                     ext_mode.is_some(),
3231                     "unexpected extension: {} -> {}",
3232                     src_ty,
3233                     dst_ty
3234                 );
3235 
3236                 if let Some(ext_mode) = ext_mode {
3237                     if op == Opcode::Sextend {
3238                         ctx.emit(Inst::movsx_rm_r(ext_mode, src, dst));
3239                     } else {
3240                         ctx.emit(Inst::movzx_rm_r(ext_mode, src, dst));
3241                     }
3242                 } else {
3243                     ctx.emit(Inst::mov64_rm_r(src, dst));
3244                 }
3245             }
3246         }
3247 
3248         Opcode::Icmp => {
3249             let condcode = ctx.data(insn).cond_code().unwrap();
3250             let dst = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
3251             let ty = ctx.input_ty(insn, 0);
3252             if !ty.is_vector() {
3253                 let condcode = emit_cmp(ctx, insn, condcode);
3254                 let cc = CC::from_intcc(condcode);
3255                 ctx.emit(Inst::setcc(cc, dst));
3256             } else {
3257                 assert_eq!(ty.bits(), 128);
3258                 let eq = |ty| match ty {
3259                     types::I8X16 => SseOpcode::Pcmpeqb,
3260                     types::I16X8 => SseOpcode::Pcmpeqw,
3261                     types::I32X4 => SseOpcode::Pcmpeqd,
3262                     types::I64X2 => SseOpcode::Pcmpeqq,
3263                     _ => panic!(
3264                         "Unable to find an instruction for {} for type: {}",
3265                         condcode, ty
3266                     ),
3267                 };
3268                 let gt = |ty| match ty {
3269                     types::I8X16 => SseOpcode::Pcmpgtb,
3270                     types::I16X8 => SseOpcode::Pcmpgtw,
3271                     types::I32X4 => SseOpcode::Pcmpgtd,
3272                     types::I64X2 => SseOpcode::Pcmpgtq,
3273                     _ => panic!(
3274                         "Unable to find an instruction for {} for type: {}",
3275                         condcode, ty
3276                     ),
3277                 };
3278                 let maxu = |ty| match ty {
3279                     types::I8X16 => SseOpcode::Pmaxub,
3280                     types::I16X8 => SseOpcode::Pmaxuw,
3281                     types::I32X4 => SseOpcode::Pmaxud,
3282                     _ => panic!(
3283                         "Unable to find an instruction for {} for type: {}",
3284                         condcode, ty
3285                     ),
3286                 };
3287                 let mins = |ty| match ty {
3288                     types::I8X16 => SseOpcode::Pminsb,
3289                     types::I16X8 => SseOpcode::Pminsw,
3290                     types::I32X4 => SseOpcode::Pminsd,
3291                     _ => panic!(
3292                         "Unable to find an instruction for {} for type: {}",
3293                         condcode, ty
3294                     ),
3295                 };
3296                 let minu = |ty| match ty {
3297                     types::I8X16 => SseOpcode::Pminub,
3298                     types::I16X8 => SseOpcode::Pminuw,
3299                     types::I32X4 => SseOpcode::Pminud,
3300                     _ => panic!(
3301                         "Unable to find an instruction for {} for type: {}",
3302                         condcode, ty
3303                     ),
3304                 };
3305 
3306                 // Here we decide which operand to use as the read/write `dst` (ModRM reg field) and
3307                 // which to use as the read `input` (ModRM r/m field). In the normal case we use
3308                 // Cranelift's first operand, the `lhs`, as `dst` but we flip the operands for the
3309                 // less-than cases so that we can reuse the greater-than implementation.
3310                 //
3311                 // In a surprising twist, the operands for i64x2 `gte`/`sle` must also be flipped
3312                 // from the normal order because of the special-case lowering for these instructions
3313                 // (i.e. we use PCMPGTQ with flipped operands and negate the result).
3314                 let input = match condcode {
3315                     IntCC::SignedLessThanOrEqual if ty == types::I64X2 => {
3316                         let lhs = put_input_in_reg(ctx, inputs[0]);
3317                         let rhs = input_to_reg_mem(ctx, inputs[1]);
3318                         ctx.emit(Inst::gen_move(dst, lhs, ty));
3319                         rhs
3320                     }
3321                     IntCC::SignedGreaterThanOrEqual if ty == types::I64X2 => {
3322                         let lhs = input_to_reg_mem(ctx, inputs[0]);
3323                         let rhs = put_input_in_reg(ctx, inputs[1]);
3324                         ctx.emit(Inst::gen_move(dst, rhs, ty));
3325                         lhs
3326                     }
3327                     IntCC::SignedLessThan
3328                     | IntCC::SignedLessThanOrEqual
3329                     | IntCC::UnsignedLessThan
3330                     | IntCC::UnsignedLessThanOrEqual => {
3331                         let lhs = input_to_reg_mem(ctx, inputs[0]);
3332                         let rhs = put_input_in_reg(ctx, inputs[1]);
3333                         ctx.emit(Inst::gen_move(dst, rhs, ty));
3334                         lhs
3335                     }
3336                     _ => {
3337                         let lhs = put_input_in_reg(ctx, inputs[0]);
3338                         let rhs = input_to_reg_mem(ctx, inputs[1]);
3339                         ctx.emit(Inst::gen_move(dst, lhs, ty));
3340                         rhs
3341                     }
3342                 };
3343 
3344                 match condcode {
3345                     IntCC::Equal => ctx.emit(Inst::xmm_rm_r(eq(ty), input, dst)),
3346                     IntCC::NotEqual => {
3347                         ctx.emit(Inst::xmm_rm_r(eq(ty), input, dst));
3348                         // Emit all 1s into the `tmp` register.
3349                         let tmp = ctx.alloc_tmp(ty).only_reg().unwrap();
3350                         ctx.emit(Inst::xmm_rm_r(eq(ty), RegMem::from(tmp), tmp));
3351                         // Invert the result of the `PCMPEQ*`.
3352                         ctx.emit(Inst::xmm_rm_r(SseOpcode::Pxor, RegMem::from(tmp), dst));
3353                     }
3354                     IntCC::SignedGreaterThan | IntCC::SignedLessThan => {
3355                         ctx.emit(Inst::xmm_rm_r(gt(ty), input, dst))
3356                     }
3357                     IntCC::SignedGreaterThanOrEqual | IntCC::SignedLessThanOrEqual
3358                         if ty != types::I64X2 =>
3359                     {
3360                         ctx.emit(Inst::xmm_rm_r(mins(ty), input.clone(), dst));
3361                         ctx.emit(Inst::xmm_rm_r(eq(ty), input, dst))
3362                     }
3363                     IntCC::SignedGreaterThanOrEqual | IntCC::SignedLessThanOrEqual
3364                         if ty == types::I64X2 =>
3365                     {
3366                         // The PMINS* instruction is only available in AVX512VL/F so we must instead
3367                         // compare with flipped operands and negate the result (emitting one more
3368                         // instruction).
3369                         ctx.emit(Inst::xmm_rm_r(gt(ty), input, dst));
3370                         // Emit all 1s into the `tmp` register.
3371                         let tmp = ctx.alloc_tmp(ty).only_reg().unwrap();
3372                         ctx.emit(Inst::xmm_rm_r(eq(ty), RegMem::from(tmp), tmp));
3373                         // Invert the result of the `PCMPGT*`.
3374                         ctx.emit(Inst::xmm_rm_r(SseOpcode::Pxor, RegMem::from(tmp), dst));
3375                     }
3376                     IntCC::UnsignedGreaterThan | IntCC::UnsignedLessThan => {
3377                         ctx.emit(Inst::xmm_rm_r(maxu(ty), input.clone(), dst));
3378                         ctx.emit(Inst::xmm_rm_r(eq(ty), input, dst));
3379                         // Emit all 1s into the `tmp` register.
3380                         let tmp = ctx.alloc_tmp(ty).only_reg().unwrap();
3381                         ctx.emit(Inst::xmm_rm_r(eq(ty), RegMem::from(tmp), tmp));
3382                         // Invert the result of the `PCMPEQ*`.
3383                         ctx.emit(Inst::xmm_rm_r(SseOpcode::Pxor, RegMem::from(tmp), dst));
3384                     }
3385                     IntCC::UnsignedGreaterThanOrEqual | IntCC::UnsignedLessThanOrEqual => {
3386                         ctx.emit(Inst::xmm_rm_r(minu(ty), input.clone(), dst));
3387                         ctx.emit(Inst::xmm_rm_r(eq(ty), input, dst))
3388                     }
3389                     _ => unimplemented!("Unimplemented comparison code for icmp: {}", condcode),
3390                 }
3391             }
3392         }
3393 
3394         Opcode::Fcmp => {
3395             let cond_code = ctx.data(insn).fp_cond_code().unwrap();
3396             let input_ty = ctx.input_ty(insn, 0);
3397             if !input_ty.is_vector() {
3398                 // Unordered is returned by setting ZF, PF, CF <- 111
3399                 // Greater than by ZF, PF, CF <- 000
3400                 // Less than by ZF, PF, CF <- 001
3401                 // Equal by ZF, PF, CF <- 100
3402                 //
3403                 // Checking the result of comiss is somewhat annoying because you don't have setcc
3404                 // instructions that explicitly check simultaneously for the condition (i.e. eq, le,
3405                 // gt, etc) *and* orderedness.
3406                 //
3407                 // So that might mean we need more than one setcc check and then a logical "and" or
3408                 // "or" to determine both, in some cases.  However knowing that if the parity bit is
3409                 // set, then the result was considered unordered and knowing that if the parity bit is
3410                 // set, then both the ZF and CF flag bits must also be set we can get away with using
3411                 // one setcc for most condition codes.
3412 
3413                 let dst = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
3414 
3415                 match emit_fcmp(ctx, insn, cond_code, FcmpSpec::Normal) {
3416                     FcmpCondResult::Condition(cc) => {
3417                         ctx.emit(Inst::setcc(cc, dst));
3418                     }
3419                     FcmpCondResult::AndConditions(cc1, cc2) => {
3420                         let tmp = ctx.alloc_tmp(types::I32).only_reg().unwrap();
3421                         ctx.emit(Inst::setcc(cc1, tmp));
3422                         ctx.emit(Inst::setcc(cc2, dst));
3423                         ctx.emit(Inst::alu_rmi_r(
3424                             OperandSize::Size32,
3425                             AluRmiROpcode::And,
3426                             RegMemImm::reg(tmp.to_reg()),
3427                             dst,
3428                         ));
3429                     }
3430                     FcmpCondResult::OrConditions(cc1, cc2) => {
3431                         let tmp = ctx.alloc_tmp(types::I32).only_reg().unwrap();
3432                         ctx.emit(Inst::setcc(cc1, tmp));
3433                         ctx.emit(Inst::setcc(cc2, dst));
3434                         ctx.emit(Inst::alu_rmi_r(
3435                             OperandSize::Size32,
3436                             AluRmiROpcode::Or,
3437                             RegMemImm::reg(tmp.to_reg()),
3438                             dst,
3439                         ));
3440                     }
3441                     FcmpCondResult::InvertedEqualOrConditions(_, _) => unreachable!(),
3442                 }
3443             } else {
3444                 let op = match input_ty {
3445                     types::F32X4 => SseOpcode::Cmpps,
3446                     types::F64X2 => SseOpcode::Cmppd,
3447                     _ => panic!("Bad input type to fcmp: {}", input_ty),
3448                 };
3449 
3450                 // Since some packed comparisons are not available, some of the condition codes
3451                 // must be inverted, with a corresponding `flip` of the operands.
3452                 let (imm, flip) = match cond_code {
3453                     FloatCC::GreaterThan => (FcmpImm::LessThan, true),
3454                     FloatCC::GreaterThanOrEqual => (FcmpImm::LessThanOrEqual, true),
3455                     FloatCC::UnorderedOrLessThan => (FcmpImm::UnorderedOrGreaterThan, true),
3456                     FloatCC::UnorderedOrLessThanOrEqual => {
3457                         (FcmpImm::UnorderedOrGreaterThanOrEqual, true)
3458                     }
3459                     FloatCC::OrderedNotEqual | FloatCC::UnorderedOrEqual => {
3460                         panic!("unsupported float condition code: {}", cond_code)
3461                     }
3462                     _ => (FcmpImm::from(cond_code), false),
3463                 };
3464 
3465                 // Determine the operands of the comparison, possibly by flipping them.
3466                 let (lhs, rhs) = if flip {
3467                     (
3468                         put_input_in_reg(ctx, inputs[1]),
3469                         input_to_reg_mem(ctx, inputs[0]),
3470                     )
3471                 } else {
3472                     (
3473                         put_input_in_reg(ctx, inputs[0]),
3474                         input_to_reg_mem(ctx, inputs[1]),
3475                     )
3476                 };
3477 
3478                 // Move the `lhs` to the same register as `dst`; this may not emit an actual move
3479                 // but ensures that the registers are the same to match x86's read-write operand
3480                 // encoding.
3481                 let dst = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
3482                 ctx.emit(Inst::gen_move(dst, lhs, input_ty));
3483 
3484                 // Emit the comparison.
3485                 ctx.emit(Inst::xmm_rm_r_imm(
3486                     op,
3487                     rhs,
3488                     dst,
3489                     imm.encode(),
3490                     OperandSize::Size32,
3491                 ));
3492             }
3493         }
3494 
3495         Opcode::FallthroughReturn | Opcode::Return => {
3496             for i in 0..ctx.num_inputs(insn) {
3497                 let src_reg = put_input_in_regs(ctx, inputs[i]);
3498                 let retval_reg = ctx.retval(i);
3499                 let ty = ctx.input_ty(insn, i);
3500                 assert!(src_reg.len() == retval_reg.len());
3501                 let (_, tys) = Inst::rc_for_type(ty)?;
3502                 for ((&src, &dst), &ty) in src_reg
3503                     .regs()
3504                     .iter()
3505                     .zip(retval_reg.regs().iter())
3506                     .zip(tys.iter())
3507                 {
3508                     ctx.emit(Inst::gen_move(dst, src, ty));
3509                 }
3510             }
3511             // N.B.: the Ret itself is generated by the ABI.
3512         }
3513 
3514         Opcode::Call | Opcode::CallIndirect => {
3515             let caller_conv = ctx.abi().call_conv();
3516             let (mut abi, inputs) = match op {
3517                 Opcode::Call => {
3518                     let (extname, dist) = ctx.call_target(insn).unwrap();
3519                     let sig = ctx.call_sig(insn).unwrap();
3520                     assert_eq!(inputs.len(), sig.params.len());
3521                     assert_eq!(outputs.len(), sig.returns.len());
3522                     (
3523                         X64ABICaller::from_func(sig, &extname, dist, caller_conv, flags)?,
3524                         &inputs[..],
3525                     )
3526                 }
3527 
3528                 Opcode::CallIndirect => {
3529                     let ptr = put_input_in_reg(ctx, inputs[0]);
3530                     let sig = ctx.call_sig(insn).unwrap();
3531                     assert_eq!(inputs.len() - 1, sig.params.len());
3532                     assert_eq!(outputs.len(), sig.returns.len());
3533                     (
3534                         X64ABICaller::from_ptr(sig, ptr, op, caller_conv, flags)?,
3535                         &inputs[1..],
3536                     )
3537                 }
3538 
3539                 _ => unreachable!(),
3540             };
3541 
3542             abi.emit_stack_pre_adjust(ctx);
3543             assert_eq!(inputs.len(), abi.num_args());
3544             for i in abi.get_copy_to_arg_order() {
3545                 let input = inputs[i];
3546                 let arg_regs = put_input_in_regs(ctx, input);
3547                 abi.emit_copy_regs_to_arg(ctx, i, arg_regs);
3548             }
3549             abi.emit_call(ctx);
3550             for (i, output) in outputs.iter().enumerate() {
3551                 let retval_regs = get_output_reg(ctx, *output);
3552                 abi.emit_copy_retval_to_regs(ctx, i, retval_regs);
3553             }
3554             abi.emit_stack_post_adjust(ctx);
3555         }
3556 
3557         Opcode::Debugtrap => {
3558             ctx.emit(Inst::Hlt);
3559         }
3560 
3561         Opcode::Trap | Opcode::ResumableTrap => {
3562             let trap_code = ctx.data(insn).trap_code().unwrap();
3563             ctx.emit_safepoint(Inst::Ud2 { trap_code });
3564         }
3565 
3566         Opcode::Trapif | Opcode::Trapff => {
3567             let trap_code = ctx.data(insn).trap_code().unwrap();
3568 
3569             if matches_input(ctx, inputs[0], Opcode::IaddIfcout).is_some() {
3570                 let cond_code = ctx.data(insn).cond_code().unwrap();
3571                 // The flags must not have been clobbered by any other instruction between the
3572                 // iadd_ifcout and this instruction, as verified by the CLIF validator; so we can
3573                 // simply use the flags here.
3574                 let cc = CC::from_intcc(cond_code);
3575 
3576                 ctx.emit_safepoint(Inst::TrapIf { trap_code, cc });
3577             } else if op == Opcode::Trapif {
3578                 let cond_code = ctx.data(insn).cond_code().unwrap();
3579 
3580                 // Verification ensures that the input is always a single-def ifcmp.
3581                 let ifcmp = matches_input(ctx, inputs[0], Opcode::Ifcmp).unwrap();
3582                 let cond_code = emit_cmp(ctx, ifcmp, cond_code);
3583                 let cc = CC::from_intcc(cond_code);
3584 
3585                 ctx.emit_safepoint(Inst::TrapIf { trap_code, cc });
3586             } else {
3587                 let cond_code = ctx.data(insn).fp_cond_code().unwrap();
3588 
3589                 // Verification ensures that the input is always a single-def ffcmp.
3590                 let ffcmp = matches_input(ctx, inputs[0], Opcode::Ffcmp).unwrap();
3591 
3592                 match emit_fcmp(ctx, ffcmp, cond_code, FcmpSpec::Normal) {
3593                     FcmpCondResult::Condition(cc) => {
3594                         ctx.emit_safepoint(Inst::TrapIf { trap_code, cc })
3595                     }
3596                     FcmpCondResult::AndConditions(cc1, cc2) => {
3597                         // A bit unfortunate, but materialize the flags in their own register, and
3598                         // check against this.
3599                         let tmp = ctx.alloc_tmp(types::I32).only_reg().unwrap();
3600                         let tmp2 = ctx.alloc_tmp(types::I32).only_reg().unwrap();
3601                         ctx.emit(Inst::setcc(cc1, tmp));
3602                         ctx.emit(Inst::setcc(cc2, tmp2));
3603                         ctx.emit(Inst::alu_rmi_r(
3604                             OperandSize::Size32,
3605                             AluRmiROpcode::And,
3606                             RegMemImm::reg(tmp.to_reg()),
3607                             tmp2,
3608                         ));
3609                         ctx.emit_safepoint(Inst::TrapIf {
3610                             trap_code,
3611                             cc: CC::NZ,
3612                         });
3613                     }
3614                     FcmpCondResult::OrConditions(cc1, cc2) => {
3615                         ctx.emit_safepoint(Inst::TrapIf { trap_code, cc: cc1 });
3616                         ctx.emit_safepoint(Inst::TrapIf { trap_code, cc: cc2 });
3617                     }
3618                     FcmpCondResult::InvertedEqualOrConditions(_, _) => unreachable!(),
3619                 };
3620             };
3621         }
3622 
3623         Opcode::F64const => {
3624             // TODO use cmpeqpd for all 1s.
3625             let value = ctx.get_constant(insn).unwrap();
3626             let dst = get_output_reg(ctx, outputs[0]);
3627             for inst in Inst::gen_constant(dst, value as u128, types::F64, |ty| {
3628                 ctx.alloc_tmp(ty).only_reg().unwrap()
3629             }) {
3630                 ctx.emit(inst);
3631             }
3632         }
3633 
3634         Opcode::F32const => {
3635             // TODO use cmpeqps for all 1s.
3636             let value = ctx.get_constant(insn).unwrap();
3637             let dst = get_output_reg(ctx, outputs[0]);
3638             for inst in Inst::gen_constant(dst, value as u128, types::F32, |ty| {
3639                 ctx.alloc_tmp(ty).only_reg().unwrap()
3640             }) {
3641                 ctx.emit(inst);
3642             }
3643         }
3644 
3645         Opcode::WideningPairwiseDotProductS => {
3646             let lhs = put_input_in_reg(ctx, inputs[0]);
3647             let rhs = input_to_reg_mem(ctx, inputs[1]);
3648             let dst = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
3649             let ty = ty.unwrap();
3650 
3651             ctx.emit(Inst::gen_move(dst, lhs, ty));
3652 
3653             if ty == types::I32X4 {
3654                 ctx.emit(Inst::xmm_rm_r(SseOpcode::Pmaddwd, rhs, dst));
3655             } else {
3656                 panic!(
3657                     "Opcode::WideningPairwiseDotProductS: unsupported laneage: {:?}",
3658                     ty
3659                 );
3660             }
3661         }
3662 
3663         Opcode::Fadd | Opcode::Fsub | Opcode::Fmul | Opcode::Fdiv => {
3664             let lhs = put_input_in_reg(ctx, inputs[0]);
3665             // We can't guarantee the RHS (if a load) is 128-bit aligned, so we
3666             // must avoid merging a load here.
3667             let rhs = RegMem::reg(put_input_in_reg(ctx, inputs[1]));
3668             let dst = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
3669             let ty = ty.unwrap();
3670 
3671             // Move the `lhs` to the same register as `dst`; this may not emit an actual move
3672             // but ensures that the registers are the same to match x86's read-write operand
3673             // encoding.
3674             ctx.emit(Inst::gen_move(dst, lhs, ty));
3675 
3676             // Note: min and max can't be handled here, because of the way Cranelift defines them:
3677             // if any operand is a NaN, they must return the NaN operand, while the x86 machine
3678             // instruction will return the second operand if either operand is a NaN.
3679             let sse_op = match ty {
3680                 types::F32 => match op {
3681                     Opcode::Fadd => SseOpcode::Addss,
3682                     Opcode::Fsub => SseOpcode::Subss,
3683                     Opcode::Fmul => SseOpcode::Mulss,
3684                     Opcode::Fdiv => SseOpcode::Divss,
3685                     _ => unreachable!(),
3686                 },
3687                 types::F64 => match op {
3688                     Opcode::Fadd => SseOpcode::Addsd,
3689                     Opcode::Fsub => SseOpcode::Subsd,
3690                     Opcode::Fmul => SseOpcode::Mulsd,
3691                     Opcode::Fdiv => SseOpcode::Divsd,
3692                     _ => unreachable!(),
3693                 },
3694                 types::F32X4 => match op {
3695                     Opcode::Fadd => SseOpcode::Addps,
3696                     Opcode::Fsub => SseOpcode::Subps,
3697                     Opcode::Fmul => SseOpcode::Mulps,
3698                     Opcode::Fdiv => SseOpcode::Divps,
3699                     _ => unreachable!(),
3700                 },
3701                 types::F64X2 => match op {
3702                     Opcode::Fadd => SseOpcode::Addpd,
3703                     Opcode::Fsub => SseOpcode::Subpd,
3704                     Opcode::Fmul => SseOpcode::Mulpd,
3705                     Opcode::Fdiv => SseOpcode::Divpd,
3706                     _ => unreachable!(),
3707                 },
3708                 _ => panic!(
3709                     "invalid type: expected one of [F32, F64, F32X4, F64X2], found {}",
3710                     ty
3711                 ),
3712             };
3713             ctx.emit(Inst::xmm_rm_r(sse_op, rhs, dst));
3714         }
3715 
3716         Opcode::Fmin | Opcode::Fmax => {
3717             let lhs = put_input_in_reg(ctx, inputs[0]);
3718             let rhs = put_input_in_reg(ctx, inputs[1]);
3719             let dst = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
3720             let is_min = op == Opcode::Fmin;
3721             let output_ty = ty.unwrap();
3722             ctx.emit(Inst::gen_move(dst, rhs, output_ty));
3723             if !output_ty.is_vector() {
3724                 let op_size = match output_ty {
3725                     types::F32 => OperandSize::Size32,
3726                     types::F64 => OperandSize::Size64,
3727                     _ => panic!("unexpected type {:?} for fmin/fmax", output_ty),
3728                 };
3729                 ctx.emit(Inst::xmm_min_max_seq(op_size, is_min, lhs, dst));
3730             } else {
3731                 // X64's implementation of floating point min and floating point max does not
3732                 // propagate NaNs and +0's in a way that is friendly to the SIMD spec. For the
3733                 // scalar approach we use jumps to handle cases where NaN and +0 propagation is
3734                 // not consistent with what is needed. However for packed floating point min and
3735                 // floating point max we implement a different approach to avoid the sequence
3736                 // of jumps that would be required on a per lane basis. Because we do not need to
3737                 // lower labels and jumps but do need ctx for creating temporaries we implement
3738                 // the lowering here in lower.rs instead of emit.rs as is done in the case for scalars.
3739                 // The outline of approach is as follows:
3740                 //
3741                 // First we preform the Min/Max in both directions. This is because in the
3742                 // case of an operand's lane containing a NaN or in the case of the lanes of the
3743                 // two operands containing 0 but with mismatched signs, x64 will return the second
3744                 // operand regardless of its contents. So in order to make sure we capture NaNs and
3745                 // normalize NaNs and 0 values we capture the operation in both directions and merge the
3746                 // results. Then we normalize the results through operations that create a mask for the
3747                 // lanes containing NaNs, we use that mask to adjust NaNs to quite NaNs and normalize
3748                 // 0s.
3749                 //
3750                 // The following sequence is generated for min:
3751                 //
3752                 // movap{s,d} %lhs, %tmp
3753                 // minp{s,d} %dst, %tmp
3754                 // minp,{s,d} %lhs, %dst
3755                 // orp{s,d} %dst, %tmp
3756                 // cmpp{s,d} %tmp, %dst, $3
3757                 // orps{s,d} %dst, %tmp
3758                 // psrl{s,d} {$10, $13}, %dst
3759                 // andnp{s,d} %tmp, %dst
3760                 //
3761                 // and for max the sequence is:
3762                 //
3763                 // movap{s,d} %lhs, %tmp
3764                 // minp{s,d} %dst, %tmp
3765                 // minp,{s,d} %lhs, %dst
3766                 // xorp{s,d} %tmp, %dst
3767                 // orp{s,d} %dst, %tmp
3768                 // subp{s,d} %dst, %tmp
3769                 // cmpp{s,d} %tmp, %dst, $3
3770                 // psrl{s,d} {$10, $13}, %dst
3771                 // andnp{s,d} %tmp, %dst
3772 
3773                 if is_min {
3774                     let (mov_op, min_op, or_op, cmp_op, shift_op, shift_by, andn_op) =
3775                         match output_ty {
3776                             types::F32X4 => (
3777                                 SseOpcode::Movaps,
3778                                 SseOpcode::Minps,
3779                                 SseOpcode::Orps,
3780                                 SseOpcode::Cmpps,
3781                                 SseOpcode::Psrld,
3782                                 10,
3783                                 SseOpcode::Andnps,
3784                             ),
3785                             types::F64X2 => (
3786                                 SseOpcode::Movapd,
3787                                 SseOpcode::Minpd,
3788                                 SseOpcode::Orpd,
3789                                 SseOpcode::Cmppd,
3790                                 SseOpcode::Psrlq,
3791                                 13,
3792                                 SseOpcode::Andnpd,
3793                             ),
3794                             _ => unimplemented!("unsupported op type {:?}", output_ty),
3795                         };
3796 
3797                     // Copy lhs into tmp
3798                     let tmp_xmm1 = ctx.alloc_tmp(output_ty).only_reg().unwrap();
3799                     ctx.emit(Inst::xmm_mov(mov_op, RegMem::reg(lhs), tmp_xmm1));
3800 
3801                     // Perform min in reverse direction
3802                     ctx.emit(Inst::xmm_rm_r(min_op, RegMem::from(dst), tmp_xmm1));
3803 
3804                     // Perform min in original direction
3805                     ctx.emit(Inst::xmm_rm_r(min_op, RegMem::reg(lhs), dst));
3806 
3807                     // X64 handles propagation of -0's and Nans differently between left and right
3808                     // operands. After doing the min in both directions, this OR will
3809                     // guarrentee capture of -0's and Nan in our tmp register
3810                     ctx.emit(Inst::xmm_rm_r(or_op, RegMem::from(dst), tmp_xmm1));
3811 
3812                     // Compare unordered to create mask for lanes containing NaNs and then use
3813                     // that mask to saturate the NaN containing lanes in the tmp register with 1s.
3814                     // TODO: Would a check for NaN and then a jump be better here in the
3815                     // common case than continuing on to normalize NaNs that might not exist?
3816                     let cond = FcmpImm::from(FloatCC::Unordered);
3817                     ctx.emit(Inst::xmm_rm_r_imm(
3818                         cmp_op,
3819                         RegMem::reg(tmp_xmm1.to_reg()),
3820                         dst,
3821                         cond.encode(),
3822                         OperandSize::Size32,
3823                     ));
3824                     ctx.emit(Inst::xmm_rm_r(or_op, RegMem::reg(dst.to_reg()), tmp_xmm1));
3825 
3826                     // The dst register holds a mask for lanes containing NaNs.
3827                     // We take that mask and shift in preparation for creating a different mask
3828                     // to normalize NaNs (create a quite NaN) by zeroing out the appropriate
3829                     // number of least signficant bits. We shift right each lane by 10 bits
3830                     // (1 sign + 8 exp. + 1 MSB sig.) for F32X4 and by 13 bits (1 sign +
3831                     // 11 exp. + 1 MSB sig.) for F64X2.
3832                     ctx.emit(Inst::xmm_rmi_reg(shift_op, RegMemImm::imm(shift_by), dst));
3833 
3834                     // Finally we do a nand with the tmp register to produce the final results
3835                     // in the dst.
3836                     ctx.emit(Inst::xmm_rm_r(andn_op, RegMem::reg(tmp_xmm1.to_reg()), dst));
3837                 } else {
3838                     let (
3839                         mov_op,
3840                         max_op,
3841                         xor_op,
3842                         or_op,
3843                         sub_op,
3844                         cmp_op,
3845                         shift_op,
3846                         shift_by,
3847                         andn_op,
3848                     ) = match output_ty {
3849                         types::F32X4 => (
3850                             SseOpcode::Movaps,
3851                             SseOpcode::Maxps,
3852                             SseOpcode::Xorps,
3853                             SseOpcode::Orps,
3854                             SseOpcode::Subps,
3855                             SseOpcode::Cmpps,
3856                             SseOpcode::Psrld,
3857                             10,
3858                             SseOpcode::Andnps,
3859                         ),
3860                         types::F64X2 => (
3861                             SseOpcode::Movapd,
3862                             SseOpcode::Maxpd,
3863                             SseOpcode::Xorpd,
3864                             SseOpcode::Orpd,
3865                             SseOpcode::Subpd,
3866                             SseOpcode::Cmppd,
3867                             SseOpcode::Psrlq,
3868                             13,
3869                             SseOpcode::Andnpd,
3870                         ),
3871                         _ => unimplemented!("unsupported op type {:?}", output_ty),
3872                     };
3873 
3874                     // Copy lhs into tmp.
3875                     let tmp_xmm1 = ctx.alloc_tmp(types::F32).only_reg().unwrap();
3876                     ctx.emit(Inst::xmm_mov(mov_op, RegMem::reg(lhs), tmp_xmm1));
3877 
3878                     // Perform max in reverse direction.
3879                     ctx.emit(Inst::xmm_rm_r(max_op, RegMem::reg(dst.to_reg()), tmp_xmm1));
3880 
3881                     // Perform max in original direction.
3882                     ctx.emit(Inst::xmm_rm_r(max_op, RegMem::reg(lhs), dst));
3883 
3884                     // Get the difference between the two results and store in tmp.
3885                     // Max uses a different approach than min to account for potential
3886                     // discrepancies with plus/minus 0.
3887                     ctx.emit(Inst::xmm_rm_r(xor_op, RegMem::reg(tmp_xmm1.to_reg()), dst));
3888 
3889                     // X64 handles propagation of -0's and Nans differently between left and right
3890                     // operands. After doing the max in both directions, this OR will
3891                     // guarentee capture of 0's and Nan in our tmp register.
3892                     ctx.emit(Inst::xmm_rm_r(or_op, RegMem::reg(dst.to_reg()), tmp_xmm1));
3893 
3894                     // Capture NaNs and sign discrepancies.
3895                     ctx.emit(Inst::xmm_rm_r(sub_op, RegMem::reg(dst.to_reg()), tmp_xmm1));
3896 
3897                     // Compare unordered to create mask for lanes containing NaNs and then use
3898                     // that mask to saturate the NaN containing lanes in the tmp register with 1s.
3899                     let cond = FcmpImm::from(FloatCC::Unordered);
3900                     ctx.emit(Inst::xmm_rm_r_imm(
3901                         cmp_op,
3902                         RegMem::reg(tmp_xmm1.to_reg()),
3903                         dst,
3904                         cond.encode(),
3905                         OperandSize::Size32,
3906                     ));
3907 
3908                     // The dst register holds a mask for lanes containing NaNs.
3909                     // We take that mask and shift in preparation for creating a different mask
3910                     // to normalize NaNs (create a quite NaN) by zeroing out the appropriate
3911                     // number of least signficant bits. We shift right each lane by 10 bits
3912                     // (1 sign + 8 exp. + 1 MSB sig.) for F32X4 and by 13 bits (1 sign +
3913                     // 11 exp. + 1 MSB sig.) for F64X2.
3914                     ctx.emit(Inst::xmm_rmi_reg(shift_op, RegMemImm::imm(shift_by), dst));
3915 
3916                     // Finally we do a nand with the tmp register to produce the final results
3917                     // in the dst.
3918                     ctx.emit(Inst::xmm_rm_r(andn_op, RegMem::reg(tmp_xmm1.to_reg()), dst));
3919                 }
3920             }
3921         }
3922 
3923         Opcode::FminPseudo | Opcode::FmaxPseudo => {
3924             // We can't guarantee the RHS (if a load) is 128-bit aligned, so we
3925             // must avoid merging a load here.
3926             let lhs = RegMem::reg(put_input_in_reg(ctx, inputs[0]));
3927             let rhs = put_input_in_reg(ctx, inputs[1]);
3928             let dst = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
3929             let ty = ty.unwrap();
3930             ctx.emit(Inst::gen_move(dst, rhs, ty));
3931             let sse_opcode = match (ty, op) {
3932                 (types::F32X4, Opcode::FminPseudo) => SseOpcode::Minps,
3933                 (types::F32X4, Opcode::FmaxPseudo) => SseOpcode::Maxps,
3934                 (types::F64X2, Opcode::FminPseudo) => SseOpcode::Minpd,
3935                 (types::F64X2, Opcode::FmaxPseudo) => SseOpcode::Maxpd,
3936                 _ => unimplemented!("unsupported type {} for {}", ty, op),
3937             };
3938             ctx.emit(Inst::xmm_rm_r(sse_opcode, lhs, dst));
3939         }
3940 
3941         Opcode::Sqrt => {
3942             // We can't guarantee the RHS (if a load) is 128-bit aligned, so we
3943             // must avoid merging a load here.
3944             let src = RegMem::reg(put_input_in_reg(ctx, inputs[0]));
3945             let dst = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
3946             let ty = ty.unwrap();
3947 
3948             let sse_op = match ty {
3949                 types::F32 => SseOpcode::Sqrtss,
3950                 types::F64 => SseOpcode::Sqrtsd,
3951                 types::F32X4 => SseOpcode::Sqrtps,
3952                 types::F64X2 => SseOpcode::Sqrtpd,
3953                 _ => panic!(
3954                     "invalid type: expected one of [F32, F64, F32X4, F64X2], found {}",
3955                     ty
3956                 ),
3957             };
3958 
3959             ctx.emit(Inst::xmm_unary_rm_r(sse_op, src, dst));
3960         }
3961 
3962         Opcode::Fpromote => {
3963             // We can't guarantee the RHS (if a load) is 128-bit aligned, so we
3964             // must avoid merging a load here.
3965             let src = RegMem::reg(put_input_in_reg(ctx, inputs[0]));
3966             let dst = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
3967             ctx.emit(Inst::xmm_unary_rm_r(SseOpcode::Cvtss2sd, src, dst));
3968         }
3969 
3970         Opcode::Fdemote => {
3971             // We can't guarantee the RHS (if a load) is 128-bit aligned, so we
3972             // must avoid merging a load here.
3973             let src = RegMem::reg(put_input_in_reg(ctx, inputs[0]));
3974             let dst = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
3975             ctx.emit(Inst::xmm_unary_rm_r(SseOpcode::Cvtsd2ss, src, dst));
3976         }
3977 
3978         Opcode::FcvtFromSint => {
3979             let output_ty = ty.unwrap();
3980             if !output_ty.is_vector() {
3981                 let (ext_spec, src_size) = match ctx.input_ty(insn, 0) {
3982                     types::I8 | types::I16 => (Some(ExtSpec::SignExtendTo32), OperandSize::Size32),
3983                     types::I32 => (None, OperandSize::Size32),
3984                     types::I64 => (None, OperandSize::Size64),
3985                     _ => unreachable!(),
3986                 };
3987 
3988                 let src = match ext_spec {
3989                     Some(ext_spec) => RegMem::reg(extend_input_to_reg(ctx, inputs[0], ext_spec)),
3990                     None => RegMem::reg(put_input_in_reg(ctx, inputs[0])),
3991                 };
3992 
3993                 let opcode = if output_ty == types::F32 {
3994                     SseOpcode::Cvtsi2ss
3995                 } else {
3996                     assert_eq!(output_ty, types::F64);
3997                     SseOpcode::Cvtsi2sd
3998                 };
3999                 let dst = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
4000                 ctx.emit(Inst::gpr_to_xmm(opcode, src, src_size, dst));
4001             } else {
4002                 let ty = ty.unwrap();
4003                 let src = put_input_in_reg(ctx, inputs[0]);
4004                 let dst = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
4005                 let opcode = match ctx.input_ty(insn, 0) {
4006                     types::I32X4 => SseOpcode::Cvtdq2ps,
4007                     _ => {
4008                         unimplemented!("unable to use type {} for op {}", ctx.input_ty(insn, 0), op)
4009                     }
4010                 };
4011                 ctx.emit(Inst::gen_move(dst, src, ty));
4012                 ctx.emit(Inst::xmm_rm_r(opcode, RegMem::from(dst), dst));
4013             }
4014         }
4015         Opcode::FcvtLowFromSint => {
4016             let src = RegMem::reg(put_input_in_reg(ctx, inputs[0]));
4017             let dst = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
4018             ctx.emit(Inst::xmm_unary_rm_r(
4019                 SseOpcode::Cvtdq2pd,
4020                 RegMem::from(src),
4021                 dst,
4022             ));
4023         }
4024         Opcode::FcvtFromUint => {
4025             let dst = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
4026             let ty = ty.unwrap();
4027 
4028             let input_ty = ctx.input_ty(insn, 0);
4029             if !ty.is_vector() {
4030                 match input_ty {
4031                     types::I8 | types::I16 | types::I32 => {
4032                         // Conversion from an unsigned int smaller than 64-bit is easy: zero-extend +
4033                         // do a signed conversion (which won't overflow).
4034                         let opcode = if ty == types::F32 {
4035                             SseOpcode::Cvtsi2ss
4036                         } else {
4037                             assert_eq!(ty, types::F64);
4038                             SseOpcode::Cvtsi2sd
4039                         };
4040 
4041                         let src = RegMem::reg(extend_input_to_reg(
4042                             ctx,
4043                             inputs[0],
4044                             ExtSpec::ZeroExtendTo64,
4045                         ));
4046                         ctx.emit(Inst::gpr_to_xmm(opcode, src, OperandSize::Size64, dst));
4047                     }
4048 
4049                     types::I64 => {
4050                         let src = put_input_in_reg(ctx, inputs[0]);
4051 
4052                         let src_copy = ctx.alloc_tmp(types::I64).only_reg().unwrap();
4053                         ctx.emit(Inst::gen_move(src_copy, src, types::I64));
4054 
4055                         let tmp_gpr1 = ctx.alloc_tmp(types::I64).only_reg().unwrap();
4056                         let tmp_gpr2 = ctx.alloc_tmp(types::I64).only_reg().unwrap();
4057                         ctx.emit(Inst::cvt_u64_to_float_seq(
4058                             if ty == types::F64 {
4059                                 OperandSize::Size64
4060                             } else {
4061                                 OperandSize::Size32
4062                             },
4063                             src_copy,
4064                             tmp_gpr1,
4065                             tmp_gpr2,
4066                             dst,
4067                         ));
4068                     }
4069                     _ => panic!("unexpected input type for FcvtFromUint: {:?}", input_ty),
4070                 };
4071             } else {
4072                 assert_eq!(ctx.input_ty(insn, 0), types::I32X4);
4073                 let src = put_input_in_reg(ctx, inputs[0]);
4074                 let dst = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
4075 
4076                 if isa_flags.use_avx512f_simd() || isa_flags.use_avx512vl_simd() {
4077                     // When either AVX512VL or AVX512F are available,
4078                     // `fcvt_from_uint` can be lowered to a single instruction.
4079                     ctx.emit(Inst::xmm_unary_rm_r_evex(
4080                         Avx512Opcode::Vcvtudq2ps,
4081                         RegMem::reg(src),
4082                         dst,
4083                     ));
4084                 } else {
4085                     // Converting packed unsigned integers to packed floats
4086                     // requires a few steps. There is no single instruction
4087                     // lowering for converting unsigned floats but there is for
4088                     // converting packed signed integers to float (cvtdq2ps). In
4089                     // the steps below we isolate the upper half (16 bits) and
4090                     // lower half (16 bits) of each lane and then we convert
4091                     // each half separately using cvtdq2ps meant for signed
4092                     // integers. In order for this to work for the upper half
4093                     // bits we must shift right by 1 (divide by 2) these bits in
4094                     // order to ensure the most significant bit is 0 not signed,
4095                     // and then after the conversion we double the value.
4096                     // Finally we add the converted values where addition will
4097                     // correctly round.
4098                     //
4099                     // Sequence:
4100                     // -> A = 0xffffffff
4101                     // -> Ah = 0xffff0000
4102                     // -> Al = 0x0000ffff
4103                     // -> Convert(Al) // Convert int to float
4104                     // -> Ah = Ah >> 1 // Shift right 1 to assure Ah conversion isn't treated as signed
4105                     // -> Convert(Ah) // Convert .. with no loss of significant digits from previous shift
4106                     // -> Ah = Ah + Ah // Double Ah to account for shift right before the conversion.
4107                     // -> dst = Ah + Al // Add the two floats together
4108 
4109                     // Create a temporary register
4110                     let tmp = ctx.alloc_tmp(types::I32X4).only_reg().unwrap();
4111                     ctx.emit(Inst::xmm_unary_rm_r(
4112                         SseOpcode::Movapd,
4113                         RegMem::reg(src),
4114                         tmp,
4115                     ));
4116                     ctx.emit(Inst::gen_move(dst, src, ty));
4117 
4118                     // Get the low 16 bits
4119                     ctx.emit(Inst::xmm_rmi_reg(SseOpcode::Pslld, RegMemImm::imm(16), tmp));
4120                     ctx.emit(Inst::xmm_rmi_reg(SseOpcode::Psrld, RegMemImm::imm(16), tmp));
4121 
4122                     // Get the high 16 bits
4123                     ctx.emit(Inst::xmm_rm_r(SseOpcode::Psubd, RegMem::from(tmp), dst));
4124 
4125                     // Convert the low 16 bits
4126                     ctx.emit(Inst::xmm_rm_r(SseOpcode::Cvtdq2ps, RegMem::from(tmp), tmp));
4127 
4128                     // Shift the high bits by 1, convert, and double to get the correct value.
4129                     ctx.emit(Inst::xmm_rmi_reg(SseOpcode::Psrld, RegMemImm::imm(1), dst));
4130                     ctx.emit(Inst::xmm_rm_r(SseOpcode::Cvtdq2ps, RegMem::from(dst), dst));
4131                     ctx.emit(Inst::xmm_rm_r(
4132                         SseOpcode::Addps,
4133                         RegMem::reg(dst.to_reg()),
4134                         dst,
4135                     ));
4136 
4137                     // Add together the two converted values.
4138                     ctx.emit(Inst::xmm_rm_r(
4139                         SseOpcode::Addps,
4140                         RegMem::reg(tmp.to_reg()),
4141                         dst,
4142                     ));
4143                 }
4144             }
4145         }
4146 
4147         Opcode::FcvtToUint | Opcode::FcvtToUintSat | Opcode::FcvtToSint | Opcode::FcvtToSintSat => {
4148             let src = put_input_in_reg(ctx, inputs[0]);
4149             let dst = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
4150 
4151             let input_ty = ctx.input_ty(insn, 0);
4152             if !input_ty.is_vector() {
4153                 let src_size = if input_ty == types::F32 {
4154                     OperandSize::Size32
4155                 } else {
4156                     assert_eq!(input_ty, types::F64);
4157                     OperandSize::Size64
4158                 };
4159 
4160                 let output_ty = ty.unwrap();
4161                 let dst_size = if output_ty == types::I32 {
4162                     OperandSize::Size32
4163                 } else {
4164                     assert_eq!(output_ty, types::I64);
4165                     OperandSize::Size64
4166                 };
4167 
4168                 let to_signed = op == Opcode::FcvtToSint || op == Opcode::FcvtToSintSat;
4169                 let is_sat = op == Opcode::FcvtToUintSat || op == Opcode::FcvtToSintSat;
4170 
4171                 let src_copy = ctx.alloc_tmp(input_ty).only_reg().unwrap();
4172                 ctx.emit(Inst::gen_move(src_copy, src, input_ty));
4173 
4174                 let tmp_xmm = ctx.alloc_tmp(input_ty).only_reg().unwrap();
4175                 let tmp_gpr = ctx.alloc_tmp(output_ty).only_reg().unwrap();
4176 
4177                 if to_signed {
4178                     ctx.emit(Inst::cvt_float_to_sint_seq(
4179                         src_size, dst_size, is_sat, src_copy, dst, tmp_gpr, tmp_xmm,
4180                     ));
4181                 } else {
4182                     ctx.emit(Inst::cvt_float_to_uint_seq(
4183                         src_size, dst_size, is_sat, src_copy, dst, tmp_gpr, tmp_xmm,
4184                     ));
4185                 }
4186             } else {
4187                 if op == Opcode::FcvtToSintSat {
4188                     // Sets destination to zero if float is NaN
4189                     assert_eq!(types::F32X4, ctx.input_ty(insn, 0));
4190                     let tmp = ctx.alloc_tmp(types::I32X4).only_reg().unwrap();
4191                     ctx.emit(Inst::xmm_unary_rm_r(
4192                         SseOpcode::Movapd,
4193                         RegMem::reg(src),
4194                         tmp,
4195                     ));
4196                     ctx.emit(Inst::gen_move(dst, src, input_ty));
4197                     let cond = FcmpImm::from(FloatCC::Equal);
4198                     ctx.emit(Inst::xmm_rm_r_imm(
4199                         SseOpcode::Cmpps,
4200                         RegMem::reg(tmp.to_reg()),
4201                         tmp,
4202                         cond.encode(),
4203                         OperandSize::Size32,
4204                     ));
4205                     ctx.emit(Inst::xmm_rm_r(
4206                         SseOpcode::Andps,
4207                         RegMem::reg(tmp.to_reg()),
4208                         dst,
4209                     ));
4210 
4211                     // Sets top bit of tmp if float is positive
4212                     // Setting up to set top bit on negative float values
4213                     ctx.emit(Inst::xmm_rm_r(
4214                         SseOpcode::Pxor,
4215                         RegMem::reg(dst.to_reg()),
4216                         tmp,
4217                     ));
4218 
4219                     // Convert the packed float to packed doubleword.
4220                     ctx.emit(Inst::xmm_rm_r(
4221                         SseOpcode::Cvttps2dq,
4222                         RegMem::reg(dst.to_reg()),
4223                         dst,
4224                     ));
4225 
4226                     // Set top bit only if < 0
4227                     // Saturate lane with sign (top) bit.
4228                     ctx.emit(Inst::xmm_rm_r(
4229                         SseOpcode::Pand,
4230                         RegMem::reg(dst.to_reg()),
4231                         tmp,
4232                     ));
4233                     ctx.emit(Inst::xmm_rmi_reg(SseOpcode::Psrad, RegMemImm::imm(31), tmp));
4234 
4235                     // On overflow 0x80000000 is returned to a lane.
4236                     // Below sets positive overflow lanes to 0x7FFFFFFF
4237                     // Keeps negative overflow lanes as is.
4238                     ctx.emit(Inst::xmm_rm_r(
4239                         SseOpcode::Pxor,
4240                         RegMem::reg(tmp.to_reg()),
4241                         dst,
4242                     ));
4243                 } else if op == Opcode::FcvtToUintSat {
4244                     // The algorithm for converting floats to unsigned ints is a little tricky. The
4245                     // complication arises because we are converting from a signed 64-bit int with a positive
4246                     // integer range from 1..INT_MAX (0x1..0x7FFFFFFF) to an unsigned integer with an extended
4247                     // range from (INT_MAX+1)..UINT_MAX. It's this range from (INT_MAX+1)..UINT_MAX
4248                     // (0x80000000..0xFFFFFFFF) that needs to be accounted for as a special case since our
4249                     // conversion instruction (cvttps2dq) only converts as high as INT_MAX (0x7FFFFFFF), but
4250                     // which conveniently setting underflows and overflows (smaller than MIN_INT or larger than
4251                     // MAX_INT) to be INT_MAX+1 (0x80000000). Nothing that the range (INT_MAX+1)..UINT_MAX includes
4252                     // precisely INT_MAX values we can correctly account for and convert every value in this range
4253                     // if we simply subtract INT_MAX+1 before doing the cvttps2dq conversion. After the subtraction
4254                     // every value originally (INT_MAX+1)..UINT_MAX is now the range (0..INT_MAX).
4255                     // After the conversion we add INT_MAX+1 back to this converted value, noting again that
4256                     // values we are trying to account for were already set to INT_MAX+1 during the original conversion.
4257                     // We simply have to create a mask and make sure we are adding together only the lanes that need
4258                     // to be accounted for. Digesting it all the steps then are:
4259                     //
4260                     // Step 1 - Account for NaN and negative floats by setting these src values to zero.
4261                     // Step 2 - Make a copy (tmp1) of the src value since we need to convert twice for
4262                     //          reasons described above.
4263                     // Step 3 - Convert the original src values. This will convert properly all floats up to INT_MAX
4264                     // Step 4 - Subtract INT_MAX from the copy set (tmp1). Note, all zero and negative values are those
4265                     //          values that were originally in the range (0..INT_MAX). This will come in handy during
4266                     //          step 7 when we zero negative lanes.
4267                     // Step 5 - Create a bit mask for tmp1 that will correspond to all lanes originally less than
4268                     //          UINT_MAX that are now less than INT_MAX thanks to the subtraction.
4269                     // Step 6 - Convert the second set of values (tmp1)
4270                     // Step 7 - Prep the converted second set by zeroing out negative lanes (these have already been
4271                     //          converted correctly with the first set) and by setting overflow lanes to 0x7FFFFFFF
4272                     //          as this will allow us to properly saturate overflow lanes when adding to 0x80000000
4273                     // Step 8 - Add the orginal converted src and the converted tmp1 where float values originally less
4274                     //          than and equal to INT_MAX will be unchanged, float values originally between INT_MAX+1 and
4275                     //          UINT_MAX will add together (INT_MAX) + (SRC - INT_MAX), and float values originally
4276                     //          greater than UINT_MAX will be saturated to UINT_MAX (0xFFFFFFFF) after adding (0x8000000 + 0x7FFFFFFF).
4277                     //
4278                     //
4279                     // The table below illustrates the result after each step where it matters for the converted set.
4280                     // Note the original value range (original src set) is the final dst in Step 8:
4281                     //
4282                     // Original src set:
4283                     // | Original Value Range |    Step 1    |         Step 3         |          Step 8           |
4284                     // |  -FLT_MIN..FLT_MAX   | 0.0..FLT_MAX | 0..INT_MAX(w/overflow) | 0..UINT_MAX(w/saturation) |
4285                     //
4286                     // Copied src set (tmp1):
4287                     // |    Step 2    |                  Step 4                  |
4288                     // | 0.0..FLT_MAX | (0.0-(INT_MAX+1))..(FLT_MAX-(INT_MAX+1)) |
4289                     //
4290                     // |                       Step 6                        |                 Step 7                 |
4291                     // | (0-(INT_MAX+1))..(UINT_MAX-(INT_MAX+1))(w/overflow) | ((INT_MAX+1)-(INT_MAX+1))..(INT_MAX+1) |
4292 
4293                     // Create temporaries
4294                     assert_eq!(types::F32X4, ctx.input_ty(insn, 0));
4295                     let tmp1 = ctx.alloc_tmp(types::I32X4).only_reg().unwrap();
4296                     let tmp2 = ctx.alloc_tmp(types::I32X4).only_reg().unwrap();
4297 
4298                     // Converting to unsigned int so if float src is negative or NaN
4299                     // will first set to zero.
4300                     ctx.emit(Inst::xmm_rm_r(SseOpcode::Pxor, RegMem::from(tmp2), tmp2));
4301                     ctx.emit(Inst::gen_move(dst, src, input_ty));
4302                     ctx.emit(Inst::xmm_rm_r(SseOpcode::Maxps, RegMem::from(tmp2), dst));
4303 
4304                     // Set tmp2 to INT_MAX+1. It is important to note here that after it looks
4305                     // like we are only converting INT_MAX (0x7FFFFFFF) but in fact because
4306                     // single precision IEEE-754 floats can only accurately represent contingous
4307                     // integers up to 2^23 and outside of this range it rounds to the closest
4308                     // integer that it can represent. In the case of INT_MAX, this value gets
4309                     // represented as 0x4f000000 which is the integer value (INT_MAX+1).
4310 
4311                     ctx.emit(Inst::xmm_rm_r(SseOpcode::Pcmpeqd, RegMem::from(tmp2), tmp2));
4312                     ctx.emit(Inst::xmm_rmi_reg(SseOpcode::Psrld, RegMemImm::imm(1), tmp2));
4313                     ctx.emit(Inst::xmm_rm_r(
4314                         SseOpcode::Cvtdq2ps,
4315                         RegMem::from(tmp2),
4316                         tmp2,
4317                     ));
4318 
4319                     // Make a copy of these lanes and then do the first conversion.
4320                     // Overflow lanes greater than the maximum allowed signed value will
4321                     // set to 0x80000000. Negative and NaN lanes will be 0x0
4322                     ctx.emit(Inst::xmm_mov(SseOpcode::Movaps, RegMem::from(dst), tmp1));
4323                     ctx.emit(Inst::xmm_rm_r(SseOpcode::Cvttps2dq, RegMem::from(dst), dst));
4324 
4325                     // Set lanes to src - max_signed_int
4326                     ctx.emit(Inst::xmm_rm_r(SseOpcode::Subps, RegMem::from(tmp2), tmp1));
4327 
4328                     // Create mask for all positive lanes to saturate (i.e. greater than
4329                     // or equal to the maxmimum allowable unsigned int).
4330                     let cond = FcmpImm::from(FloatCC::LessThanOrEqual);
4331                     ctx.emit(Inst::xmm_rm_r_imm(
4332                         SseOpcode::Cmpps,
4333                         RegMem::from(tmp1),
4334                         tmp2,
4335                         cond.encode(),
4336                         OperandSize::Size32,
4337                     ));
4338 
4339                     // Convert those set of lanes that have the max_signed_int factored out.
4340                     ctx.emit(Inst::xmm_rm_r(
4341                         SseOpcode::Cvttps2dq,
4342                         RegMem::from(tmp1),
4343                         tmp1,
4344                     ));
4345 
4346                     // Prepare converted lanes by zeroing negative lanes and prepping lanes
4347                     // that have positive overflow (based on the mask) by setting these lanes
4348                     // to 0x7FFFFFFF
4349                     ctx.emit(Inst::xmm_rm_r(SseOpcode::Pxor, RegMem::from(tmp2), tmp1));
4350                     ctx.emit(Inst::xmm_rm_r(SseOpcode::Pxor, RegMem::from(tmp2), tmp2));
4351                     ctx.emit(Inst::xmm_rm_r(SseOpcode::Pmaxsd, RegMem::from(tmp2), tmp1));
4352 
4353                     // Add this second set of converted lanes to the original to properly handle
4354                     // values greater than max signed int.
4355                     ctx.emit(Inst::xmm_rm_r(SseOpcode::Paddd, RegMem::from(tmp1), dst));
4356                 } else {
4357                     // Since this branch is also guarded by a check for vector types
4358                     // neither Opcode::FcvtToUint nor Opcode::FcvtToSint can reach here
4359                     // due to vector varients not existing. The first two branches will
4360                     // cover all reachable cases.
4361                     unreachable!();
4362                 }
4363             }
4364         }
4365         Opcode::UwidenHigh | Opcode::UwidenLow | Opcode::SwidenHigh | Opcode::SwidenLow => {
4366             let input_ty = ctx.input_ty(insn, 0);
4367             let output_ty = ctx.output_ty(insn, 0);
4368             let src = put_input_in_reg(ctx, inputs[0]);
4369             let dst = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
4370             if output_ty.is_vector() {
4371                 match op {
4372                     Opcode::SwidenLow => match (input_ty, output_ty) {
4373                         (types::I8X16, types::I16X8) => {
4374                             ctx.emit(Inst::xmm_mov(SseOpcode::Pmovsxbw, RegMem::reg(src), dst));
4375                         }
4376                         (types::I16X8, types::I32X4) => {
4377                             ctx.emit(Inst::xmm_mov(SseOpcode::Pmovsxwd, RegMem::reg(src), dst));
4378                         }
4379                         _ => unreachable!(),
4380                     },
4381                     Opcode::SwidenHigh => match (input_ty, output_ty) {
4382                         (types::I8X16, types::I16X8) => {
4383                             ctx.emit(Inst::gen_move(dst, src, output_ty));
4384                             ctx.emit(Inst::xmm_rm_r_imm(
4385                                 SseOpcode::Palignr,
4386                                 RegMem::reg(src),
4387                                 dst,
4388                                 8,
4389                                 OperandSize::Size32,
4390                             ));
4391                             ctx.emit(Inst::xmm_mov(SseOpcode::Pmovsxbw, RegMem::from(dst), dst));
4392                         }
4393                         (types::I16X8, types::I32X4) => {
4394                             ctx.emit(Inst::gen_move(dst, src, output_ty));
4395                             ctx.emit(Inst::xmm_rm_r_imm(
4396                                 SseOpcode::Palignr,
4397                                 RegMem::reg(src),
4398                                 dst,
4399                                 8,
4400                                 OperandSize::Size32,
4401                             ));
4402                             ctx.emit(Inst::xmm_mov(SseOpcode::Pmovsxwd, RegMem::from(dst), dst));
4403                         }
4404                         _ => unreachable!(),
4405                     },
4406                     Opcode::UwidenLow => match (input_ty, output_ty) {
4407                         (types::I8X16, types::I16X8) => {
4408                             ctx.emit(Inst::xmm_mov(SseOpcode::Pmovzxbw, RegMem::reg(src), dst));
4409                         }
4410                         (types::I16X8, types::I32X4) => {
4411                             ctx.emit(Inst::xmm_mov(SseOpcode::Pmovzxwd, RegMem::reg(src), dst));
4412                         }
4413                         _ => unreachable!(),
4414                     },
4415                     Opcode::UwidenHigh => match (input_ty, output_ty) {
4416                         (types::I8X16, types::I16X8) => {
4417                             ctx.emit(Inst::gen_move(dst, src, output_ty));
4418                             ctx.emit(Inst::xmm_rm_r_imm(
4419                                 SseOpcode::Palignr,
4420                                 RegMem::reg(src),
4421                                 dst,
4422                                 8,
4423                                 OperandSize::Size32,
4424                             ));
4425                             ctx.emit(Inst::xmm_mov(SseOpcode::Pmovzxbw, RegMem::from(dst), dst));
4426                         }
4427                         (types::I16X8, types::I32X4) => {
4428                             ctx.emit(Inst::gen_move(dst, src, output_ty));
4429                             ctx.emit(Inst::xmm_rm_r_imm(
4430                                 SseOpcode::Palignr,
4431                                 RegMem::reg(src),
4432                                 dst,
4433                                 8,
4434                                 OperandSize::Size32,
4435                             ));
4436                             ctx.emit(Inst::xmm_mov(SseOpcode::Pmovzxwd, RegMem::from(dst), dst));
4437                         }
4438                         _ => unreachable!(),
4439                     },
4440                     _ => unreachable!(),
4441                 }
4442             } else {
4443                 panic!("Unsupported non-vector type for widen instruction {:?}", ty);
4444             }
4445         }
4446         Opcode::Snarrow | Opcode::Unarrow => {
4447             let input_ty = ctx.input_ty(insn, 0);
4448             let output_ty = ctx.output_ty(insn, 0);
4449             let src1 = put_input_in_reg(ctx, inputs[0]);
4450             let src2 = put_input_in_reg(ctx, inputs[1]);
4451             let dst = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
4452             if output_ty.is_vector() {
4453                 match op {
4454                     Opcode::Snarrow => match (input_ty, output_ty) {
4455                         (types::I16X8, types::I8X16) => {
4456                             ctx.emit(Inst::gen_move(dst, src1, input_ty));
4457                             ctx.emit(Inst::xmm_rm_r(SseOpcode::Packsswb, RegMem::reg(src2), dst));
4458                         }
4459                         (types::I32X4, types::I16X8) => {
4460                             ctx.emit(Inst::gen_move(dst, src1, input_ty));
4461                             ctx.emit(Inst::xmm_rm_r(SseOpcode::Packssdw, RegMem::reg(src2), dst));
4462                         }
4463                         _ => unreachable!(),
4464                     },
4465                     Opcode::Unarrow => match (input_ty, output_ty) {
4466                         (types::I16X8, types::I8X16) => {
4467                             ctx.emit(Inst::gen_move(dst, src1, input_ty));
4468                             ctx.emit(Inst::xmm_rm_r(SseOpcode::Packuswb, RegMem::reg(src2), dst));
4469                         }
4470                         (types::I32X4, types::I16X8) => {
4471                             ctx.emit(Inst::gen_move(dst, src1, input_ty));
4472                             ctx.emit(Inst::xmm_rm_r(SseOpcode::Packusdw, RegMem::reg(src2), dst));
4473                         }
4474                         _ => unreachable!(),
4475                     },
4476                     _ => unreachable!(),
4477                 }
4478             } else {
4479                 panic!("Unsupported non-vector type for widen instruction {:?}", ty);
4480             }
4481         }
4482         Opcode::Bitcast => {
4483             let input_ty = ctx.input_ty(insn, 0);
4484             let output_ty = ctx.output_ty(insn, 0);
4485             match (input_ty, output_ty) {
4486                 (types::F32, types::I32) => {
4487                     let src = put_input_in_reg(ctx, inputs[0]);
4488                     let dst = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
4489                     ctx.emit(Inst::xmm_to_gpr(
4490                         SseOpcode::Movd,
4491                         src,
4492                         dst,
4493                         OperandSize::Size32,
4494                     ));
4495                 }
4496                 (types::I32, types::F32) => {
4497                     let src = input_to_reg_mem(ctx, inputs[0]);
4498                     let dst = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
4499                     ctx.emit(Inst::gpr_to_xmm(
4500                         SseOpcode::Movd,
4501                         src,
4502                         OperandSize::Size32,
4503                         dst,
4504                     ));
4505                 }
4506                 (types::F64, types::I64) => {
4507                     let src = put_input_in_reg(ctx, inputs[0]);
4508                     let dst = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
4509                     ctx.emit(Inst::xmm_to_gpr(
4510                         SseOpcode::Movq,
4511                         src,
4512                         dst,
4513                         OperandSize::Size64,
4514                     ));
4515                 }
4516                 (types::I64, types::F64) => {
4517                     let src = input_to_reg_mem(ctx, inputs[0]);
4518                     let dst = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
4519                     ctx.emit(Inst::gpr_to_xmm(
4520                         SseOpcode::Movq,
4521                         src,
4522                         OperandSize::Size64,
4523                         dst,
4524                     ));
4525                 }
4526                 _ => unreachable!("invalid bitcast from {:?} to {:?}", input_ty, output_ty),
4527             }
4528         }
4529 
4530         Opcode::Fabs | Opcode::Fneg => {
4531             let src = RegMem::reg(put_input_in_reg(ctx, inputs[0]));
4532             let dst = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
4533 
4534             // In both cases, generate a constant and apply a single binary instruction:
4535             // - to compute the absolute value, set all bits to 1 but the MSB to 0, and bit-AND the
4536             // src with it.
4537             // - to compute the negated value, set all bits to 0 but the MSB to 1, and bit-XOR the
4538             // src with it.
4539             let output_ty = ty.unwrap();
4540             if !output_ty.is_vector() {
4541                 let (val, opcode): (u64, _) = match output_ty {
4542                     types::F32 => match op {
4543                         Opcode::Fabs => (0x7fffffff, SseOpcode::Andps),
4544                         Opcode::Fneg => (0x80000000, SseOpcode::Xorps),
4545                         _ => unreachable!(),
4546                     },
4547                     types::F64 => match op {
4548                         Opcode::Fabs => (0x7fffffffffffffff, SseOpcode::Andpd),
4549                         Opcode::Fneg => (0x8000000000000000, SseOpcode::Xorpd),
4550                         _ => unreachable!(),
4551                     },
4552                     _ => panic!("unexpected type {:?} for Fabs", output_ty),
4553                 };
4554 
4555                 for inst in Inst::gen_constant(ValueRegs::one(dst), val as u128, output_ty, |ty| {
4556                     ctx.alloc_tmp(ty).only_reg().unwrap()
4557                 }) {
4558                     ctx.emit(inst);
4559                 }
4560 
4561                 ctx.emit(Inst::xmm_rm_r(opcode, src, dst));
4562             } else {
4563                 // Eventually vector constants should be available in `gen_constant` and this block
4564                 // can be merged with the one above (TODO).
4565                 if output_ty.bits() == 128 {
4566                     // Move the `lhs` to the same register as `dst`; this may not emit an actual move
4567                     // but ensures that the registers are the same to match x86's read-write operand
4568                     // encoding.
4569                     let src = put_input_in_reg(ctx, inputs[0]);
4570                     ctx.emit(Inst::gen_move(dst, src, output_ty));
4571 
4572                     // Generate an all 1s constant in an XMM register. This uses CMPPS but could
4573                     // have used CMPPD with the same effect. Note, we zero the temp we allocate
4574                     // because if not, there is a chance that the register we use could be initialized
4575                     // with NaN .. in which case the CMPPS would fail since NaN != NaN.
4576                     let tmp = ctx.alloc_tmp(output_ty).only_reg().unwrap();
4577                     ctx.emit(Inst::xmm_rm_r(SseOpcode::Xorps, RegMem::from(tmp), tmp));
4578                     let cond = FcmpImm::from(FloatCC::Equal);
4579                     let cmpps = Inst::xmm_rm_r_imm(
4580                         SseOpcode::Cmpps,
4581                         RegMem::reg(tmp.to_reg()),
4582                         tmp,
4583                         cond.encode(),
4584                         OperandSize::Size32,
4585                     );
4586                     ctx.emit(cmpps);
4587 
4588                     // Shift the all 1s constant to generate the mask.
4589                     let lane_bits = output_ty.lane_bits();
4590                     let (shift_opcode, opcode, shift_by) = match (op, lane_bits) {
4591                         (Opcode::Fabs, 32) => (SseOpcode::Psrld, SseOpcode::Andps, 1),
4592                         (Opcode::Fabs, 64) => (SseOpcode::Psrlq, SseOpcode::Andpd, 1),
4593                         (Opcode::Fneg, 32) => (SseOpcode::Pslld, SseOpcode::Xorps, 31),
4594                         (Opcode::Fneg, 64) => (SseOpcode::Psllq, SseOpcode::Xorpd, 63),
4595                         _ => unreachable!(
4596                             "unexpected opcode and lane size: {:?}, {} bits",
4597                             op, lane_bits
4598                         ),
4599                     };
4600                     let shift = Inst::xmm_rmi_reg(shift_opcode, RegMemImm::imm(shift_by), tmp);
4601                     ctx.emit(shift);
4602 
4603                     // Apply shifted mask (XOR or AND).
4604                     let mask = Inst::xmm_rm_r(opcode, RegMem::reg(tmp.to_reg()), dst);
4605                     ctx.emit(mask);
4606                 } else {
4607                     panic!("unexpected type {:?} for Fabs", output_ty);
4608                 }
4609             }
4610         }
4611 
4612         Opcode::Fcopysign => {
4613             let dst = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
4614             let lhs = put_input_in_reg(ctx, inputs[0]);
4615             let rhs = put_input_in_reg(ctx, inputs[1]);
4616 
4617             let ty = ty.unwrap();
4618 
4619             // We're going to generate the following sequence:
4620             //
4621             // movabs     $INT_MIN, tmp_gpr1
4622             // mov{d,q}   tmp_gpr1, tmp_xmm1
4623             // movap{s,d} tmp_xmm1, dst
4624             // andnp{s,d} src_1, dst
4625             // movap{s,d} src_2, tmp_xmm2
4626             // andp{s,d}  tmp_xmm1, tmp_xmm2
4627             // orp{s,d}   tmp_xmm2, dst
4628 
4629             let tmp_xmm1 = ctx.alloc_tmp(types::F32).only_reg().unwrap();
4630             let tmp_xmm2 = ctx.alloc_tmp(types::F32).only_reg().unwrap();
4631 
4632             let (sign_bit_cst, mov_op, and_not_op, and_op, or_op) = match ty {
4633                 types::F32 => (
4634                     0x8000_0000,
4635                     SseOpcode::Movaps,
4636                     SseOpcode::Andnps,
4637                     SseOpcode::Andps,
4638                     SseOpcode::Orps,
4639                 ),
4640                 types::F64 => (
4641                     0x8000_0000_0000_0000,
4642                     SseOpcode::Movapd,
4643                     SseOpcode::Andnpd,
4644                     SseOpcode::Andpd,
4645                     SseOpcode::Orpd,
4646                 ),
4647                 _ => {
4648                     panic!("unexpected type {:?} for copysign", ty);
4649                 }
4650             };
4651 
4652             for inst in Inst::gen_constant(ValueRegs::one(tmp_xmm1), sign_bit_cst, ty, |ty| {
4653                 ctx.alloc_tmp(ty).only_reg().unwrap()
4654             }) {
4655                 ctx.emit(inst);
4656             }
4657             ctx.emit(Inst::xmm_mov(mov_op, RegMem::reg(tmp_xmm1.to_reg()), dst));
4658             ctx.emit(Inst::xmm_rm_r(and_not_op, RegMem::reg(lhs), dst));
4659             ctx.emit(Inst::xmm_mov(mov_op, RegMem::reg(rhs), tmp_xmm2));
4660             ctx.emit(Inst::xmm_rm_r(
4661                 and_op,
4662                 RegMem::reg(tmp_xmm1.to_reg()),
4663                 tmp_xmm2,
4664             ));
4665             ctx.emit(Inst::xmm_rm_r(or_op, RegMem::reg(tmp_xmm2.to_reg()), dst));
4666         }
4667 
4668         Opcode::Ceil | Opcode::Floor | Opcode::Nearest | Opcode::Trunc => {
4669             let ty = ty.unwrap();
4670             if isa_flags.use_sse41() {
4671                 let mode = match op {
4672                     Opcode::Ceil => RoundImm::RoundUp,
4673                     Opcode::Floor => RoundImm::RoundDown,
4674                     Opcode::Nearest => RoundImm::RoundNearest,
4675                     Opcode::Trunc => RoundImm::RoundZero,
4676                     _ => panic!("unexpected opcode {:?} in Ceil/Floor/Nearest/Trunc", op),
4677                 };
4678                 let op = match ty {
4679                     types::F32 => SseOpcode::Roundss,
4680                     types::F64 => SseOpcode::Roundsd,
4681                     types::F32X4 => SseOpcode::Roundps,
4682                     types::F64X2 => SseOpcode::Roundpd,
4683                     _ => panic!("unexpected type {:?} in Ceil/Floor/Nearest/Trunc", ty),
4684                 };
4685                 let src = input_to_reg_mem(ctx, inputs[0]);
4686                 let dst = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
4687                 ctx.emit(Inst::xmm_rm_r_imm(
4688                     op,
4689                     src,
4690                     dst,
4691                     mode.encode(),
4692                     OperandSize::Size32,
4693                 ));
4694             } else {
4695                 // Lower to VM calls when there's no access to SSE4.1.
4696                 // Note, for vector types on platforms that don't support sse41
4697                 // the execution will panic here.
4698                 let libcall = match (op, ty) {
4699                     (Opcode::Ceil, types::F32) => LibCall::CeilF32,
4700                     (Opcode::Ceil, types::F64) => LibCall::CeilF64,
4701                     (Opcode::Floor, types::F32) => LibCall::FloorF32,
4702                     (Opcode::Floor, types::F64) => LibCall::FloorF64,
4703                     (Opcode::Nearest, types::F32) => LibCall::NearestF32,
4704                     (Opcode::Nearest, types::F64) => LibCall::NearestF64,
4705                     (Opcode::Trunc, types::F32) => LibCall::TruncF32,
4706                     (Opcode::Trunc, types::F64) => LibCall::TruncF64,
4707                     _ => panic!(
4708                         "unexpected type/opcode {:?}/{:?} in Ceil/Floor/Nearest/Trunc",
4709                         ty, op
4710                     ),
4711                 };
4712                 emit_vm_call(ctx, flags, triple, libcall, insn, inputs, outputs)?;
4713             }
4714         }
4715 
4716         Opcode::Load
4717         | Opcode::Uload8
4718         | Opcode::Sload8
4719         | Opcode::Uload16
4720         | Opcode::Sload16
4721         | Opcode::Uload32
4722         | Opcode::Sload32
4723         | Opcode::LoadComplex
4724         | Opcode::Uload8Complex
4725         | Opcode::Sload8Complex
4726         | Opcode::Uload16Complex
4727         | Opcode::Sload16Complex
4728         | Opcode::Uload32Complex
4729         | Opcode::Sload32Complex
4730         | Opcode::Sload8x8
4731         | Opcode::Uload8x8
4732         | Opcode::Sload16x4
4733         | Opcode::Uload16x4
4734         | Opcode::Sload32x2
4735         | Opcode::Uload32x2 => {
4736             let offset = ctx.data(insn).load_store_offset().unwrap();
4737 
4738             let elem_ty = match op {
4739                 Opcode::Sload8 | Opcode::Uload8 | Opcode::Sload8Complex | Opcode::Uload8Complex => {
4740                     types::I8
4741                 }
4742                 Opcode::Sload16
4743                 | Opcode::Uload16
4744                 | Opcode::Sload16Complex
4745                 | Opcode::Uload16Complex => types::I16,
4746                 Opcode::Sload32
4747                 | Opcode::Uload32
4748                 | Opcode::Sload32Complex
4749                 | Opcode::Uload32Complex => types::I32,
4750                 Opcode::Sload8x8
4751                 | Opcode::Uload8x8
4752                 | Opcode::Sload8x8Complex
4753                 | Opcode::Uload8x8Complex => types::I8X8,
4754                 Opcode::Sload16x4
4755                 | Opcode::Uload16x4
4756                 | Opcode::Sload16x4Complex
4757                 | Opcode::Uload16x4Complex => types::I16X4,
4758                 Opcode::Sload32x2
4759                 | Opcode::Uload32x2
4760                 | Opcode::Sload32x2Complex
4761                 | Opcode::Uload32x2Complex => types::I32X2,
4762                 Opcode::Load | Opcode::LoadComplex => ctx.output_ty(insn, 0),
4763                 _ => unimplemented!(),
4764             };
4765 
4766             let ext_mode = ExtMode::new(elem_ty.bits(), 64);
4767 
4768             let sign_extend = match op {
4769                 Opcode::Sload8
4770                 | Opcode::Sload8Complex
4771                 | Opcode::Sload16
4772                 | Opcode::Sload16Complex
4773                 | Opcode::Sload32
4774                 | Opcode::Sload32Complex
4775                 | Opcode::Sload8x8
4776                 | Opcode::Sload8x8Complex
4777                 | Opcode::Sload16x4
4778                 | Opcode::Sload16x4Complex
4779                 | Opcode::Sload32x2
4780                 | Opcode::Sload32x2Complex => true,
4781                 _ => false,
4782             };
4783 
4784             let amode = match op {
4785                 Opcode::Load
4786                 | Opcode::Uload8
4787                 | Opcode::Sload8
4788                 | Opcode::Uload16
4789                 | Opcode::Sload16
4790                 | Opcode::Uload32
4791                 | Opcode::Sload32
4792                 | Opcode::Sload8x8
4793                 | Opcode::Uload8x8
4794                 | Opcode::Sload16x4
4795                 | Opcode::Uload16x4
4796                 | Opcode::Sload32x2
4797                 | Opcode::Uload32x2 => {
4798                     assert_eq!(inputs.len(), 1, "only one input for load operands");
4799                     lower_to_amode(ctx, inputs[0], offset)
4800                 }
4801 
4802                 Opcode::LoadComplex
4803                 | Opcode::Uload8Complex
4804                 | Opcode::Sload8Complex
4805                 | Opcode::Uload16Complex
4806                 | Opcode::Sload16Complex
4807                 | Opcode::Uload32Complex
4808                 | Opcode::Sload32Complex
4809                 | Opcode::Sload8x8Complex
4810                 | Opcode::Uload8x8Complex
4811                 | Opcode::Sload16x4Complex
4812                 | Opcode::Uload16x4Complex
4813                 | Opcode::Sload32x2Complex
4814                 | Opcode::Uload32x2Complex => {
4815                     assert_eq!(
4816                         inputs.len(),
4817                         2,
4818                         "can't handle more than two inputs in complex load"
4819                     );
4820                     let base = put_input_in_reg(ctx, inputs[0]);
4821                     let index = put_input_in_reg(ctx, inputs[1]);
4822                     let shift = 0;
4823                     let flags = ctx.memflags(insn).expect("load should have memflags");
4824                     Amode::imm_reg_reg_shift(offset as u32, base, index, shift).with_flags(flags)
4825                 }
4826                 _ => unreachable!(),
4827             };
4828 
4829             if elem_ty == types::I128 {
4830                 let dsts = get_output_reg(ctx, outputs[0]);
4831                 ctx.emit(Inst::mov64_m_r(amode.clone(), dsts.regs()[0]));
4832                 ctx.emit(Inst::mov64_m_r(amode.offset(8), dsts.regs()[1]));
4833             } else {
4834                 let dst = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
4835                 let is_xmm = elem_ty.is_float() || elem_ty.is_vector();
4836                 match (sign_extend, is_xmm) {
4837                     (true, false) => {
4838                         // The load is sign-extended only when the output size is lower than 64 bits,
4839                         // so ext-mode is defined in this case.
4840                         ctx.emit(Inst::movsx_rm_r(ext_mode.unwrap(), RegMem::mem(amode), dst));
4841                     }
4842                     (false, false) => {
4843                         if elem_ty.bytes() == 8 {
4844                             // Use a plain load.
4845                             ctx.emit(Inst::mov64_m_r(amode, dst))
4846                         } else {
4847                             // Use a zero-extended load.
4848                             ctx.emit(Inst::movzx_rm_r(ext_mode.unwrap(), RegMem::mem(amode), dst))
4849                         }
4850                     }
4851                     (_, true) => {
4852                         ctx.emit(match elem_ty {
4853                             types::F32 => Inst::xmm_mov(SseOpcode::Movss, RegMem::mem(amode), dst),
4854                             types::F64 => Inst::xmm_mov(SseOpcode::Movsd, RegMem::mem(amode), dst),
4855                             types::I8X8 => {
4856                                 if sign_extend == true {
4857                                     Inst::xmm_mov(SseOpcode::Pmovsxbw, RegMem::mem(amode), dst)
4858                                 } else {
4859                                     Inst::xmm_mov(SseOpcode::Pmovzxbw, RegMem::mem(amode), dst)
4860                                 }
4861                             }
4862                             types::I16X4 => {
4863                                 if sign_extend == true {
4864                                     Inst::xmm_mov(SseOpcode::Pmovsxwd, RegMem::mem(amode), dst)
4865                                 } else {
4866                                     Inst::xmm_mov(SseOpcode::Pmovzxwd, RegMem::mem(amode), dst)
4867                                 }
4868                             }
4869                             types::I32X2 => {
4870                                 if sign_extend == true {
4871                                     Inst::xmm_mov(SseOpcode::Pmovsxdq, RegMem::mem(amode), dst)
4872                                 } else {
4873                                     Inst::xmm_mov(SseOpcode::Pmovzxdq, RegMem::mem(amode), dst)
4874                                 }
4875                             }
4876                             _ if elem_ty.is_vector() && elem_ty.bits() == 128 => {
4877                                 Inst::xmm_mov(SseOpcode::Movups, RegMem::mem(amode), dst)
4878                             }
4879                             // TODO Specialize for different types: MOVUPD, MOVDQU
4880                             _ => unreachable!(
4881                                 "unexpected type for load: {:?} - {:?}",
4882                                 elem_ty,
4883                                 elem_ty.bits()
4884                             ),
4885                         });
4886                     }
4887                 }
4888             }
4889         }
4890 
4891         Opcode::Store
4892         | Opcode::Istore8
4893         | Opcode::Istore16
4894         | Opcode::Istore32
4895         | Opcode::StoreComplex
4896         | Opcode::Istore8Complex
4897         | Opcode::Istore16Complex
4898         | Opcode::Istore32Complex => {
4899             let offset = ctx.data(insn).load_store_offset().unwrap();
4900 
4901             let elem_ty = match op {
4902                 Opcode::Istore8 | Opcode::Istore8Complex => types::I8,
4903                 Opcode::Istore16 | Opcode::Istore16Complex => types::I16,
4904                 Opcode::Istore32 | Opcode::Istore32Complex => types::I32,
4905                 Opcode::Store | Opcode::StoreComplex => ctx.input_ty(insn, 0),
4906                 _ => unreachable!(),
4907             };
4908 
4909             let addr = match op {
4910                 Opcode::Store | Opcode::Istore8 | Opcode::Istore16 | Opcode::Istore32 => {
4911                     assert_eq!(inputs.len(), 2, "only one input for store memory operands");
4912                     lower_to_amode(ctx, inputs[1], offset)
4913                 }
4914 
4915                 Opcode::StoreComplex
4916                 | Opcode::Istore8Complex
4917                 | Opcode::Istore16Complex
4918                 | Opcode::Istore32Complex => {
4919                     assert_eq!(
4920                         inputs.len(),
4921                         3,
4922                         "can't handle more than two inputs in complex store"
4923                     );
4924                     let base = put_input_in_reg(ctx, inputs[1]);
4925                     let index = put_input_in_reg(ctx, inputs[2]);
4926                     let shift = 0;
4927                     let flags = ctx.memflags(insn).expect("store should have memflags");
4928                     Amode::imm_reg_reg_shift(offset as u32, base, index, shift).with_flags(flags)
4929                 }
4930 
4931                 _ => unreachable!(),
4932             };
4933 
4934             if elem_ty == types::I128 {
4935                 let srcs = put_input_in_regs(ctx, inputs[0]);
4936                 ctx.emit(Inst::store(types::I64, srcs.regs()[0], addr.clone()));
4937                 ctx.emit(Inst::store(types::I64, srcs.regs()[1], addr.offset(8)));
4938             } else {
4939                 let src = put_input_in_reg(ctx, inputs[0]);
4940                 ctx.emit(Inst::store(elem_ty, src, addr));
4941             }
4942         }
4943 
4944         Opcode::AtomicRmw => {
4945             // This is a simple, general-case atomic update, based on a loop involving
4946             // `cmpxchg`.  Note that we could do much better than this in the case where the old
4947             // value at the location (that is to say, the SSA `Value` computed by this CLIF
4948             // instruction) is not required.  In that case, we could instead implement this
4949             // using a single `lock`-prefixed x64 read-modify-write instruction.  Also, even in
4950             // the case where the old value is required, for the `add` and `sub` cases, we can
4951             // use the single instruction `lock xadd`.  However, those improvements have been
4952             // left for another day.
4953             // TODO: filed as https://github.com/bytecodealliance/wasmtime/issues/2153
4954             let dst = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
4955             let mut addr = put_input_in_reg(ctx, inputs[0]);
4956             let mut arg2 = put_input_in_reg(ctx, inputs[1]);
4957             let ty_access = ty.unwrap();
4958             assert!(is_valid_atomic_transaction_ty(ty_access));
4959 
4960             // Make sure that both args are in virtual regs, since in effect we have to do a
4961             // parallel copy to get them safely to the AtomicRmwSeq input regs, and that's not
4962             // guaranteed safe if either is in a real reg.
4963             addr = ctx.ensure_in_vreg(addr, types::I64);
4964             arg2 = ctx.ensure_in_vreg(arg2, types::I64);
4965 
4966             // Move the args to the preordained AtomicRMW input regs.  Note that `AtomicRmwSeq`
4967             // operates at whatever width is specified by `ty`, so there's no need to
4968             // zero-extend `arg2` in the case of `ty` being I8/I16/I32.
4969             ctx.emit(Inst::gen_move(
4970                 Writable::from_reg(regs::r9()),
4971                 addr,
4972                 types::I64,
4973             ));
4974             ctx.emit(Inst::gen_move(
4975                 Writable::from_reg(regs::r10()),
4976                 arg2,
4977                 types::I64,
4978             ));
4979 
4980             // Now the AtomicRmwSeq (pseudo-) instruction itself
4981             let op = inst_common::AtomicRmwOp::from(ctx.data(insn).atomic_rmw_op().unwrap());
4982             ctx.emit(Inst::AtomicRmwSeq { ty: ty_access, op });
4983 
4984             // And finally, copy the preordained AtomicRmwSeq output reg to its destination.
4985             ctx.emit(Inst::gen_move(dst, regs::rax(), types::I64));
4986         }
4987 
4988         Opcode::AtomicCas => {
4989             // This is very similar to, but not identical to, the `AtomicRmw` case.  As with
4990             // `AtomicRmw`, there's no need to zero-extend narrow values here.
4991             let dst = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
4992             let addr = lower_to_amode(ctx, inputs[0], 0);
4993             let expected = put_input_in_reg(ctx, inputs[1]);
4994             let replacement = put_input_in_reg(ctx, inputs[2]);
4995             let ty_access = ty.unwrap();
4996             assert!(is_valid_atomic_transaction_ty(ty_access));
4997 
4998             // Move the expected value into %rax.  Because there's only one fixed register on
4999             // the input side, we don't have to use `ensure_in_vreg`, as is necessary in the
5000             // `AtomicRmw` case.
5001             ctx.emit(Inst::gen_move(
5002                 Writable::from_reg(regs::rax()),
5003                 expected,
5004                 types::I64,
5005             ));
5006             ctx.emit(Inst::LockCmpxchg {
5007                 ty: ty_access,
5008                 src: replacement,
5009                 dst: addr.into(),
5010             });
5011             // And finally, copy the old value at the location to its destination reg.
5012             ctx.emit(Inst::gen_move(dst, regs::rax(), types::I64));
5013         }
5014 
5015         Opcode::AtomicLoad => {
5016             // This is a normal load.  The x86-TSO memory model provides sufficient sequencing
5017             // to satisfy the CLIF synchronisation requirements for `AtomicLoad` without the
5018             // need for any fence instructions.
5019             let data = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
5020             let addr = lower_to_amode(ctx, inputs[0], 0);
5021             let ty_access = ty.unwrap();
5022             assert!(is_valid_atomic_transaction_ty(ty_access));
5023 
5024             let rm = RegMem::mem(addr);
5025             if ty_access == types::I64 {
5026                 ctx.emit(Inst::mov64_rm_r(rm, data));
5027             } else {
5028                 let ext_mode = ExtMode::new(ty_access.bits(), 64).expect(&format!(
5029                     "invalid extension during AtomicLoad: {} -> {}",
5030                     ty_access.bits(),
5031                     64
5032                 ));
5033                 ctx.emit(Inst::movzx_rm_r(ext_mode, rm, data));
5034             }
5035         }
5036 
5037         Opcode::AtomicStore => {
5038             // This is a normal store, followed by an `mfence` instruction.
5039             let data = put_input_in_reg(ctx, inputs[0]);
5040             let addr = lower_to_amode(ctx, inputs[1], 0);
5041             let ty_access = ctx.input_ty(insn, 0);
5042             assert!(is_valid_atomic_transaction_ty(ty_access));
5043 
5044             ctx.emit(Inst::store(ty_access, data, addr));
5045             ctx.emit(Inst::Fence {
5046                 kind: FenceKind::MFence,
5047             });
5048         }
5049 
5050         Opcode::Fence => {
5051             ctx.emit(Inst::Fence {
5052                 kind: FenceKind::MFence,
5053             });
5054         }
5055 
5056         Opcode::FuncAddr => {
5057             let dst = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
5058             let (extname, _) = ctx.call_target(insn).unwrap();
5059             let extname = extname.clone();
5060             ctx.emit(Inst::LoadExtName {
5061                 dst,
5062                 name: Box::new(extname),
5063                 offset: 0,
5064             });
5065         }
5066 
5067         Opcode::SymbolValue => {
5068             let dst = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
5069             let (extname, _, offset) = ctx.symbol_value(insn).unwrap();
5070             let extname = extname.clone();
5071             ctx.emit(Inst::LoadExtName {
5072                 dst,
5073                 name: Box::new(extname),
5074                 offset,
5075             });
5076         }
5077 
5078         Opcode::StackAddr => {
5079             let (stack_slot, offset) = match *ctx.data(insn) {
5080                 InstructionData::StackLoad {
5081                     opcode: Opcode::StackAddr,
5082                     stack_slot,
5083                     offset,
5084                 } => (stack_slot, offset),
5085                 _ => unreachable!(),
5086             };
5087             let dst = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
5088             let offset: i32 = offset.into();
5089             let inst = ctx
5090                 .abi()
5091                 .stackslot_addr(stack_slot, u32::try_from(offset).unwrap(), dst);
5092             ctx.emit(inst);
5093         }
5094 
5095         Opcode::Select => {
5096             let flag_input = inputs[0];
5097             if let Some(fcmp) = matches_input(ctx, flag_input, Opcode::Fcmp) {
5098                 let cond_code = ctx.data(fcmp).fp_cond_code().unwrap();
5099 
5100                 // For equal, we flip the operands, because we can't test a conjunction of
5101                 // CPU flags with a single cmove; see InvertedEqualOrConditions doc comment.
5102                 let (lhs_input, rhs_input) = match cond_code {
5103                     FloatCC::Equal => (inputs[2], inputs[1]),
5104                     _ => (inputs[1], inputs[2]),
5105                 };
5106 
5107                 let ty = ctx.output_ty(insn, 0);
5108                 let rhs = put_input_in_regs(ctx, rhs_input);
5109                 let dst = get_output_reg(ctx, outputs[0]);
5110                 let lhs = put_input_in_regs(ctx, lhs_input);
5111 
5112                 // We request inversion of Equal to NotEqual here: taking LHS if equal would mean
5113                 // take it if both CC::NP and CC::Z are set, the conjunction of which can't be
5114                 // modeled with a single cmov instruction. Instead, we'll swap LHS and RHS in the
5115                 // select operation, and invert the equal to a not-equal here.
5116                 let fcmp_results = emit_fcmp(ctx, fcmp, cond_code, FcmpSpec::InvertEqual);
5117 
5118                 if let FcmpCondResult::InvertedEqualOrConditions(_, _) = &fcmp_results {
5119                     // Keep this sync'd with the lowering of the select inputs above.
5120                     assert_eq!(cond_code, FloatCC::Equal);
5121                 }
5122 
5123                 emit_moves(ctx, dst, rhs, ty);
5124 
5125                 let operand_size = if ty == types::F64 {
5126                     OperandSize::Size64
5127                 } else {
5128                     OperandSize::Size32
5129                 };
5130                 match fcmp_results {
5131                     FcmpCondResult::Condition(cc) => {
5132                         if is_int_or_ref_ty(ty) || ty == types::I128 || ty == types::B128 {
5133                             let size = ty.bytes() as u8;
5134                             emit_cmoves(ctx, size, cc, lhs, dst);
5135                         } else {
5136                             ctx.emit(Inst::xmm_cmove(
5137                                 operand_size,
5138                                 cc,
5139                                 RegMem::reg(lhs.only_reg().unwrap()),
5140                                 dst.only_reg().unwrap(),
5141                             ));
5142                         }
5143                     }
5144                     FcmpCondResult::AndConditions(_, _) => {
5145                         unreachable!(
5146                             "can't AND with select; see above comment about inverting equal"
5147                         );
5148                     }
5149                     FcmpCondResult::InvertedEqualOrConditions(cc1, cc2)
5150                     | FcmpCondResult::OrConditions(cc1, cc2) => {
5151                         if is_int_or_ref_ty(ty) || ty == types::I128 {
5152                             let size = ty.bytes() as u8;
5153                             emit_cmoves(ctx, size, cc1, lhs.clone(), dst);
5154                             emit_cmoves(ctx, size, cc2, lhs, dst);
5155                         } else {
5156                             ctx.emit(Inst::xmm_cmove(
5157                                 operand_size,
5158                                 cc1,
5159                                 RegMem::reg(lhs.only_reg().unwrap()),
5160                                 dst.only_reg().unwrap(),
5161                             ));
5162                             ctx.emit(Inst::xmm_cmove(
5163                                 operand_size,
5164                                 cc2,
5165                                 RegMem::reg(lhs.only_reg().unwrap()),
5166                                 dst.only_reg().unwrap(),
5167                             ));
5168                         }
5169                     }
5170                 }
5171             } else {
5172                 let ty = ty.unwrap();
5173 
5174                 let size = ty.bytes() as u8;
5175                 let lhs = put_input_in_regs(ctx, inputs[1]);
5176                 let rhs = put_input_in_regs(ctx, inputs[2]);
5177                 let dst = get_output_reg(ctx, outputs[0]);
5178 
5179                 let cc = if let Some(icmp) = matches_input(ctx, flag_input, Opcode::Icmp) {
5180                     let cond_code = ctx.data(icmp).cond_code().unwrap();
5181                     let cond_code = emit_cmp(ctx, icmp, cond_code);
5182                     CC::from_intcc(cond_code)
5183                 } else {
5184                     let sel_ty = ctx.input_ty(insn, 0);
5185                     let size = OperandSize::from_ty(ctx.input_ty(insn, 0));
5186                     let test = put_input_in_reg(ctx, flag_input);
5187                     let test_input = if sel_ty == types::B1 {
5188                         // The input is a boolean value; test the LSB for nonzero with:
5189                         //     test reg, 1
5190                         RegMemImm::imm(1)
5191                     } else {
5192                         // The input is an integer; test the whole value for
5193                         // nonzero with:
5194                         //     test reg, reg
5195                         //
5196                         // (It doesn't make sense to have a boolean wider than
5197                         // one bit here -- which bit would cause us to select an
5198                         // input?)
5199                         assert!(!is_bool_ty(sel_ty));
5200                         RegMemImm::reg(test)
5201                     };
5202                     ctx.emit(Inst::test_rmi_r(size, test_input, test));
5203                     CC::NZ
5204                 };
5205 
5206                 // This doesn't affect the flags.
5207                 emit_moves(ctx, dst, rhs, ty);
5208 
5209                 if is_int_or_ref_ty(ty) || ty == types::I128 {
5210                     emit_cmoves(ctx, size, cc, lhs, dst);
5211                 } else {
5212                     debug_assert!(ty == types::F32 || ty == types::F64);
5213                     ctx.emit(Inst::xmm_cmove(
5214                         if ty == types::F64 {
5215                             OperandSize::Size64
5216                         } else {
5217                             OperandSize::Size32
5218                         },
5219                         cc,
5220                         RegMem::reg(lhs.only_reg().unwrap()),
5221                         dst.only_reg().unwrap(),
5222                     ));
5223                 }
5224             }
5225         }
5226 
5227         Opcode::Selectif | Opcode::SelectifSpectreGuard => {
5228             let lhs = put_input_in_regs(ctx, inputs[1]);
5229             let rhs = put_input_in_regs(ctx, inputs[2]);
5230             let dst = get_output_reg(ctx, outputs[0]);
5231             let ty = ctx.output_ty(insn, 0);
5232 
5233             // Verification ensures that the input is always a single-def ifcmp.
5234             let cmp_insn = ctx
5235                 .get_input_as_source_or_const(inputs[0].insn, inputs[0].input)
5236                 .inst
5237                 .unwrap()
5238                 .0;
5239             debug_assert_eq!(ctx.data(cmp_insn).opcode(), Opcode::Ifcmp);
5240             let cond_code = ctx.data(insn).cond_code().unwrap();
5241             let cond_code = emit_cmp(ctx, cmp_insn, cond_code);
5242 
5243             let cc = CC::from_intcc(cond_code);
5244 
5245             if is_int_or_ref_ty(ty) || ty == types::I128 {
5246                 let size = ty.bytes() as u8;
5247                 emit_moves(ctx, dst, rhs, ty);
5248                 emit_cmoves(ctx, size, cc, lhs, dst);
5249             } else {
5250                 debug_assert!(ty == types::F32 || ty == types::F64);
5251                 emit_moves(ctx, dst, rhs, ty);
5252                 ctx.emit(Inst::xmm_cmove(
5253                     if ty == types::F64 {
5254                         OperandSize::Size64
5255                     } else {
5256                         OperandSize::Size32
5257                     },
5258                     cc,
5259                     RegMem::reg(lhs.only_reg().unwrap()),
5260                     dst.only_reg().unwrap(),
5261                 ));
5262             }
5263         }
5264 
5265         Opcode::Udiv | Opcode::Urem | Opcode::Sdiv | Opcode::Srem => {
5266             let kind = match op {
5267                 Opcode::Udiv => DivOrRemKind::UnsignedDiv,
5268                 Opcode::Sdiv => DivOrRemKind::SignedDiv,
5269                 Opcode::Urem => DivOrRemKind::UnsignedRem,
5270                 Opcode::Srem => DivOrRemKind::SignedRem,
5271                 _ => unreachable!(),
5272             };
5273             let is_div = kind.is_div();
5274 
5275             let input_ty = ctx.input_ty(insn, 0);
5276             let size = OperandSize::from_ty(input_ty);
5277 
5278             let dividend = put_input_in_reg(ctx, inputs[0]);
5279             let dst = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
5280 
5281             ctx.emit(Inst::gen_move(
5282                 Writable::from_reg(regs::rax()),
5283                 dividend,
5284                 input_ty,
5285             ));
5286 
5287             // Always do explicit checks for `srem`: otherwise, INT_MIN % -1 is not handled properly.
5288             if flags.avoid_div_traps() || op == Opcode::Srem {
5289                 // A vcode meta-instruction is used to lower the inline checks, since they embed
5290                 // pc-relative offsets that must not change, thus requiring regalloc to not
5291                 // interfere by introducing spills and reloads.
5292                 //
5293                 // Note it keeps the result in $rax (for divide) or $rdx (for rem), so that
5294                 // regalloc is aware of the coalescing opportunity between rax/rdx and the
5295                 // destination register.
5296                 let divisor = put_input_in_reg(ctx, inputs[1]);
5297 
5298                 let divisor_copy = ctx.alloc_tmp(types::I64).only_reg().unwrap();
5299                 ctx.emit(Inst::gen_move(divisor_copy, divisor, types::I64));
5300 
5301                 let tmp = if op == Opcode::Sdiv && size == OperandSize::Size64 {
5302                     Some(ctx.alloc_tmp(types::I64).only_reg().unwrap())
5303                 } else {
5304                     None
5305                 };
5306                 // TODO use xor
5307                 ctx.emit(Inst::imm(
5308                     OperandSize::Size32,
5309                     0,
5310                     Writable::from_reg(regs::rdx()),
5311                 ));
5312                 ctx.emit(Inst::checked_div_or_rem_seq(kind, size, divisor_copy, tmp));
5313             } else {
5314                 // We don't want more than one trap record for a single instruction,
5315                 // so let's not allow the "mem" case (load-op merging) here; force
5316                 // divisor into a register instead.
5317                 let divisor = RegMem::reg(put_input_in_reg(ctx, inputs[1]));
5318 
5319                 // Fill in the high parts:
5320                 if kind.is_signed() {
5321                     // sign-extend the sign-bit of al into ah for size 1, or rax into rdx, for
5322                     // signed opcodes.
5323                     ctx.emit(Inst::sign_extend_data(size));
5324                 } else if input_ty == types::I8 {
5325                     ctx.emit(Inst::movzx_rm_r(
5326                         ExtMode::BL,
5327                         RegMem::reg(regs::rax()),
5328                         Writable::from_reg(regs::rax()),
5329                     ));
5330                 } else {
5331                     // zero for unsigned opcodes.
5332                     ctx.emit(Inst::imm(
5333                         OperandSize::Size64,
5334                         0,
5335                         Writable::from_reg(regs::rdx()),
5336                     ));
5337                 }
5338 
5339                 // Emit the actual idiv.
5340                 ctx.emit(Inst::div(size, kind.is_signed(), divisor));
5341             }
5342 
5343             // Move the result back into the destination reg.
5344             if is_div {
5345                 // The quotient is in rax.
5346                 ctx.emit(Inst::gen_move(dst, regs::rax(), input_ty));
5347             } else {
5348                 if size == OperandSize::Size8 {
5349                     // The remainder is in AH. Right-shift by 8 bits then move from rax.
5350                     ctx.emit(Inst::shift_r(
5351                         OperandSize::Size64,
5352                         ShiftKind::ShiftRightLogical,
5353                         Some(8),
5354                         Writable::from_reg(regs::rax()),
5355                     ));
5356                     ctx.emit(Inst::gen_move(dst, regs::rax(), input_ty));
5357                 } else {
5358                     // The remainder is in rdx.
5359                     ctx.emit(Inst::gen_move(dst, regs::rdx(), input_ty));
5360                 }
5361             }
5362         }
5363 
5364         Opcode::Umulhi | Opcode::Smulhi => {
5365             let input_ty = ctx.input_ty(insn, 0);
5366 
5367             let lhs = put_input_in_reg(ctx, inputs[0]);
5368             let rhs = input_to_reg_mem(ctx, inputs[1]);
5369             let dst = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
5370 
5371             // Move lhs in %rax.
5372             ctx.emit(Inst::gen_move(
5373                 Writable::from_reg(regs::rax()),
5374                 lhs,
5375                 input_ty,
5376             ));
5377 
5378             // Emit the actual mul or imul.
5379             let signed = op == Opcode::Smulhi;
5380             ctx.emit(Inst::mul_hi(OperandSize::from_ty(input_ty), signed, rhs));
5381 
5382             // Read the result from the high part (stored in %rdx).
5383             ctx.emit(Inst::gen_move(dst, regs::rdx(), input_ty));
5384         }
5385 
5386         Opcode::GetPinnedReg => {
5387             let dst = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
5388             ctx.emit(Inst::gen_move(dst, regs::pinned_reg(), types::I64));
5389         }
5390 
5391         Opcode::SetPinnedReg => {
5392             let src = put_input_in_reg(ctx, inputs[0]);
5393             ctx.emit(Inst::gen_move(
5394                 Writable::from_reg(regs::pinned_reg()),
5395                 src,
5396                 types::I64,
5397             ));
5398         }
5399 
5400         Opcode::Vconst => {
5401             let used_constant = if let &InstructionData::UnaryConst {
5402                 constant_handle, ..
5403             } = ctx.data(insn)
5404             {
5405                 ctx.use_constant(VCodeConstantData::Pool(
5406                     constant_handle,
5407                     ctx.get_constant_data(constant_handle).clone(),
5408                 ))
5409             } else {
5410                 unreachable!("vconst should always have unary_const format")
5411             };
5412             // TODO use Inst::gen_constant() instead.
5413             let dst = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
5414             let ty = ty.unwrap();
5415             ctx.emit(Inst::xmm_load_const(used_constant, dst, ty));
5416         }
5417 
5418         Opcode::RawBitcast => {
5419             // A raw_bitcast is just a mechanism for correcting the type of V128 values (see
5420             // https://github.com/bytecodealliance/wasmtime/issues/1147). As such, this IR
5421             // instruction should emit no machine code but a move is necessary to give the register
5422             // allocator a definition for the output virtual register.
5423             let src = put_input_in_reg(ctx, inputs[0]);
5424             let dst = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
5425             let ty = ty.unwrap();
5426             ctx.emit(Inst::gen_move(dst, src, ty));
5427         }
5428 
5429         Opcode::Shuffle => {
5430             let ty = ty.unwrap();
5431             let dst = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
5432             let lhs_ty = ctx.input_ty(insn, 0);
5433             let lhs = put_input_in_reg(ctx, inputs[0]);
5434             let rhs = put_input_in_reg(ctx, inputs[1]);
5435             let mask = match ctx.get_immediate(insn) {
5436                 Some(DataValue::V128(bytes)) => bytes.to_vec(),
5437                 _ => unreachable!("shuffle should always have a 16-byte immediate"),
5438             };
5439 
5440             // A mask-building helper: in 128-bit SIMD, 0-15 indicate which lane to read from and a
5441             // 1 in the most significant position zeroes the lane.
5442             let zero_unknown_lane_index = |b: u8| if b > 15 { 0b10000000 } else { b };
5443 
5444             ctx.emit(Inst::gen_move(dst, rhs, ty));
5445             if rhs == lhs {
5446                 // If `lhs` and `rhs` are the same we can use a single PSHUFB to shuffle the XMM
5447                 // register. We statically build `constructed_mask` to zero out any unknown lane
5448                 // indices (may not be completely necessary: verification could fail incorrect mask
5449                 // values) and fix the indexes to all point to the `dst` vector.
5450                 let constructed_mask = mask
5451                     .iter()
5452                     // If the mask is greater than 15 it still may be referring to a lane in b.
5453                     .map(|&b| if b > 15 { b.wrapping_sub(16) } else { b })
5454                     .map(zero_unknown_lane_index)
5455                     .collect();
5456                 let constant = ctx.use_constant(VCodeConstantData::Generated(constructed_mask));
5457                 let tmp = ctx.alloc_tmp(types::I8X16).only_reg().unwrap();
5458                 ctx.emit(Inst::xmm_load_const(constant, tmp, ty));
5459                 // After loading the constructed mask in a temporary register, we use this to
5460                 // shuffle the `dst` register (remember that, in this case, it is the same as
5461                 // `src` so we disregard this register).
5462                 ctx.emit(Inst::xmm_rm_r(SseOpcode::Pshufb, RegMem::from(tmp), dst));
5463             } else {
5464                 // If `lhs` and `rhs` are different, we must shuffle each separately and then OR
5465                 // them together. This is necessary due to PSHUFB semantics. As in the case above,
5466                 // we build the `constructed_mask` for each case statically.
5467 
5468                 // PSHUFB the `lhs` argument into `tmp0`, placing zeroes for unused lanes.
5469                 let tmp0 = ctx.alloc_tmp(lhs_ty).only_reg().unwrap();
5470                 ctx.emit(Inst::gen_move(tmp0, lhs, lhs_ty));
5471                 let constructed_mask = mask.iter().cloned().map(zero_unknown_lane_index).collect();
5472                 let constant = ctx.use_constant(VCodeConstantData::Generated(constructed_mask));
5473                 let tmp1 = ctx.alloc_tmp(types::I8X16).only_reg().unwrap();
5474                 ctx.emit(Inst::xmm_load_const(constant, tmp1, ty));
5475                 ctx.emit(Inst::xmm_rm_r(SseOpcode::Pshufb, RegMem::from(tmp1), tmp0));
5476 
5477                 // PSHUFB the second argument, placing zeroes for unused lanes.
5478                 let constructed_mask = mask
5479                     .iter()
5480                     .map(|b| b.wrapping_sub(16))
5481                     .map(zero_unknown_lane_index)
5482                     .collect();
5483                 let constant = ctx.use_constant(VCodeConstantData::Generated(constructed_mask));
5484                 let tmp2 = ctx.alloc_tmp(types::I8X16).only_reg().unwrap();
5485                 ctx.emit(Inst::xmm_load_const(constant, tmp2, ty));
5486                 ctx.emit(Inst::xmm_rm_r(SseOpcode::Pshufb, RegMem::from(tmp2), dst));
5487 
5488                 // OR the shuffled registers (the mechanism and lane-size for OR-ing the registers
5489                 // is not important).
5490                 ctx.emit(Inst::xmm_rm_r(SseOpcode::Orps, RegMem::from(tmp0), dst));
5491 
5492                 // TODO when AVX512 is enabled we should replace this sequence with a single VPERMB
5493             }
5494         }
5495 
5496         Opcode::Swizzle => {
5497             // SIMD swizzle; the following inefficient implementation is due to the Wasm SIMD spec
5498             // requiring mask indexes greater than 15 to have the same semantics as a 0 index. For
5499             // the spec discussion, see https://github.com/WebAssembly/simd/issues/93. The CLIF
5500             // semantics match the Wasm SIMD semantics for this instruction.
5501             // The instruction format maps to variables like: %dst = swizzle %src, %mask
5502             let ty = ty.unwrap();
5503             let dst = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
5504             let src = put_input_in_reg(ctx, inputs[0]);
5505             let swizzle_mask = put_input_in_reg(ctx, inputs[1]);
5506 
5507             // Inform the register allocator that `src` and `dst` should be in the same register.
5508             ctx.emit(Inst::gen_move(dst, src, ty));
5509 
5510             // Create a mask for zeroing out-of-bounds lanes of the swizzle mask.
5511             let zero_mask = ctx.alloc_tmp(types::I8X16).only_reg().unwrap();
5512             static ZERO_MASK_VALUE: [u8; 16] = [
5513                 0x70, 0x70, 0x70, 0x70, 0x70, 0x70, 0x70, 0x70, 0x70, 0x70, 0x70, 0x70, 0x70, 0x70,
5514                 0x70, 0x70,
5515             ];
5516             let constant = ctx.use_constant(VCodeConstantData::WellKnown(&ZERO_MASK_VALUE));
5517             ctx.emit(Inst::xmm_load_const(constant, zero_mask, ty));
5518 
5519             // Use the `zero_mask` on a writable `swizzle_mask`.
5520             let swizzle_mask = Writable::from_reg(swizzle_mask);
5521             ctx.emit(Inst::xmm_rm_r(
5522                 SseOpcode::Paddusb,
5523                 RegMem::from(zero_mask),
5524                 swizzle_mask,
5525             ));
5526 
5527             // Shuffle `dst` using the fixed-up `swizzle_mask`.
5528             ctx.emit(Inst::xmm_rm_r(
5529                 SseOpcode::Pshufb,
5530                 RegMem::from(swizzle_mask),
5531                 dst,
5532             ));
5533         }
5534 
5535         Opcode::Insertlane => {
5536             // The instruction format maps to variables like: %dst = insertlane %in_vec, %src, %lane
5537             let ty = ty.unwrap();
5538             let dst = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
5539             let in_vec = put_input_in_reg(ctx, inputs[0]);
5540             let src_ty = ctx.input_ty(insn, 1);
5541             debug_assert!(!src_ty.is_vector());
5542             let src = input_to_reg_mem(ctx, inputs[1]);
5543             let lane = if let InstructionData::TernaryImm8 { imm, .. } = ctx.data(insn) {
5544                 *imm
5545             } else {
5546                 unreachable!();
5547             };
5548             debug_assert!(lane < ty.lane_count() as u8);
5549 
5550             ctx.emit(Inst::gen_move(dst, in_vec, ty));
5551             emit_insert_lane(ctx, src, dst, lane, ty.lane_type());
5552         }
5553 
5554         Opcode::Extractlane => {
5555             // The instruction format maps to variables like: %dst = extractlane %src, %lane
5556             let ty = ty.unwrap();
5557             let dst = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
5558             let src_ty = ctx.input_ty(insn, 0);
5559             assert_eq!(src_ty.bits(), 128);
5560             let src = put_input_in_reg(ctx, inputs[0]);
5561             let lane = if let InstructionData::BinaryImm8 { imm, .. } = ctx.data(insn) {
5562                 *imm
5563             } else {
5564                 unreachable!();
5565             };
5566             debug_assert!(lane < src_ty.lane_count() as u8);
5567 
5568             emit_extract_lane(ctx, src, dst, lane, ty);
5569         }
5570 
5571         Opcode::ScalarToVector => {
5572             // When moving a scalar value to a vector register, we must be handle several
5573             // situations:
5574             //  1. a scalar float is already in an XMM register, so we simply move it
5575             //  2. a scalar of any other type resides in a GPR register: MOVD moves the bits to an
5576             //     XMM register and zeroes the upper bits
5577             //  3. a scalar (float or otherwise) that has previously been loaded from memory (e.g.
5578             //     the default lowering of Wasm's `load[32|64]_zero`) can be lowered to a single
5579             //     MOVSS/MOVSD instruction; to do this, we rely on `input_to_reg_mem` to sink the
5580             //     unused load.
5581             let src = input_to_reg_mem(ctx, inputs[0]);
5582             let src_ty = ctx.input_ty(insn, 0);
5583             let dst = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
5584             let dst_ty = ty.unwrap();
5585             assert!(src_ty == dst_ty.lane_type() && dst_ty.bits() == 128);
5586             match src {
5587                 RegMem::Reg { reg } => {
5588                     if src_ty.is_float() {
5589                         // Case 1: when moving a scalar float, we simply move from one XMM register
5590                         // to another, expecting the register allocator to elide this. Here we
5591                         // assume that the upper bits of a scalar float have not been munged with
5592                         // (the same assumption the old backend makes).
5593                         ctx.emit(Inst::gen_move(dst, reg, dst_ty));
5594                     } else {
5595                         // Case 2: when moving a scalar value of any other type, use MOVD to zero
5596                         // the upper lanes.
5597                         let src_size = match src_ty.bits() {
5598                             32 => OperandSize::Size32,
5599                             64 => OperandSize::Size64,
5600                             _ => unimplemented!("invalid source size for type: {}", src_ty),
5601                         };
5602                         ctx.emit(Inst::gpr_to_xmm(SseOpcode::Movd, src, src_size, dst));
5603                     }
5604                 }
5605                 RegMem::Mem { .. } => {
5606                     // Case 3: when presented with `load + scalar_to_vector`, coalesce into a single
5607                     // MOVSS/MOVSD instruction.
5608                     let opcode = match src_ty.bits() {
5609                         32 => SseOpcode::Movss,
5610                         64 => SseOpcode::Movsd,
5611                         _ => unimplemented!("unable to move scalar to vector for type: {}", src_ty),
5612                     };
5613                     ctx.emit(Inst::xmm_mov(opcode, src, dst));
5614                 }
5615             }
5616         }
5617 
5618         Opcode::Splat => {
5619             let ty = ty.unwrap();
5620             assert_eq!(ty.bits(), 128);
5621             let src_ty = ctx.input_ty(insn, 0);
5622             assert!(src_ty.bits() < 128);
5623 
5624             let src = input_to_reg_mem(ctx, inputs[0]);
5625             let dst = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
5626 
5627             // We know that splat will overwrite all of the lanes of `dst` but it takes several
5628             // instructions to do so. Because of the multiple instructions, there is no good way to
5629             // declare `dst` a `def` except with the following pseudo-instruction.
5630             ctx.emit(Inst::xmm_uninit_value(dst));
5631 
5632             // TODO: eventually many of these sequences could be optimized with AVX's VBROADCAST*
5633             // and VPBROADCAST*.
5634             match ty.lane_bits() {
5635                 8 => {
5636                     emit_insert_lane(ctx, src, dst, 0, ty.lane_type());
5637                     // Initialize a register with all 0s.
5638                     let tmp = ctx.alloc_tmp(ty).only_reg().unwrap();
5639                     ctx.emit(Inst::xmm_rm_r(SseOpcode::Pxor, RegMem::from(tmp), tmp));
5640                     // Shuffle the lowest byte lane to all other lanes.
5641                     ctx.emit(Inst::xmm_rm_r(SseOpcode::Pshufb, RegMem::from(tmp), dst))
5642                 }
5643                 16 => {
5644                     emit_insert_lane(ctx, src.clone(), dst, 0, ty.lane_type());
5645                     emit_insert_lane(ctx, src, dst, 1, ty.lane_type());
5646                     // Shuffle the lowest two lanes to all other lanes.
5647                     ctx.emit(Inst::xmm_rm_r_imm(
5648                         SseOpcode::Pshufd,
5649                         RegMem::from(dst),
5650                         dst,
5651                         0,
5652                         OperandSize::Size32,
5653                     ))
5654                 }
5655                 32 => {
5656                     emit_insert_lane(ctx, src, dst, 0, ty.lane_type());
5657                     // Shuffle the lowest lane to all other lanes.
5658                     ctx.emit(Inst::xmm_rm_r_imm(
5659                         SseOpcode::Pshufd,
5660                         RegMem::from(dst),
5661                         dst,
5662                         0,
5663                         OperandSize::Size32,
5664                     ))
5665                 }
5666                 64 => {
5667                     emit_insert_lane(ctx, src.clone(), dst, 0, ty.lane_type());
5668                     emit_insert_lane(ctx, src, dst, 1, ty.lane_type());
5669                 }
5670                 _ => panic!("Invalid type to splat: {}", ty),
5671             }
5672         }
5673 
5674         Opcode::VanyTrue => {
5675             let dst = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
5676             let src_ty = ctx.input_ty(insn, 0);
5677             assert_eq!(src_ty.bits(), 128);
5678             let src = put_input_in_reg(ctx, inputs[0]);
5679             // Set the ZF if the result is all zeroes.
5680             ctx.emit(Inst::xmm_cmp_rm_r(SseOpcode::Ptest, RegMem::reg(src), src));
5681             // If the ZF is not set, place a 1 in `dst`.
5682             ctx.emit(Inst::setcc(CC::NZ, dst));
5683         }
5684 
5685         Opcode::VallTrue => {
5686             let dst = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
5687             let src_ty = ctx.input_ty(insn, 0);
5688             assert_eq!(src_ty.bits(), 128);
5689             let src = input_to_reg_mem(ctx, inputs[0]);
5690 
5691             let eq = |ty: Type| match ty.lane_bits() {
5692                 8 => SseOpcode::Pcmpeqb,
5693                 16 => SseOpcode::Pcmpeqw,
5694                 32 => SseOpcode::Pcmpeqd,
5695                 64 => SseOpcode::Pcmpeqq,
5696                 _ => panic!("Unable to find an instruction for {} for type: {}", op, ty),
5697             };
5698 
5699             // Initialize a register with all 0s.
5700             let tmp = ctx.alloc_tmp(src_ty).only_reg().unwrap();
5701             ctx.emit(Inst::xmm_rm_r(SseOpcode::Pxor, RegMem::from(tmp), tmp));
5702             // Compare to see what lanes are filled with all 1s.
5703             ctx.emit(Inst::xmm_rm_r(eq(src_ty), src, tmp));
5704             // Set the ZF if the result is all zeroes.
5705             ctx.emit(Inst::xmm_cmp_rm_r(
5706                 SseOpcode::Ptest,
5707                 RegMem::from(tmp),
5708                 tmp.to_reg(),
5709             ));
5710             // If the ZF is set, place a 1 in `dst`.
5711             ctx.emit(Inst::setcc(CC::Z, dst));
5712         }
5713 
5714         Opcode::VhighBits => {
5715             let src = put_input_in_reg(ctx, inputs[0]);
5716             let src_ty = ctx.input_ty(insn, 0);
5717             debug_assert!(src_ty.is_vector() && src_ty.bits() == 128);
5718             let dst = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
5719             debug_assert!(dst.to_reg().get_class() == RegClass::I64);
5720 
5721             // The Intel specification allows using both 32-bit and 64-bit GPRs as destination for
5722             // the "move mask" instructions. This is controlled by the REX.R bit: "In 64-bit mode,
5723             // the instruction can access additional registers when used with a REX.R prefix. The
5724             // default operand size is 64-bit in 64-bit mode" (PMOVMSKB in IA Software Development
5725             // Manual, vol. 2). This being the case, we will always clear REX.W since its use is
5726             // unnecessary (`OperandSize` is used for setting/clearing REX.W).
5727             let size = OperandSize::Size32;
5728 
5729             match src_ty {
5730                 types::I8X16 | types::B8X16 => {
5731                     ctx.emit(Inst::xmm_to_gpr(SseOpcode::Pmovmskb, src, dst, size))
5732                 }
5733                 types::I32X4 | types::B32X4 | types::F32X4 => {
5734                     ctx.emit(Inst::xmm_to_gpr(SseOpcode::Movmskps, src, dst, size))
5735                 }
5736                 types::I64X2 | types::B64X2 | types::F64X2 => {
5737                     ctx.emit(Inst::xmm_to_gpr(SseOpcode::Movmskpd, src, dst, size))
5738                 }
5739                 types::I16X8 | types::B16X8 => {
5740                     // There is no x86 instruction for extracting the high bit of 16-bit lanes so
5741                     // here we:
5742                     // - duplicate the 16-bit lanes of `src` into 8-bit lanes:
5743                     //     PACKSSWB([x1, x2, ...], [x1, x2, ...]) = [x1', x2', ..., x1', x2', ...]
5744                     // - use PMOVMSKB to gather the high bits; now we have duplicates, though
5745                     // - shift away the bottom 8 high bits to remove the duplicates.
5746                     let tmp = ctx.alloc_tmp(src_ty).only_reg().unwrap();
5747                     ctx.emit(Inst::gen_move(tmp, src, src_ty));
5748                     ctx.emit(Inst::xmm_rm_r(SseOpcode::Packsswb, RegMem::reg(src), tmp));
5749                     ctx.emit(Inst::xmm_to_gpr(
5750                         SseOpcode::Pmovmskb,
5751                         tmp.to_reg(),
5752                         dst,
5753                         size,
5754                     ));
5755                     ctx.emit(Inst::shift_r(
5756                         OperandSize::Size64,
5757                         ShiftKind::ShiftRightLogical,
5758                         Some(8),
5759                         dst,
5760                     ));
5761                 }
5762                 _ => unimplemented!("unknown input type {} for {}", src_ty, op),
5763             }
5764         }
5765 
5766         Opcode::Iconcat => {
5767             let ty = ctx.output_ty(insn, 0);
5768             assert_eq!(
5769                 ty,
5770                 types::I128,
5771                 "Iconcat not expected to be used for non-128-bit type"
5772             );
5773             assert_eq!(ctx.input_ty(insn, 0), types::I64);
5774             assert_eq!(ctx.input_ty(insn, 1), types::I64);
5775             let lo = put_input_in_reg(ctx, inputs[0]);
5776             let hi = put_input_in_reg(ctx, inputs[1]);
5777             let dst = get_output_reg(ctx, outputs[0]);
5778             ctx.emit(Inst::gen_move(dst.regs()[0], lo, types::I64));
5779             ctx.emit(Inst::gen_move(dst.regs()[1], hi, types::I64));
5780         }
5781 
5782         Opcode::Isplit => {
5783             let ty = ctx.input_ty(insn, 0);
5784             assert_eq!(
5785                 ty,
5786                 types::I128,
5787                 "Iconcat not expected to be used for non-128-bit type"
5788             );
5789             assert_eq!(ctx.output_ty(insn, 0), types::I64);
5790             assert_eq!(ctx.output_ty(insn, 1), types::I64);
5791             let src = put_input_in_regs(ctx, inputs[0]);
5792             let dst_lo = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
5793             let dst_hi = get_output_reg(ctx, outputs[1]).only_reg().unwrap();
5794             ctx.emit(Inst::gen_move(dst_lo, src.regs()[0], types::I64));
5795             ctx.emit(Inst::gen_move(dst_hi, src.regs()[1], types::I64));
5796         }
5797 
5798         Opcode::TlsValue => match flags.tls_model() {
5799             TlsModel::ElfGd => {
5800                 let dst = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
5801                 let (name, _, _) = ctx.symbol_value(insn).unwrap();
5802                 let symbol = name.clone();
5803                 ctx.emit(Inst::ElfTlsGetAddr { symbol });
5804                 ctx.emit(Inst::gen_move(dst, regs::rax(), types::I64));
5805             }
5806             TlsModel::Macho => {
5807                 let dst = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
5808                 let (name, _, _) = ctx.symbol_value(insn).unwrap();
5809                 let symbol = name.clone();
5810                 ctx.emit(Inst::MachOTlsGetAddr { symbol });
5811                 ctx.emit(Inst::gen_move(dst, regs::rax(), types::I64));
5812             }
5813             _ => {
5814                 todo!(
5815                     "Unimplemented TLS model in x64 backend: {:?}",
5816                     flags.tls_model()
5817                 );
5818             }
5819         },
5820 
5821         Opcode::IaddImm
5822         | Opcode::ImulImm
5823         | Opcode::UdivImm
5824         | Opcode::SdivImm
5825         | Opcode::UremImm
5826         | Opcode::SremImm
5827         | Opcode::IrsubImm
5828         | Opcode::IaddCin
5829         | Opcode::IaddIfcin
5830         | Opcode::IaddCout
5831         | Opcode::IaddCarry
5832         | Opcode::IaddIfcarry
5833         | Opcode::IsubBin
5834         | Opcode::IsubIfbin
5835         | Opcode::IsubBout
5836         | Opcode::IsubIfbout
5837         | Opcode::IsubBorrow
5838         | Opcode::IsubIfborrow
5839         | Opcode::BandImm
5840         | Opcode::BorImm
5841         | Opcode::BxorImm
5842         | Opcode::RotlImm
5843         | Opcode::RotrImm
5844         | Opcode::IshlImm
5845         | Opcode::UshrImm
5846         | Opcode::SshrImm => {
5847             panic!("ALU+imm and ALU+carry ops should not appear here!");
5848         }
5849         _ => unimplemented!("unimplemented lowering for opcode {:?}", op),
5850     }
5851 
5852     Ok(())
5853 }
5854 
5855 //=============================================================================
5856 // Lowering-backend trait implementation.
5857 
5858 impl LowerBackend for X64Backend {
5859     type MInst = Inst;
5860 
lower<C: LowerCtx<I = Inst>>(&self, ctx: &mut C, ir_inst: IRInst) -> CodegenResult<()>5861     fn lower<C: LowerCtx<I = Inst>>(&self, ctx: &mut C, ir_inst: IRInst) -> CodegenResult<()> {
5862         lower_insn_to_regs(ctx, ir_inst, &self.flags, &self.x64_flags, &self.triple)
5863     }
5864 
lower_branch_group<C: LowerCtx<I = Inst>>( &self, ctx: &mut C, branches: &[IRInst], targets: &[MachLabel], ) -> CodegenResult<()>5865     fn lower_branch_group<C: LowerCtx<I = Inst>>(
5866         &self,
5867         ctx: &mut C,
5868         branches: &[IRInst],
5869         targets: &[MachLabel],
5870     ) -> CodegenResult<()> {
5871         // A block should end with at most two branches. The first may be a
5872         // conditional branch; a conditional branch can be followed only by an
5873         // unconditional branch or fallthrough. Otherwise, if only one branch,
5874         // it may be an unconditional branch, a fallthrough, a return, or a
5875         // trap. These conditions are verified by `is_ebb_basic()` during the
5876         // verifier pass.
5877         assert!(branches.len() <= 2);
5878 
5879         if branches.len() == 2 {
5880             // Must be a conditional branch followed by an unconditional branch.
5881             let op0 = ctx.data(branches[0]).opcode();
5882             let op1 = ctx.data(branches[1]).opcode();
5883 
5884             trace!(
5885                 "lowering two-branch group: opcodes are {:?} and {:?}",
5886                 op0,
5887                 op1
5888             );
5889             assert!(op1 == Opcode::Jump || op1 == Opcode::Fallthrough);
5890 
5891             let taken = targets[0];
5892             // not_taken target is the target of the second branch, even if it is a Fallthrough
5893             // instruction: because we reorder blocks while we lower, the fallthrough in the new
5894             // order is not (necessarily) the same as the fallthrough in CLIF. So we use the
5895             // explicitly-provided target.
5896             let not_taken = targets[1];
5897 
5898             match op0 {
5899                 Opcode::Brz | Opcode::Brnz => {
5900                     let flag_input = InsnInput {
5901                         insn: branches[0],
5902                         input: 0,
5903                     };
5904 
5905                     let src_ty = ctx.input_ty(branches[0], 0);
5906 
5907                     if let Some(icmp) = matches_input(ctx, flag_input, Opcode::Icmp) {
5908                         let cond_code = ctx.data(icmp).cond_code().unwrap();
5909                         let cond_code = emit_cmp(ctx, icmp, cond_code);
5910 
5911                         let cond_code = if op0 == Opcode::Brz {
5912                             cond_code.inverse()
5913                         } else {
5914                             cond_code
5915                         };
5916 
5917                         let cc = CC::from_intcc(cond_code);
5918                         ctx.emit(Inst::jmp_cond(cc, taken, not_taken));
5919                     } else if let Some(fcmp) = matches_input(ctx, flag_input, Opcode::Fcmp) {
5920                         let cond_code = ctx.data(fcmp).fp_cond_code().unwrap();
5921                         let cond_code = if op0 == Opcode::Brz {
5922                             cond_code.inverse()
5923                         } else {
5924                             cond_code
5925                         };
5926                         match emit_fcmp(ctx, fcmp, cond_code, FcmpSpec::Normal) {
5927                             FcmpCondResult::Condition(cc) => {
5928                                 ctx.emit(Inst::jmp_cond(cc, taken, not_taken));
5929                             }
5930                             FcmpCondResult::AndConditions(cc1, cc2) => {
5931                                 ctx.emit(Inst::jmp_if(cc1.invert(), not_taken));
5932                                 ctx.emit(Inst::jmp_cond(cc2.invert(), not_taken, taken));
5933                             }
5934                             FcmpCondResult::OrConditions(cc1, cc2) => {
5935                                 ctx.emit(Inst::jmp_if(cc1, taken));
5936                                 ctx.emit(Inst::jmp_cond(cc2, taken, not_taken));
5937                             }
5938                             FcmpCondResult::InvertedEqualOrConditions(_, _) => unreachable!(),
5939                         }
5940                     } else if src_ty == types::I128 {
5941                         let src = put_input_in_regs(
5942                             ctx,
5943                             InsnInput {
5944                                 insn: branches[0],
5945                                 input: 0,
5946                             },
5947                         );
5948                         let (half_cc, comb_op) = match op0 {
5949                             Opcode::Brz => (CC::Z, AluRmiROpcode::And8),
5950                             Opcode::Brnz => (CC::NZ, AluRmiROpcode::Or8),
5951                             _ => unreachable!(),
5952                         };
5953                         let tmp1 = ctx.alloc_tmp(types::I64).only_reg().unwrap();
5954                         let tmp2 = ctx.alloc_tmp(types::I64).only_reg().unwrap();
5955                         ctx.emit(Inst::cmp_rmi_r(
5956                             OperandSize::Size64,
5957                             RegMemImm::imm(0),
5958                             src.regs()[0],
5959                         ));
5960                         ctx.emit(Inst::setcc(half_cc, tmp1));
5961                         ctx.emit(Inst::cmp_rmi_r(
5962                             OperandSize::Size64,
5963                             RegMemImm::imm(0),
5964                             src.regs()[1],
5965                         ));
5966                         ctx.emit(Inst::setcc(half_cc, tmp2));
5967                         ctx.emit(Inst::alu_rmi_r(
5968                             OperandSize::Size32,
5969                             comb_op,
5970                             RegMemImm::reg(tmp1.to_reg()),
5971                             tmp2,
5972                         ));
5973                         ctx.emit(Inst::jmp_cond(CC::NZ, taken, not_taken));
5974                     } else if is_int_or_ref_ty(src_ty) || is_bool_ty(src_ty) {
5975                         let src = put_input_in_reg(
5976                             ctx,
5977                             InsnInput {
5978                                 insn: branches[0],
5979                                 input: 0,
5980                             },
5981                         );
5982                         let cc = match op0 {
5983                             Opcode::Brz => CC::Z,
5984                             Opcode::Brnz => CC::NZ,
5985                             _ => unreachable!(),
5986                         };
5987                         // See case for `Opcode::Select` above re: testing the
5988                         // boolean input.
5989                         let test_input = if src_ty == types::B1 {
5990                             // test src, 1
5991                             RegMemImm::imm(1)
5992                         } else {
5993                             assert!(!is_bool_ty(src_ty));
5994                             // test src, src
5995                             RegMemImm::reg(src)
5996                         };
5997 
5998                         ctx.emit(Inst::test_rmi_r(
5999                             OperandSize::from_ty(src_ty),
6000                             test_input,
6001                             src,
6002                         ));
6003                         ctx.emit(Inst::jmp_cond(cc, taken, not_taken));
6004                     } else {
6005                         unimplemented!("brz/brnz with non-int type {:?}", src_ty);
6006                     }
6007                 }
6008 
6009                 Opcode::BrIcmp => {
6010                     let src_ty = ctx.input_ty(branches[0], 0);
6011                     if is_int_or_ref_ty(src_ty) || is_bool_ty(src_ty) {
6012                         let lhs = put_input_in_reg(
6013                             ctx,
6014                             InsnInput {
6015                                 insn: branches[0],
6016                                 input: 0,
6017                             },
6018                         );
6019                         let rhs = input_to_reg_mem_imm(
6020                             ctx,
6021                             InsnInput {
6022                                 insn: branches[0],
6023                                 input: 1,
6024                             },
6025                         );
6026                         let cc = CC::from_intcc(ctx.data(branches[0]).cond_code().unwrap());
6027                         // Cranelift's icmp semantics want to compare lhs - rhs, while Intel gives
6028                         // us dst - src at the machine instruction level, so invert operands.
6029                         ctx.emit(Inst::cmp_rmi_r(OperandSize::from_ty(src_ty), rhs, lhs));
6030                         ctx.emit(Inst::jmp_cond(cc, taken, not_taken));
6031                     } else {
6032                         unimplemented!("bricmp with non-int type {:?}", src_ty);
6033                     }
6034                 }
6035 
6036                 Opcode::Brif => {
6037                     let flag_input = InsnInput {
6038                         insn: branches[0],
6039                         input: 0,
6040                     };
6041 
6042                     if let Some(ifcmp) = matches_input(ctx, flag_input, Opcode::Ifcmp) {
6043                         let cond_code = ctx.data(branches[0]).cond_code().unwrap();
6044                         let cond_code = emit_cmp(ctx, ifcmp, cond_code);
6045                         let cc = CC::from_intcc(cond_code);
6046                         ctx.emit(Inst::jmp_cond(cc, taken, not_taken));
6047                     } else if let Some(ifcmp_sp) = matches_input(ctx, flag_input, Opcode::IfcmpSp) {
6048                         let operand = put_input_in_reg(
6049                             ctx,
6050                             InsnInput {
6051                                 insn: ifcmp_sp,
6052                                 input: 0,
6053                             },
6054                         );
6055                         let ty = ctx.input_ty(ifcmp_sp, 0);
6056                         ctx.emit(Inst::cmp_rmi_r(
6057                             OperandSize::from_ty(ty),
6058                             RegMemImm::reg(regs::rsp()),
6059                             operand,
6060                         ));
6061                         let cond_code = ctx.data(branches[0]).cond_code().unwrap();
6062                         let cc = CC::from_intcc(cond_code);
6063                         ctx.emit(Inst::jmp_cond(cc, taken, not_taken));
6064                     } else {
6065                         // Should be disallowed by flags checks in verifier.
6066                         unimplemented!("Brif with non-ifcmp input");
6067                     }
6068                 }
6069                 Opcode::Brff => {
6070                     let flag_input = InsnInput {
6071                         insn: branches[0],
6072                         input: 0,
6073                     };
6074 
6075                     if let Some(ffcmp) = matches_input(ctx, flag_input, Opcode::Ffcmp) {
6076                         let cond_code = ctx.data(branches[0]).fp_cond_code().unwrap();
6077                         match emit_fcmp(ctx, ffcmp, cond_code, FcmpSpec::Normal) {
6078                             FcmpCondResult::Condition(cc) => {
6079                                 ctx.emit(Inst::jmp_cond(cc, taken, not_taken));
6080                             }
6081                             FcmpCondResult::AndConditions(cc1, cc2) => {
6082                                 ctx.emit(Inst::jmp_if(cc1.invert(), not_taken));
6083                                 ctx.emit(Inst::jmp_cond(cc2.invert(), not_taken, taken));
6084                             }
6085                             FcmpCondResult::OrConditions(cc1, cc2) => {
6086                                 ctx.emit(Inst::jmp_if(cc1, taken));
6087                                 ctx.emit(Inst::jmp_cond(cc2, taken, not_taken));
6088                             }
6089                             FcmpCondResult::InvertedEqualOrConditions(_, _) => unreachable!(),
6090                         }
6091                     } else {
6092                         // Should be disallowed by flags checks in verifier.
6093                         unimplemented!("Brff with input not from ffcmp");
6094                     }
6095                 }
6096 
6097                 _ => panic!("unexpected branch opcode: {:?}", op0),
6098             }
6099         } else {
6100             assert_eq!(branches.len(), 1);
6101 
6102             // Must be an unconditional branch or trap.
6103             let op = ctx.data(branches[0]).opcode();
6104             match op {
6105                 Opcode::Jump | Opcode::Fallthrough => {
6106                     ctx.emit(Inst::jmp_known(targets[0]));
6107                 }
6108 
6109                 Opcode::BrTable => {
6110                     let jt_size = targets.len() - 1;
6111                     assert!(jt_size <= u32::max_value() as usize);
6112                     let jt_size = jt_size as u32;
6113 
6114                     let idx = extend_input_to_reg(
6115                         ctx,
6116                         InsnInput {
6117                             insn: branches[0],
6118                             input: 0,
6119                         },
6120                         ExtSpec::ZeroExtendTo32,
6121                     );
6122 
6123                     // Bounds-check (compute flags from idx - jt_size) and branch to default.
6124                     ctx.emit(Inst::cmp_rmi_r(
6125                         OperandSize::Size32,
6126                         RegMemImm::imm(jt_size),
6127                         idx,
6128                     ));
6129 
6130                     // Emit the compound instruction that does:
6131                     //
6132                     // lea $jt, %rA
6133                     // movsbl [%rA, %rIndex, 2], %rB
6134                     // add %rB, %rA
6135                     // j *%rA
6136                     // [jt entries]
6137                     //
6138                     // This must be *one* instruction in the vcode because we cannot allow regalloc
6139                     // to insert any spills/fills in the middle of the sequence; otherwise, the
6140                     // lea PC-rel offset to the jumptable would be incorrect.  (The alternative
6141                     // is to introduce a relocation pass for inlined jumptables, which is much
6142                     // worse.)
6143 
6144                     // This temporary is used as a signed integer of 64-bits (to hold addresses).
6145                     let tmp1 = ctx.alloc_tmp(types::I64).only_reg().unwrap();
6146                     // This temporary is used as a signed integer of 32-bits (for the wasm-table
6147                     // index) and then 64-bits (address addend). The small lie about the I64 type
6148                     // is benign, since the temporary is dead after this instruction (and its
6149                     // Cranelift type is thus unused).
6150                     let tmp2 = ctx.alloc_tmp(types::I64).only_reg().unwrap();
6151 
6152                     let targets_for_term: Vec<MachLabel> = targets.to_vec();
6153                     let default_target = targets[0];
6154 
6155                     let jt_targets: Vec<MachLabel> = targets.iter().skip(1).cloned().collect();
6156 
6157                     ctx.emit(Inst::JmpTableSeq {
6158                         idx,
6159                         tmp1,
6160                         tmp2,
6161                         default_target,
6162                         targets: jt_targets,
6163                         targets_for_term,
6164                     });
6165                 }
6166 
6167                 _ => panic!("Unknown branch type {:?}", op),
6168             }
6169         }
6170 
6171         Ok(())
6172     }
6173 
maybe_pinned_reg(&self) -> Option<Reg>6174     fn maybe_pinned_reg(&self) -> Option<Reg> {
6175         Some(regs::pinned_reg())
6176     }
6177 }
6178