//! Lowering rules for X64.

use crate::data_value::DataValue;
use crate::ir::{
    condcodes::FloatCC, condcodes::IntCC, types, AbiParam, ArgumentPurpose, ExternalName,
    Inst as IRInst, InstructionData, LibCall, Opcode, Signature, Type,
};
use crate::isa::x64::abi::*;
use crate::isa::x64::inst::args::*;
use crate::isa::x64::inst::*;
use crate::isa::{x64::settings as x64_settings, x64::X64Backend, CallConv};
use crate::machinst::lower::*;
use crate::machinst::*;
use crate::result::CodegenResult;
use crate::settings::{Flags, TlsModel};
use alloc::boxed::Box;
use alloc::vec::Vec;
use cranelift_codegen_shared::condcodes::CondCode;
use log::trace;
use regalloc::{Reg, RegClass, Writable};
use smallvec::{smallvec, SmallVec};
use std::convert::TryFrom;
use target_lexicon::Triple;

//=============================================================================
// Helpers for instruction lowering.

fn is_int_or_ref_ty(ty: Type) -> bool {
    match ty {
        types::I8 | types::I16 | types::I32 | types::I64 | types::R64 => true,
        types::B1 | types::B8 | types::B16 | types::B32 | types::B64 => true,
        types::R32 => panic!("shouldn't have 32-bit refs on x64"),
        _ => false,
    }
}

fn is_bool_ty(ty: Type) -> bool {
    match ty {
        types::B1 | types::B8 | types::B16 | types::B32 | types::B64 => true,
        types::R32 => panic!("shouldn't have 32-bit refs on x64"),
        _ => false,
    }
}

/// This is target-word-size dependent, and it excludes booleans and reftypes.
fn is_valid_atomic_transaction_ty(ty: Type) -> bool {
    match ty {
        types::I8 | types::I16 | types::I32 | types::I64 => true,
        _ => false,
    }
}

/// Returns the instruction producing the given `input`, if that instruction has Opcode
/// `op`.
// TODO investigate failures with checking against the result index.
fn matches_input<C: LowerCtx<I = Inst>>(
    ctx: &mut C,
    input: InsnInput,
    op: Opcode,
) -> Option<IRInst> {
    let inputs = ctx.get_input_as_source_or_const(input.insn, input.input);
    inputs.inst.and_then(|(src_inst, _)| {
        let data = ctx.data(src_inst);
        if data.opcode() == op {
            return Some(src_inst);
        }
        None
    })
}

/// Returns the instruction producing the given `input`, if that instruction's opcode is any of
/// those specified in `ops`.
fn matches_input_any<C: LowerCtx<I = Inst>>(
    ctx: &mut C,
    input: InsnInput,
    ops: &[Opcode],
) -> Option<IRInst> {
    let inputs = ctx.get_input_as_source_or_const(input.insn, input.input);
    inputs.inst.and_then(|(src_inst, _)| {
        let data = ctx.data(src_inst);
        for &op in ops {
            if data.opcode() == op {
                return Some(src_inst);
            }
        }
        None
    })
}

/// Emits instruction(s) to generate the given 64-bit constant value into newly-allocated
/// temporary registers, returning those registers.
fn generate_constant<C: LowerCtx<I = Inst>>(ctx: &mut C, ty: Type, c: u64) -> ValueRegs<Reg> {
    let from_bits = ty_bits(ty);
    let masked = if from_bits < 64 {
        c & ((1u64 << from_bits) - 1)
    } else {
        c
    };

    let cst_copy = ctx.alloc_tmp(ty);
    for inst in Inst::gen_constant(cst_copy, masked as u128, ty, |ty| {
        ctx.alloc_tmp(ty).only_reg().unwrap()
    })
    .into_iter()
    {
        ctx.emit(inst);
    }
    non_writable_value_regs(cst_copy)
}

/// Put the given input into possibly multiple registers, and mark it as used (side-effect).
fn put_input_in_regs<C: LowerCtx<I = Inst>>(ctx: &mut C, spec: InsnInput) -> ValueRegs<Reg> {
    let ty = ctx.input_ty(spec.insn, spec.input);
    let input = ctx.get_input_as_source_or_const(spec.insn, spec.input);

    if let Some(c) = input.constant {
        // Generate constants fresh at each use to minimize long-range register pressure.
        generate_constant(ctx, ty, c)
    } else {
        ctx.put_input_in_regs(spec.insn, spec.input)
    }
}

/// Put the given input into a register, and mark it as used (side-effect).
fn put_input_in_reg<C: LowerCtx<I = Inst>>(ctx: &mut C, spec: InsnInput) -> Reg {
    put_input_in_regs(ctx, spec)
        .only_reg()
        .expect("Multi-register value not expected")
}

/// Determines whether a load operation (indicated by `src_insn`) can be merged
/// into the current lowering point. If so, returns the address-base source (as
/// an `InsnInput`) and an offset from that address from which to perform the
/// load.
fn is_mergeable_load<C: LowerCtx<I = Inst>>(
    ctx: &mut C,
    src_insn: IRInst,
) -> Option<(InsnInput, i32)> {
    let insn_data = ctx.data(src_insn);
    let inputs = ctx.num_inputs(src_insn);
    if inputs != 1 {
        return None;
    }

    let load_ty = ctx.output_ty(src_insn, 0);
    if ty_bits(load_ty) < 32 {
        // Narrower values are handled by ALU insts that are at least 32 bits
        // wide, which is normally OK as we ignore the upper bits; but, if we
        // generate, e.g., a direct-from-memory 32-bit add for a byte value and
        // the byte is the last byte in a page, the extra data that we load is
        // incorrectly accessed. So we only allow loads to merge for
        // 32-bit-and-above widths.
        return None;
    }

    // Just testing the opcode is enough, because the width will always match if
    // the type does (and the type should match if the CLIF is properly
    // constructed).
    if insn_data.opcode() == Opcode::Load {
        let offset = insn_data
            .load_store_offset()
            .expect("load should have offset");
        Some((
            InsnInput {
                insn: src_insn,
                input: 0,
            },
            offset,
        ))
    } else {
        None
    }
}

/// Put the given input into a register or a memory operand.
/// Effectful: may mark the given input as used, when returning the register form.
fn input_to_reg_mem<C: LowerCtx<I = Inst>>(ctx: &mut C, spec: InsnInput) -> RegMem {
    let inputs = ctx.get_input_as_source_or_const(spec.insn, spec.input);

    if let Some(c) = inputs.constant {
        // Generate constants fresh at each use to minimize long-range register pressure.
        let ty = ctx.input_ty(spec.insn, spec.input);
        return RegMem::reg(generate_constant(ctx, ty, c).only_reg().unwrap());
    }

    if let Some((src_insn, 0)) = inputs.inst {
        if let Some((addr_input, offset)) = is_mergeable_load(ctx, src_insn) {
            ctx.sink_inst(src_insn);
            let amode = lower_to_amode(ctx, addr_input, offset);
            return RegMem::mem(amode);
        }
    }

    RegMem::reg(
        ctx.put_input_in_regs(spec.insn, spec.input)
            .only_reg()
            .unwrap(),
    )
}

/// An extension specification for `extend_input_to_reg`.
#[derive(Clone, Copy)]
enum ExtSpec {
    ZeroExtendTo32,
    ZeroExtendTo64,
    SignExtendTo32,
    #[allow(dead_code)] // not used just yet but may be used in the future!
    SignExtendTo64,
}

/// Put the given input into a register, marking it as used, and do a zero- or sign-extension if
/// required. (This obviously causes side-effects.)
fn extend_input_to_reg<C: LowerCtx<I = Inst>>(
    ctx: &mut C,
    spec: InsnInput,
    ext_spec: ExtSpec,
) -> Reg {
    let requested_size = match ext_spec {
        ExtSpec::ZeroExtendTo32 | ExtSpec::SignExtendTo32 => 32,
        ExtSpec::ZeroExtendTo64 | ExtSpec::SignExtendTo64 => 64,
    };
    let input_size = ctx.input_ty(spec.insn, spec.input).bits();

    let requested_ty = if requested_size == 32 {
        types::I32
    } else {
        types::I64
    };

    let ext_mode = match (input_size, requested_size) {
        (a, b) if a == b => return put_input_in_reg(ctx, spec),
        (1, 8) => return put_input_in_reg(ctx, spec),
        (a, b) => ExtMode::new(a, b).expect(&format!("invalid extension: {} -> {}", a, b)),
    };

    let src = input_to_reg_mem(ctx, spec);
    let dst = ctx.alloc_tmp(requested_ty).only_reg().unwrap();
    match ext_spec {
        ExtSpec::ZeroExtendTo32 | ExtSpec::ZeroExtendTo64 => {
            ctx.emit(Inst::movzx_rm_r(ext_mode, src, dst))
        }
        ExtSpec::SignExtendTo32 | ExtSpec::SignExtendTo64 => {
            ctx.emit(Inst::movsx_rm_r(ext_mode, src, dst))
        }
    }
    dst.to_reg()
}

/// Returns the given input as a 32-bit immediate if it is a constant that can be properly
/// sign-extended, without any possible side-effect.
fn non_reg_input_to_sext_imm(input: NonRegInput, input_ty: Type) -> Option<u32> {
    input.constant.and_then(|x| {
        // For i64 instructions (prefixed with REX.W), require that the immediate will sign-extend
        // to 64 bits. For other sizes, it doesn't matter and we can just use the plain
        // constant.
        if input_ty.bytes() != 8 || low32_will_sign_extend_to_64(x) {
            Some(x as u32)
        } else {
            None
        }
    })
}

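/// Returns the given input's value if it is a compile-time constant.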
fn input_to_imm<C: LowerCtx<I = Inst>>(ctx: &mut C, spec: InsnInput) -> Option<u64> {
    ctx.get_input_as_source_or_const(spec.insn, spec.input)
        .constant
}

/// Put the given input into an immediate, a register or a memory operand.
/// Effectful: may mark the given input as used, when returning the register form.
fn input_to_reg_mem_imm<C: LowerCtx<I = Inst>>(ctx: &mut C, spec: InsnInput) -> RegMemImm {
    let input = ctx.get_input_as_source_or_const(spec.insn, spec.input);
    let input_ty = ctx.input_ty(spec.insn, spec.input);
    match non_reg_input_to_sext_imm(input, input_ty) {
        Some(x) => RegMemImm::imm(x),
        None => match input_to_reg_mem(ctx, spec) {
            RegMem::Reg { reg } => RegMemImm::reg(reg),
            RegMem::Mem { addr } => RegMemImm::mem(addr),
        },
    }
}

/// Emit an instruction to insert a value `src` into a lane of `dst`.
fn emit_insert_lane<C: LowerCtx<I = Inst>>(
    ctx: &mut C,
    src: RegMem,
    dst: Writable<Reg>,
    lane: u8,
    ty: Type,
) {
    if !ty.is_float() {
        let (sse_op, size) = match ty.lane_bits() {
            8 => (SseOpcode::Pinsrb, OperandSize::Size32),
            16 => (SseOpcode::Pinsrw, OperandSize::Size32),
            32 => (SseOpcode::Pinsrd, OperandSize::Size32),
            64 => (SseOpcode::Pinsrd, OperandSize::Size64),
            _ => panic!("Unable to insertlane for lane size: {}", ty.lane_bits()),
        };
        ctx.emit(Inst::xmm_rm_r_imm(sse_op, src, dst, lane, size));
    } else if ty == types::F32 {
        let sse_op = SseOpcode::Insertps;
        // Insert 32 bits from the replacement (at index 00, bits 7:8) to the vector (lane
        // shifted into bits 5:6).
        let lane = 0b00_00_00_00 | lane << 4;
        ctx.emit(Inst::xmm_rm_r_imm(
            sse_op,
            src,
            dst,
            lane,
            OperandSize::Size32,
        ));
    } else if ty == types::F64 {
        let sse_op = match lane {
            // Move the lowest quadword in replacement to vector without changing
            // the upper bits.
            0 => SseOpcode::Movsd,
            // Move the low 64 bits of replacement vector to the high 64 bits of the
            // vector.
            1 => SseOpcode::Movlhps,
            _ => unreachable!(),
        };
        // Here we use the `xmm_rm_r` encoding because it correctly tells the register
        // allocator how we are using `dst`: we are using `dst` as a `mod` whereas other
        // encoding formats like `xmm_unary_rm_r` treat it as a `def`.
        ctx.emit(Inst::xmm_rm_r(sse_op, src, dst));
    } else {
        panic!("unable to emit insertlane for type: {}", ty)
    }
}

/// Emit an instruction to extract a lane of `src` into `dst`.
fn emit_extract_lane<C: LowerCtx<I = Inst>>(
    ctx: &mut C,
    src: Reg,
    dst: Writable<Reg>,
    lane: u8,
    ty: Type,
) {
    if !ty.is_float() {
        let (sse_op, size) = match ty.lane_bits() {
            8 => (SseOpcode::Pextrb, OperandSize::Size32),
            16 => (SseOpcode::Pextrw, OperandSize::Size32),
            32 => (SseOpcode::Pextrd, OperandSize::Size32),
            64 => (SseOpcode::Pextrd, OperandSize::Size64),
            _ => panic!("Unable to extractlane for lane size: {}", ty.lane_bits()),
        };
        let src = RegMem::reg(src);
        ctx.emit(Inst::xmm_rm_r_imm(sse_op, src, dst, lane, size));
    } else if ty == types::F32 || ty == types::F64 {
        if lane == 0 {
            // Remove the extractlane instruction, leaving the float where it is. The upper
            // bits will remain unchanged; for correctness, this relies on Cranelift type
            // checking to avoid using those bits.
            ctx.emit(Inst::gen_move(dst, src, ty));
        } else {
            // Otherwise, shuffle the bits in `lane` to the lowest lane.
            let sse_op = SseOpcode::Pshufd;
            let mask = match ty {
                // Move the value at `lane` to lane 0, copying existing value at lane 0 to
                // other lanes. Again, this relies on Cranelift type checking to avoid
                // using those bits.
                types::F32 => {
                    assert!(lane > 0 && lane < 4);
                    0b00_00_00_00 | lane
                }
                // Move the value at `lane` 1 (we know it must be 1 because of the `if`
                // statement above) to lane 0 and leave lane 1 unchanged. The Cranelift type
                // checking assumption also applies here.
                types::F64 => {
                    assert!(lane == 1);
                    0b11_10_11_10
                }
                _ => unreachable!(),
            };
            let src = RegMem::reg(src);
            ctx.emit(Inst::xmm_rm_r_imm(
                sse_op,
                src,
                dst,
                mask,
                OperandSize::Size32,
            ));
        }
    } else {
        panic!("unable to emit extractlane for type: {}", ty)
    }
}

/// Emits an int comparison instruction.
///
/// Note: make sure that there are no instructions modifying the flags between a call to this
/// function and the use of the flags!
///
/// Takes the condition code that will be tested, and returns
/// the condition code that should be used. This allows us to
/// synthesize comparisons out of multiple instructions for
/// special cases (e.g., 128-bit integers).
fn emit_cmp<C: LowerCtx<I = Inst>>(ctx: &mut C, insn: IRInst, cc: IntCC) -> IntCC {
    let ty = ctx.input_ty(insn, 0);

    let inputs = [InsnInput { insn, input: 0 }, InsnInput { insn, input: 1 }];

    if ty == types::I128 {
        // We need to compare both halves and combine the results appropriately.
        let cmp1 = ctx.alloc_tmp(types::I64).only_reg().unwrap();
        let cmp2 = ctx.alloc_tmp(types::I64).only_reg().unwrap();
        let lhs = put_input_in_regs(ctx, inputs[0]);
        let lhs_lo = lhs.regs()[0];
        let lhs_hi = lhs.regs()[1];
        let rhs = put_input_in_regs(ctx, inputs[1]);
        let rhs_lo = RegMemImm::reg(rhs.regs()[0]);
        let rhs_hi = RegMemImm::reg(rhs.regs()[1]);
        match cc {
            IntCC::Equal => {
                ctx.emit(Inst::cmp_rmi_r(OperandSize::Size64, rhs_hi, lhs_hi));
                ctx.emit(Inst::setcc(CC::Z, cmp1));
                ctx.emit(Inst::cmp_rmi_r(OperandSize::Size64, rhs_lo, lhs_lo));
                ctx.emit(Inst::setcc(CC::Z, cmp2));
                ctx.emit(Inst::alu_rmi_r(
                    OperandSize::Size64,
                    AluRmiROpcode::And,
                    RegMemImm::reg(cmp1.to_reg()),
                    cmp2,
                ));
                ctx.emit(Inst::alu_rmi_r(
                    OperandSize::Size64,
                    AluRmiROpcode::And,
                    RegMemImm::imm(1),
                    cmp2,
                ));
                IntCC::NotEqual
            }
            IntCC::NotEqual => {
                ctx.emit(Inst::cmp_rmi_r(OperandSize::Size64, rhs_hi, lhs_hi));
                ctx.emit(Inst::setcc(CC::NZ, cmp1));
                ctx.emit(Inst::cmp_rmi_r(OperandSize::Size64, rhs_lo, lhs_lo));
                ctx.emit(Inst::setcc(CC::NZ, cmp2));
                ctx.emit(Inst::alu_rmi_r(
                    OperandSize::Size64,
                    AluRmiROpcode::Or,
                    RegMemImm::reg(cmp1.to_reg()),
                    cmp2,
                ));
                ctx.emit(Inst::alu_rmi_r(
                    OperandSize::Size64,
                    AluRmiROpcode::And,
                    RegMemImm::imm(1),
                    cmp2,
                ));
                IntCC::NotEqual
            }
            IntCC::SignedLessThan
            | IntCC::SignedLessThanOrEqual
            | IntCC::SignedGreaterThan
            | IntCC::SignedGreaterThanOrEqual
            | IntCC::UnsignedLessThan
            | IntCC::UnsignedLessThanOrEqual
            | IntCC::UnsignedGreaterThan
            | IntCC::UnsignedGreaterThanOrEqual => {
                // Result = (lhs_hi <> rhs_hi) ||
                //          (lhs_hi == rhs_hi && lhs_lo <> rhs_lo)
                let cmp3 = ctx.alloc_tmp(types::I64).only_reg().unwrap();
                ctx.emit(Inst::cmp_rmi_r(OperandSize::Size64, rhs_hi, lhs_hi));
                ctx.emit(Inst::setcc(CC::from_intcc(cc.without_equal()), cmp1));
                ctx.emit(Inst::setcc(CC::Z, cmp2));
                ctx.emit(Inst::cmp_rmi_r(OperandSize::Size64, rhs_lo, lhs_lo));
                ctx.emit(Inst::setcc(CC::from_intcc(cc.unsigned()), cmp3));
                ctx.emit(Inst::alu_rmi_r(
                    OperandSize::Size64,
                    AluRmiROpcode::And,
                    RegMemImm::reg(cmp2.to_reg()),
                    cmp3,
                ));
                ctx.emit(Inst::alu_rmi_r(
                    OperandSize::Size64,
                    AluRmiROpcode::Or,
                    RegMemImm::reg(cmp1.to_reg()),
                    cmp3,
                ));
                ctx.emit(Inst::alu_rmi_r(
                    OperandSize::Size64,
                    AluRmiROpcode::And,
                    RegMemImm::imm(1),
                    cmp3,
                ));
                IntCC::NotEqual
            }
            _ => panic!("Unhandled IntCC in I128 comparison: {:?}", cc),
        }
    } else {
        // TODO Try to commute the operands (and invert the condition) if one is an immediate.
        let lhs = put_input_in_reg(ctx, inputs[0]);
        // We force the RHS into a register, and disallow load-op fusion, because we
        // do not have a transitive guarantee that this cmp-site will be the sole
        // user of the value. Consider: the icmp might be the only user of a load,
        // but there may be multiple users of the icmp (e.g. select or bint
        // instructions) that each invoke `emit_cmp()`. If we were to allow a load
        // to sink to the *latest* one, but other sites did not permit sinking, then
        // we would be missing the load for other cmp-sites.
        let rhs = put_input_in_reg(ctx, inputs[1]);

        // Cranelift's icmp semantics want to compare lhs - rhs, while Intel gives
        // us dst - src at the machine instruction level, so invert operands.
        ctx.emit(Inst::cmp_rmi_r(
            OperandSize::from_ty(ty),
            RegMemImm::reg(rhs),
            lhs,
        ));
        cc
    }
}

/// A specification for a fcmp emission.
enum FcmpSpec {
    /// Normal flow.
    Normal,

    /// Avoid emitting Equal at all costs by inverting it to NotEqual, and indicate when that
    /// happens with `InvertedEqualOrConditions`.
    ///
    /// This is useful in contexts where it is hard/inefficient to produce a single instruction (or
    /// sequence of instructions) that check for an "AND" combination of condition codes; see for
    /// instance lowering of Select.
    InvertEqual,
}

/// This explains how to interpret the results of an fcmp instruction.
enum FcmpCondResult {
    /// The given condition code must be set.
    Condition(CC),

    /// Both condition codes must be set.
    AndConditions(CC, CC),

    /// Either of the condition codes must be set.
    OrConditions(CC, CC),

    /// The associated spec was set to `FcmpSpec::InvertEqual` and Equal has been inverted. Either
    /// of the condition codes must be set, and the user must invert the meaning when analyzing the
    /// condition code results. When the spec is set to `FcmpSpec::Normal`, then this case can't be
    /// reached.
    InvertedEqualOrConditions(CC, CC),
}

/// Emits a float comparison instruction.
///
/// Note: make sure that there are no instructions modifying the flags between a call to this
/// function and the use of the flags!
fn emit_fcmp<C: LowerCtx<I = Inst>>(
    ctx: &mut C,
    insn: IRInst,
    mut cond_code: FloatCC,
    spec: FcmpSpec,
) -> FcmpCondResult {
    let (flip_operands, inverted_equal) = match cond_code {
        FloatCC::LessThan
        | FloatCC::LessThanOrEqual
        | FloatCC::UnorderedOrGreaterThan
        | FloatCC::UnorderedOrGreaterThanOrEqual => {
            cond_code = cond_code.reverse();
            (true, false)
        }
        FloatCC::Equal => {
            let inverted_equal = match spec {
                FcmpSpec::Normal => false,
                FcmpSpec::InvertEqual => {
                    cond_code = FloatCC::NotEqual; // same as .inverse()
                    true
                }
            };
            (false, inverted_equal)
        }
        _ => (false, false),
    };

    // The only valid CC constructed with `from_floatcc` can be put in the flag
    // register with a direct float comparison; do this here.
    let op = match ctx.input_ty(insn, 0) {
        types::F32 => SseOpcode::Ucomiss,
        types::F64 => SseOpcode::Ucomisd,
        _ => panic!("Bad input type to Fcmp"),
    };

    let inputs = &[InsnInput { insn, input: 0 }, InsnInput { insn, input: 1 }];
    let (lhs_input, rhs_input) = if flip_operands {
        (inputs[1], inputs[0])
    } else {
        (inputs[0], inputs[1])
    };
    let lhs = put_input_in_reg(ctx, lhs_input);
    // See above in `emit_cmp()`. We must only use the reg/reg form of the
    // comparison in order to avoid issues with merged loads.
    let rhs = put_input_in_reg(ctx, rhs_input);
    ctx.emit(Inst::xmm_cmp_rm_r(op, RegMem::reg(rhs), lhs));

    let cond_result = match cond_code {
        FloatCC::Equal => FcmpCondResult::AndConditions(CC::NP, CC::Z),
        FloatCC::NotEqual if inverted_equal => {
            FcmpCondResult::InvertedEqualOrConditions(CC::P, CC::NZ)
        }
        FloatCC::NotEqual if !inverted_equal => FcmpCondResult::OrConditions(CC::P, CC::NZ),
        _ => FcmpCondResult::Condition(CC::from_floatcc(cond_code)),
    };

    cond_result
}

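/// Emits a bit-reversal sequence for the low `ty.bits()` bits of `src` into `dst`, using the
/// classic mask-and-swap approach: adjacent 1-bit units are swapped, then 2-bit units, then
/// 4-bit units, and so on up to the operand width. Each round computes
/// `(x >> k) & mask | (x & mask) << k` with the appropriate alternating mask.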
fn emit_bitrev<C: LowerCtx<I = Inst>>(ctx: &mut C, src: Reg, dst: Writable<Reg>, ty: Type) {
    let bits = ty.bits();
    let const_mask = if bits == 64 {
        0xffff_ffff_ffff_ffff
    } else {
        (1u64 << bits) - 1
    };
    let tmp0 = ctx.alloc_tmp(types::I64).only_reg().unwrap();
    let tmp1 = ctx.alloc_tmp(types::I64).only_reg().unwrap();
    let tmp2 = ctx.alloc_tmp(types::I64).only_reg().unwrap();

    ctx.emit(Inst::gen_move(tmp0, src, types::I64));

    // Swap 1-bit units.
    // tmp1 = src
    ctx.emit(Inst::gen_move(tmp1, tmp0.to_reg(), types::I64));
    // tmp2 = 0b0101..
    ctx.emit(Inst::imm(
        OperandSize::Size64,
        0x5555_5555_5555_5555 & const_mask,
        tmp2,
    ));
    // tmp1 = src >> 1
    ctx.emit(Inst::shift_r(
        OperandSize::Size64,
        ShiftKind::ShiftRightLogical,
        Some(1),
        tmp1,
    ));
    // tmp1 = (src >> 1) & 0b0101..
    ctx.emit(Inst::alu_rmi_r(
        OperandSize::Size64,
        AluRmiROpcode::And,
        RegMemImm::reg(tmp2.to_reg()),
        tmp1,
    ));
    // tmp2 = src & 0b0101..
    ctx.emit(Inst::alu_rmi_r(
        OperandSize::Size64,
        AluRmiROpcode::And,
        RegMemImm::reg(tmp0.to_reg()),
        tmp2,
    ));
    // tmp2 = (src & 0b0101..) << 1
    ctx.emit(Inst::shift_r(
        OperandSize::Size64,
        ShiftKind::ShiftLeft,
        Some(1),
        tmp2,
    ));
    // tmp0 = (src >> 1) & 0b0101.. | (src & 0b0101..) << 1
    ctx.emit(Inst::gen_move(tmp0, tmp2.to_reg(), types::I64));
    ctx.emit(Inst::alu_rmi_r(
        OperandSize::Size64,
        AluRmiROpcode::Or,
        RegMemImm::reg(tmp1.to_reg()),
        tmp0,
    ));

    // Swap 2-bit units.
    ctx.emit(Inst::gen_move(tmp1, tmp0.to_reg(), types::I64));
    ctx.emit(Inst::imm(
        OperandSize::Size64,
        0x3333_3333_3333_3333 & const_mask,
        tmp2,
    ));
    ctx.emit(Inst::shift_r(
        OperandSize::Size64,
        ShiftKind::ShiftRightLogical,
        Some(2),
        tmp1,
    ));
    ctx.emit(Inst::alu_rmi_r(
        OperandSize::Size64,
        AluRmiROpcode::And,
        RegMemImm::reg(tmp2.to_reg()),
        tmp1,
    ));
    ctx.emit(Inst::alu_rmi_r(
        OperandSize::Size64,
        AluRmiROpcode::And,
        RegMemImm::reg(tmp0.to_reg()),
        tmp2,
    ));
    ctx.emit(Inst::shift_r(
        OperandSize::Size64,
        ShiftKind::ShiftLeft,
        Some(2),
        tmp2,
    ));
    ctx.emit(Inst::gen_move(tmp0, tmp2.to_reg(), types::I64));
    ctx.emit(Inst::alu_rmi_r(
        OperandSize::Size64,
        AluRmiROpcode::Or,
        RegMemImm::reg(tmp1.to_reg()),
        tmp0,
    ));

    // Swap 4-bit units.
    ctx.emit(Inst::gen_move(tmp1, tmp0.to_reg(), types::I64));
    ctx.emit(Inst::imm(
        OperandSize::Size64,
        0x0f0f_0f0f_0f0f_0f0f & const_mask,
        tmp2,
    ));
    ctx.emit(Inst::shift_r(
        OperandSize::Size64,
        ShiftKind::ShiftRightLogical,
        Some(4),
        tmp1,
    ));
    ctx.emit(Inst::alu_rmi_r(
        OperandSize::Size64,
        AluRmiROpcode::And,
        RegMemImm::reg(tmp2.to_reg()),
        tmp1,
    ));
    ctx.emit(Inst::alu_rmi_r(
        OperandSize::Size64,
        AluRmiROpcode::And,
        RegMemImm::reg(tmp0.to_reg()),
        tmp2,
    ));
    ctx.emit(Inst::shift_r(
        OperandSize::Size64,
        ShiftKind::ShiftLeft,
        Some(4),
        tmp2,
    ));
    ctx.emit(Inst::gen_move(tmp0, tmp2.to_reg(), types::I64));
    ctx.emit(Inst::alu_rmi_r(
        OperandSize::Size64,
        AluRmiROpcode::Or,
        RegMemImm::reg(tmp1.to_reg()),
        tmp0,
    ));

    if bits > 8 {
        // Swap 8-bit units.
        ctx.emit(Inst::gen_move(tmp1, tmp0.to_reg(), types::I64));
        ctx.emit(Inst::imm(
            OperandSize::Size64,
            0x00ff_00ff_00ff_00ff & const_mask,
            tmp2,
        ));
        ctx.emit(Inst::shift_r(
            OperandSize::Size64,
            ShiftKind::ShiftRightLogical,
            Some(8),
            tmp1,
        ));
        ctx.emit(Inst::alu_rmi_r(
            OperandSize::Size64,
            AluRmiROpcode::And,
            RegMemImm::reg(tmp2.to_reg()),
            tmp1,
        ));
        ctx.emit(Inst::alu_rmi_r(
            OperandSize::Size64,
            AluRmiROpcode::And,
            RegMemImm::reg(tmp0.to_reg()),
            tmp2,
        ));
        ctx.emit(Inst::shift_r(
            OperandSize::Size64,
            ShiftKind::ShiftLeft,
            Some(8),
            tmp2,
        ));
        ctx.emit(Inst::gen_move(tmp0, tmp2.to_reg(), types::I64));
        ctx.emit(Inst::alu_rmi_r(
            OperandSize::Size64,
            AluRmiROpcode::Or,
            RegMemImm::reg(tmp1.to_reg()),
            tmp0,
        ));
    }

    if bits > 16 {
        // Swap 16-bit units.
        ctx.emit(Inst::gen_move(tmp1, tmp0.to_reg(), types::I64));
        ctx.emit(Inst::imm(
            OperandSize::Size64,
            0x0000_ffff_0000_ffff & const_mask,
            tmp2,
        ));
        ctx.emit(Inst::shift_r(
            OperandSize::Size64,
            ShiftKind::ShiftRightLogical,
            Some(16),
            tmp1,
        ));
        ctx.emit(Inst::alu_rmi_r(
            OperandSize::Size64,
            AluRmiROpcode::And,
            RegMemImm::reg(tmp2.to_reg()),
            tmp1,
        ));
        ctx.emit(Inst::alu_rmi_r(
            OperandSize::Size64,
            AluRmiROpcode::And,
            RegMemImm::reg(tmp0.to_reg()),
            tmp2,
        ));
        ctx.emit(Inst::shift_r(
            OperandSize::Size64,
            ShiftKind::ShiftLeft,
            Some(16),
            tmp2,
        ));
        ctx.emit(Inst::gen_move(tmp0, tmp2.to_reg(), types::I64));
        ctx.emit(Inst::alu_rmi_r(
            OperandSize::Size64,
            AluRmiROpcode::Or,
            RegMemImm::reg(tmp1.to_reg()),
            tmp0,
        ));
    }

    if bits > 32 {
        // Swap 32-bit units.
        ctx.emit(Inst::gen_move(tmp1, tmp0.to_reg(), types::I64));
        ctx.emit(Inst::imm(
            OperandSize::Size64,
            0x0000_0000_ffff_ffff & const_mask,
            tmp2,
        ));
        ctx.emit(Inst::shift_r(
            OperandSize::Size64,
            ShiftKind::ShiftRightLogical,
            Some(32),
            tmp1,
        ));
        ctx.emit(Inst::alu_rmi_r(
            OperandSize::Size64,
            AluRmiROpcode::And,
            RegMemImm::reg(tmp2.to_reg()),
            tmp1,
        ));
        ctx.emit(Inst::alu_rmi_r(
            OperandSize::Size64,
            AluRmiROpcode::And,
            RegMemImm::reg(tmp0.to_reg()),
            tmp2,
        ));
        ctx.emit(Inst::shift_r(
            OperandSize::Size64,
            ShiftKind::ShiftLeft,
            Some(32),
            tmp2,
        ));
        ctx.emit(Inst::gen_move(tmp0, tmp2.to_reg(), types::I64));
        ctx.emit(Inst::alu_rmi_r(
            OperandSize::Size64,
            AluRmiROpcode::Or,
            RegMemImm::reg(tmp1.to_reg()),
            tmp0,
        ));
    }

    ctx.emit(Inst::gen_move(dst, tmp0.to_reg(), types::I64));
}

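/// Emits a 128-bit left-shift sequence. Both halves are shifted separately, the bits shifted
/// out of the low half are OR'd into the high half, and conditional moves select between the
/// "amount < 64" and "amount >= 64" results based on bit 6 of the shift amount.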
fn emit_shl_i128<C: LowerCtx<I = Inst>>(
    ctx: &mut C,
    src: ValueRegs<Reg>,
    dst: ValueRegs<Writable<Reg>>,
    amt_src: Reg,
) {
    let src_lo = src.regs()[0];
    let src_hi = src.regs()[1];
    let dst_lo = dst.regs()[0];
    let dst_hi = dst.regs()[1];

    // mov tmp1, src_lo
    // shl tmp1, amt_src
    // mov tmp2, src_hi
    // shl tmp2, amt_src
    // mov amt, 64
    // sub amt, amt_src
    // mov tmp3, src_lo
    // shr tmp3, amt
    // xor dst_lo, dst_lo
    // test amt_src, 127
    // cmovz tmp3, dst_lo
    // or tmp3, tmp2
    // mov amt, amt_src
    // and amt, 64
    // cmovz dst_hi, tmp3
    // cmovz dst_lo, tmp1
    // cmovnz dst_hi, tmp1

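    // N.B.: x86 shift instructions mask their shift amount to the operand width (the low 6
    // bits for 64-bit shifts), so when the amount is 0 (mod 128), the `shr tmp3, (64 - amt)`
    // step above shifts by zero and leaves `src_lo` in `tmp3` rather than producing zero; the
    // `test amt_src, 127` / `cmovz` pair zeroes `tmp3` in that case so that the OR into the
    // high half is a no-op.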
    let tmp1 = ctx.alloc_tmp(types::I64).only_reg().unwrap();
    let tmp2 = ctx.alloc_tmp(types::I64).only_reg().unwrap();
    let tmp3 = ctx.alloc_tmp(types::I64).only_reg().unwrap();
    let amt = ctx.alloc_tmp(types::I64).only_reg().unwrap();

    ctx.emit(Inst::gen_move(tmp1, src_lo, types::I64));
    ctx.emit(Inst::gen_move(
        Writable::from_reg(regs::rcx()),
        amt_src,
        types::I64,
    ));
    ctx.emit(Inst::shift_r(
        OperandSize::Size64,
        ShiftKind::ShiftLeft,
        None,
        tmp1,
    ));

    ctx.emit(Inst::gen_move(tmp2, src_hi, types::I64));
    ctx.emit(Inst::gen_move(
        Writable::from_reg(regs::rcx()),
        amt_src,
        types::I64,
    ));
    ctx.emit(Inst::shift_r(
        OperandSize::Size64,
        ShiftKind::ShiftLeft,
        None,
        tmp2,
    ));

    ctx.emit(Inst::imm(OperandSize::Size64, 64, amt));
    ctx.emit(Inst::alu_rmi_r(
        OperandSize::Size64,
        AluRmiROpcode::Sub,
        RegMemImm::reg(amt_src),
        amt,
    ));

    ctx.emit(Inst::gen_move(tmp3, src_lo, types::I64));
    ctx.emit(Inst::gen_move(
        Writable::from_reg(regs::rcx()),
        amt.to_reg(),
        types::I64,
    ));
    ctx.emit(Inst::shift_r(
        OperandSize::Size64,
        ShiftKind::ShiftRightLogical,
        None,
        tmp3,
    ));
    ctx.emit(Inst::alu_rmi_r(
        OperandSize::Size64,
        AluRmiROpcode::Xor,
        RegMemImm::reg(dst_lo.to_reg()),
        dst_lo,
    ));

    ctx.emit(Inst::test_rmi_r(
        OperandSize::Size64,
        RegMemImm::imm(127),
        amt_src,
    ));
    ctx.emit(Inst::cmove(
        OperandSize::Size64,
        CC::Z,
        RegMem::reg(dst_lo.to_reg()),
        tmp3,
    ));

    ctx.emit(Inst::alu_rmi_r(
        OperandSize::Size64,
        AluRmiROpcode::Or,
        RegMemImm::reg(tmp2.to_reg()),
        tmp3,
    ));

    // This isn't semantically necessary, but it keeps the
    // register allocator happy, because it cannot otherwise
    // infer that cmovz + cmovnz always defines dst_hi.
    ctx.emit(Inst::alu_rmi_r(
        OperandSize::Size64,
        AluRmiROpcode::Xor,
        RegMemImm::reg(dst_hi.to_reg()),
        dst_hi,
    ));

    ctx.emit(Inst::gen_move(amt, amt_src, types::I64));
    ctx.emit(Inst::alu_rmi_r(
        OperandSize::Size64,
        AluRmiROpcode::And,
        RegMemImm::imm(64),
        amt,
    ));
    ctx.emit(Inst::cmove(
        OperandSize::Size64,
        CC::Z,
        RegMem::reg(tmp3.to_reg()),
        dst_hi,
    ));
    ctx.emit(Inst::cmove(
        OperandSize::Size64,
        CC::Z,
        RegMem::reg(tmp1.to_reg()),
        dst_lo,
    ));
    ctx.emit(Inst::cmove(
        OperandSize::Size64,
        CC::NZ,
        RegMem::reg(tmp1.to_reg()),
        dst_hi,
    ));
}

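/// Emits a 128-bit right-shift sequence, logical or (if `is_signed`) arithmetic. This mirrors
/// `emit_shl_i128`: both halves are shifted separately, the bits shifted out of the high half
/// are OR'd into the low half, and conditional moves select the correct result based on bit 6
/// of the shift amount; for signed shifts, the vacated high half is filled with the sign bit.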
fn emit_shr_i128<C: LowerCtx<I = Inst>>(
    ctx: &mut C,
    src: ValueRegs<Reg>,
    dst: ValueRegs<Writable<Reg>>,
    amt_src: Reg,
    is_signed: bool,
) {
    let src_lo = src.regs()[0];
    let src_hi = src.regs()[1];
    let dst_lo = dst.regs()[0];
    let dst_hi = dst.regs()[1];

    // mov tmp1, src_hi
    // {u,s}shr tmp1, amt_src
    // mov tmp2, src_lo
    // ushr tmp2, amt_src
    // mov amt, 64
    // sub amt, amt_src
    // mov tmp3, src_hi
    // shl tmp3, amt
    // xor dst_lo, dst_lo
    // test amt_src, 127
    // cmovz tmp3, dst_lo
    // or tmp3, tmp2
    // if is_signed:
    //     mov dst_hi, src_hi
    //     sshr dst_hi, 63 // get the sign bit
    // else:
    //     xor dst_hi, dst_hi
    // mov amt, amt_src
    // and amt, 64
    // cmovz dst_hi, tmp1
    // cmovz dst_lo, tmp3
    // cmovnz dst_lo, tmp1

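    // N.B.: as in `emit_shl_i128`, hardware shifts mask their amount to the operand width, so
    // when the amount is 0 (mod 128) the `shl tmp3, (64 - amt)` step above leaves `src_hi` in
    // `tmp3` instead of zero; the `test` / `cmovz` pair corrects for this.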
    let tmp1 = ctx.alloc_tmp(types::I64).only_reg().unwrap();
    let tmp2 = ctx.alloc_tmp(types::I64).only_reg().unwrap();
    let tmp3 = ctx.alloc_tmp(types::I64).only_reg().unwrap();
    let amt = ctx.alloc_tmp(types::I64).only_reg().unwrap();

    let shift_kind = if is_signed {
        ShiftKind::ShiftRightArithmetic
    } else {
        ShiftKind::ShiftRightLogical
    };

    ctx.emit(Inst::gen_move(tmp1, src_hi, types::I64));
    ctx.emit(Inst::gen_move(
        Writable::from_reg(regs::rcx()),
        amt_src,
        types::I64,
    ));
    ctx.emit(Inst::shift_r(OperandSize::Size64, shift_kind, None, tmp1));

    ctx.emit(Inst::gen_move(tmp2, src_lo, types::I64));
    ctx.emit(Inst::gen_move(
        Writable::from_reg(regs::rcx()),
        amt_src,
        types::I64,
    ));
    // N.B.: right-shift of *lower* half is *always* unsigned (its MSB is not a sign bit).
    ctx.emit(Inst::shift_r(
        OperandSize::Size64,
        ShiftKind::ShiftRightLogical,
        None,
        tmp2,
    ));

    ctx.emit(Inst::imm(OperandSize::Size64, 64, amt));
    ctx.emit(Inst::alu_rmi_r(
        OperandSize::Size64,
        AluRmiROpcode::Sub,
        RegMemImm::reg(amt_src),
        amt,
    ));

    ctx.emit(Inst::gen_move(tmp3, src_hi, types::I64));
    ctx.emit(Inst::gen_move(
        Writable::from_reg(regs::rcx()),
        amt.to_reg(),
        types::I64,
    ));
    ctx.emit(Inst::shift_r(
        OperandSize::Size64,
        ShiftKind::ShiftLeft,
        None,
        tmp3,
    ));

    ctx.emit(Inst::alu_rmi_r(
        OperandSize::Size64,
        AluRmiROpcode::Xor,
        RegMemImm::reg(dst_lo.to_reg()),
        dst_lo,
    ));
    ctx.emit(Inst::test_rmi_r(
        OperandSize::Size64,
        RegMemImm::imm(127),
        amt_src,
    ));
    ctx.emit(Inst::cmove(
        OperandSize::Size64,
        CC::Z,
        RegMem::reg(dst_lo.to_reg()),
        tmp3,
    ));

    ctx.emit(Inst::alu_rmi_r(
        OperandSize::Size64,
        AluRmiROpcode::Or,
        RegMemImm::reg(tmp2.to_reg()),
        tmp3,
    ));

    if is_signed {
        ctx.emit(Inst::gen_move(dst_hi, src_hi, types::I64));
        ctx.emit(Inst::shift_r(
            OperandSize::Size64,
            ShiftKind::ShiftRightArithmetic,
            Some(63),
            dst_hi,
        ));
    } else {
        ctx.emit(Inst::alu_rmi_r(
            OperandSize::Size64,
            AluRmiROpcode::Xor,
            RegMemImm::reg(dst_hi.to_reg()),
            dst_hi,
        ));
    }
    // This isn't semantically necessary, but it keeps the
    // register allocator happy, because it cannot otherwise
    // infer that cmovz + cmovnz always defines dst_lo.
    ctx.emit(Inst::alu_rmi_r(
        OperandSize::Size64,
        AluRmiROpcode::Xor,
        RegMemImm::reg(dst_lo.to_reg()),
        dst_lo,
    ));

    ctx.emit(Inst::gen_move(amt, amt_src, types::I64));
    ctx.emit(Inst::alu_rmi_r(
        OperandSize::Size64,
        AluRmiROpcode::And,
        RegMemImm::imm(64),
        amt,
    ));
    ctx.emit(Inst::cmove(
        OperandSize::Size64,
        CC::Z,
        RegMem::reg(tmp1.to_reg()),
        dst_hi,
    ));
    ctx.emit(Inst::cmove(
        OperandSize::Size64,
        CC::Z,
        RegMem::reg(tmp3.to_reg()),
        dst_lo,
    ));
    ctx.emit(Inst::cmove(
        OperandSize::Size64,
        CC::NZ,
        RegMem::reg(tmp1.to_reg()),
        dst_lo,
    ));
}

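/// Builds a `Signature` for a libcall from the given instruction's input and output types,
/// appending the special VMContext parameter when the calling convention requires one.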
fn make_libcall_sig<C: LowerCtx<I = Inst>>(
    ctx: &mut C,
    insn: IRInst,
    call_conv: CallConv,
    ptr_ty: Type,
) -> Signature {
    let mut sig = Signature::new(call_conv);
    for i in 0..ctx.num_inputs(insn) {
        sig.params.push(AbiParam::new(ctx.input_ty(insn, i)));
    }
    for i in 0..ctx.num_outputs(insn) {
        sig.returns.push(AbiParam::new(ctx.output_ty(insn, i)));
    }
    if call_conv.extends_baldrdash() {
        // Adds the special VMContext parameter to the signature.
        sig.params
            .push(AbiParam::special(ptr_ty, ArgumentPurpose::VMContext));
    }
    sig
}

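/// Emits a call to the runtime function `libcall`: the instruction's inputs are copied into
/// argument registers according to the libcall's calling convention, the call is emitted, and
/// the return values are copied back into the instruction's outputs.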
fn emit_vm_call<C: LowerCtx<I = Inst>>(
    ctx: &mut C,
    flags: &Flags,
    triple: &Triple,
    libcall: LibCall,
    insn: IRInst,
    inputs: SmallVec<[InsnInput; 4]>,
    outputs: SmallVec<[InsnOutput; 2]>,
) -> CodegenResult<()> {
    let extname = ExternalName::LibCall(libcall);

    let dist = if flags.use_colocated_libcalls() {
        RelocDistance::Near
    } else {
        RelocDistance::Far
    };

    // TODO avoid recreating signatures for every single Libcall function.
    let call_conv = CallConv::for_libcall(flags, CallConv::triple_default(triple));
    let sig = make_libcall_sig(ctx, insn, call_conv, types::I64);
    let caller_conv = ctx.abi().call_conv();

    let mut abi = X64ABICaller::from_func(&sig, &extname, dist, caller_conv, flags)?;

    abi.emit_stack_pre_adjust(ctx);

    let vm_context = if call_conv.extends_baldrdash() { 1 } else { 0 };
    assert_eq!(inputs.len() + vm_context, abi.num_args());

    for (i, input) in inputs.iter().enumerate() {
        let arg_reg = put_input_in_reg(ctx, *input);
        abi.emit_copy_regs_to_arg(ctx, i, ValueRegs::one(arg_reg));
    }
    if call_conv.extends_baldrdash() {
        let vm_context_vreg = ctx
            .get_vm_context()
            .expect("should have a VMContext to pass to libcall funcs");
        abi.emit_copy_regs_to_arg(ctx, inputs.len(), ValueRegs::one(vm_context_vreg));
    }

    abi.emit_call(ctx);
    for (i, output) in outputs.iter().enumerate() {
        let retval_reg = get_output_reg(ctx, *output).only_reg().unwrap();
        abi.emit_copy_retval_to_regs(ctx, i, ValueRegs::one(retval_reg));
    }
    abi.emit_stack_post_adjust(ctx);

    Ok(())
}

/// Matches a shift by a constant value less than or equal to 3, returning the shifted input and
/// the shift amount. The goal is to embed it within an address mode.
fn matches_small_constant_shift<C: LowerCtx<I = Inst>>(
    ctx: &mut C,
    spec: InsnInput,
) -> Option<(InsnInput, u8)> {
    matches_input(ctx, spec, Opcode::Ishl).and_then(|shift| {
        match input_to_imm(
            ctx,
            InsnInput {
                insn: shift,
                input: 1,
            },
        ) {
            Some(shift_amt) if shift_amt <= 3 => Some((
                InsnInput {
                    insn: shift,
                    input: 0,
                },
                shift_amt as u8,
            )),
            _ => None,
        }
    })
}

/// Lowers an instruction to one of the x86 addressing modes.
///
/// Note: the 32-bit offset in Cranelift has to be sign-extended, which matches x86's behavior.
fn lower_to_amode<C: LowerCtx<I = Inst>>(ctx: &mut C, spec: InsnInput, offset: i32) -> Amode {
    let flags = ctx
        .memflags(spec.insn)
        .expect("Instruction with amode should have memflags");

    // We now either have an add that we must materialize, or some other input; as well as the
    // final offset.
    if let Some(add) = matches_input(ctx, spec, Opcode::Iadd) {
        debug_assert_eq!(ctx.output_ty(add, 0), types::I64);
        let add_inputs = &[
            InsnInput {
                insn: add,
                input: 0,
            },
            InsnInput {
                insn: add,
                input: 1,
            },
        ];

        // TODO heap_addr legalization generates a uext64 *after* the shift, so these optimizations
        // aren't happening in the wasm case. We could do better, given some range analysis.
        let (base, index, shift) = if let Some((shift_input, shift_amt)) =
            matches_small_constant_shift(ctx, add_inputs[0])
        {
            (
                put_input_in_reg(ctx, add_inputs[1]),
                put_input_in_reg(ctx, shift_input),
                shift_amt,
            )
        } else if let Some((shift_input, shift_amt)) =
            matches_small_constant_shift(ctx, add_inputs[1])
        {
            (
                put_input_in_reg(ctx, add_inputs[0]),
                put_input_in_reg(ctx, shift_input),
                shift_amt,
            )
        } else {
            for i in 0..=1 {
                // Try to pierce through uextend.
                if let Some(uextend) = matches_input(
                    ctx,
                    InsnInput {
                        insn: add,
                        input: i,
                    },
                    Opcode::Uextend,
                ) {
                    if let Some(cst) = ctx.get_input_as_source_or_const(uextend, 0).constant {
                        // Zero the upper bits.
                        let input_size = ctx.input_ty(uextend, 0).bits() as u64;
                        let shift: u64 = 64 - input_size;
                        let uext_cst: u64 = (cst << shift) >> shift;

                        let final_offset = (offset as i64).wrapping_add(uext_cst as i64);
                        if low32_will_sign_extend_to_64(final_offset as u64) {
                            let base = put_input_in_reg(ctx, add_inputs[1 - i]);
                            return Amode::imm_reg(final_offset as u32, base).with_flags(flags);
                        }
                    }
                }

                // If it's a constant, add it directly!
                if let Some(cst) = ctx.get_input_as_source_or_const(add, i).constant {
                    let final_offset = (offset as i64).wrapping_add(cst as i64);
                    if low32_will_sign_extend_to_64(final_offset as u64) {
                        let base = put_input_in_reg(ctx, add_inputs[1 - i]);
                        return Amode::imm_reg(final_offset as u32, base).with_flags(flags);
                    }
                }
            }

            (
                put_input_in_reg(ctx, add_inputs[0]),
                put_input_in_reg(ctx, add_inputs[1]),
                0,
            )
        };

        return Amode::imm_reg_reg_shift(offset as u32, base, index, shift).with_flags(flags);
    }

    let input = put_input_in_reg(ctx, spec);
    Amode::imm_reg(offset as u32, input).with_flags(flags)
}

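/// Moves a (possibly multi-register) value from `src` into `dst`, one move per constituent
/// register.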
fn emit_moves<C: LowerCtx<I = Inst>>(
    ctx: &mut C,
    dst: ValueRegs<Writable<Reg>>,
    src: ValueRegs<Reg>,
    ty: Type,
) {
    let (_, tys) = Inst::rc_for_type(ty).unwrap();
    for ((dst, src), ty) in dst.regs().iter().zip(src.regs().iter()).zip(tys.iter()) {
        ctx.emit(Inst::gen_move(*dst, *src, *ty));
    }
}

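/// Conditionally moves a (possibly multi-register) value from `src` into `dst` under condition
/// `cc`, using one cmov of at least 32 bits per constituent register.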
fn emit_cmoves<C: LowerCtx<I = Inst>>(
    ctx: &mut C,
    size: u8,
    cc: CC,
    src: ValueRegs<Reg>,
    dst: ValueRegs<Writable<Reg>>,
) {
    let size = size / src.len() as u8;
    let size = u8::max(size, 4); // at least 32 bits
    for (dst, src) in dst.regs().iter().zip(src.regs().iter()) {
        ctx.emit(Inst::cmove(
            OperandSize::from_bytes(size.into()),
            cc,
            RegMem::reg(*src),
            *dst,
        ));
    }
}

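/// Emits a count-leading-zeros sequence based on `bsr`. `bsr` yields the index of the highest
/// set bit, and sets ZF (leaving its destination undefined) on zero input, so the result is
/// computed as `bits - 1 - bsr(src)` with a cmov substituting -1 on zero input, which makes a
/// zero source yield `bits`.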
fn emit_clz<C: LowerCtx<I = Inst>>(
    ctx: &mut C,
    orig_ty: Type,
    ty: Type,
    src: Reg,
    dst: Writable<Reg>,
) {
    let src = RegMem::reg(src);
    let tmp = ctx.alloc_tmp(ty).only_reg().unwrap();
    ctx.emit(Inst::imm(OperandSize::from_ty(ty), u64::max_value(), dst));

    ctx.emit(Inst::unary_rm_r(
        OperandSize::from_ty(ty),
        UnaryRmROpcode::Bsr,
        src,
        tmp,
    ));

    ctx.emit(Inst::cmove(
        OperandSize::from_ty(ty),
        CC::Z,
        RegMem::reg(dst.to_reg()),
        tmp,
    ));

    ctx.emit(Inst::imm(
        OperandSize::from_ty(ty),
        orig_ty.bits() as u64 - 1,
        dst,
    ));

    ctx.emit(Inst::alu_rmi_r(
        if ty == types::I64 {
            OperandSize::Size64
        } else {
            OperandSize::Size32
        },
        AluRmiROpcode::Sub,
        RegMemImm::reg(tmp.to_reg()),
        dst,
    ));
}

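/// Emits a count-trailing-zeros sequence based on `bsf`. `bsf` yields the index of the lowest
/// set bit, and sets ZF (leaving its destination undefined) on zero input, so a cmov
/// substitutes the operand width in that case.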
emit_ctz<C: LowerCtx<I = Inst>>( ctx: &mut C, orig_ty: Type, ty: Type, src: Reg, dst: Writable<Reg>, )1442 fn emit_ctz<C: LowerCtx<I = Inst>>(
1443 ctx: &mut C,
1444 orig_ty: Type,
1445 ty: Type,
1446 src: Reg,
1447 dst: Writable<Reg>,
1448 ) {
1449 let src = RegMem::reg(src);
1450 let tmp = ctx.alloc_tmp(ty).only_reg().unwrap();
1451 ctx.emit(Inst::imm(OperandSize::Size32, orig_ty.bits() as u64, tmp));
1452
1453 ctx.emit(Inst::unary_rm_r(
1454 OperandSize::from_ty(ty),
1455 UnaryRmROpcode::Bsf,
1456 src,
1457 dst,
1458 ));
1459
1460 ctx.emit(Inst::cmove(
1461 OperandSize::from_ty(ty),
1462 CC::Z,
1463 RegMem::reg(tmp.to_reg()),
1464 dst,
1465 ));
1466 }
1467
1468 //=============================================================================
1469 // Top-level instruction lowering entry point, for one instruction.
1470
1471 /// Actually codegen an instruction's results into registers.
lower_insn_to_regs<C: LowerCtx<I = Inst>>( ctx: &mut C, insn: IRInst, flags: &Flags, isa_flags: &x64_settings::Flags, triple: &Triple, ) -> CodegenResult<()>1472 fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
1473 ctx: &mut C,
1474 insn: IRInst,
1475 flags: &Flags,
1476 isa_flags: &x64_settings::Flags,
1477 triple: &Triple,
1478 ) -> CodegenResult<()> {
1479 let op = ctx.data(insn).opcode();
1480
1481 let inputs: SmallVec<[InsnInput; 4]> = (0..ctx.num_inputs(insn))
1482 .map(|i| InsnInput { insn, input: i })
1483 .collect();
1484 let outputs: SmallVec<[InsnOutput; 2]> = (0..ctx.num_outputs(insn))
1485 .map(|i| InsnOutput { insn, output: i })
1486 .collect();
1487
1488 let ty = if outputs.len() > 0 {
1489 Some(ctx.output_ty(insn, 0))
1490 } else {
1491 None
1492 };
1493
1494 match op {
1495 Opcode::Iconst | Opcode::Bconst | Opcode::Null => {
1496 let value = ctx
1497 .get_constant(insn)
1498 .expect("constant value for iconst et al");
1499 let dst = get_output_reg(ctx, outputs[0]);
1500 for inst in Inst::gen_constant(dst, value as u128, ty.unwrap(), |ty| {
1501 ctx.alloc_tmp(ty).only_reg().unwrap()
1502 }) {
1503 ctx.emit(inst);
1504 }
1505 }
1506
1507 Opcode::Iadd
1508 | Opcode::IaddIfcout
1509 | Opcode::SaddSat
1510 | Opcode::UaddSat
1511 | Opcode::Isub
1512 | Opcode::SsubSat
1513 | Opcode::UsubSat
1514 | Opcode::AvgRound
1515 | Opcode::Band
1516 | Opcode::Bor
1517 | Opcode::Bxor => {
1518 let ty = ty.unwrap();
1519 if ty.lane_count() > 1 {
1520 let sse_op = match op {
1521 Opcode::Iadd => match ty {
1522 types::I8X16 => SseOpcode::Paddb,
1523 types::I16X8 => SseOpcode::Paddw,
1524 types::I32X4 => SseOpcode::Paddd,
1525 types::I64X2 => SseOpcode::Paddq,
1526 _ => panic!("Unsupported type for packed iadd instruction: {}", ty),
1527 },
1528 Opcode::SaddSat => match ty {
1529 types::I8X16 => SseOpcode::Paddsb,
1530 types::I16X8 => SseOpcode::Paddsw,
1531 _ => panic!("Unsupported type for packed sadd_sat instruction: {}", ty),
1532 },
1533 Opcode::UaddSat => match ty {
1534 types::I8X16 => SseOpcode::Paddusb,
1535 types::I16X8 => SseOpcode::Paddusw,
1536 _ => panic!("Unsupported type for packed uadd_sat instruction: {}", ty),
1537 },
1538 Opcode::Isub => match ty {
1539 types::I8X16 => SseOpcode::Psubb,
1540 types::I16X8 => SseOpcode::Psubw,
1541 types::I32X4 => SseOpcode::Psubd,
1542 types::I64X2 => SseOpcode::Psubq,
1543 _ => panic!("Unsupported type for packed isub instruction: {}", ty),
1544 },
1545 Opcode::SsubSat => match ty {
1546 types::I8X16 => SseOpcode::Psubsb,
1547 types::I16X8 => SseOpcode::Psubsw,
1548 _ => panic!("Unsupported type for packed ssub_sat instruction: {}", ty),
1549 },
1550 Opcode::UsubSat => match ty {
1551 types::I8X16 => SseOpcode::Psubusb,
1552 types::I16X8 => SseOpcode::Psubusw,
1553 _ => panic!("Unsupported type for packed usub_sat instruction: {}", ty),
1554 },
1555 Opcode::AvgRound => match ty {
1556 types::I8X16 => SseOpcode::Pavgb,
1557 types::I16X8 => SseOpcode::Pavgw,
1558 _ => panic!("Unsupported type for packed avg_round instruction: {}", ty),
1559 },
1560 Opcode::Band => match ty {
1561 types::F32X4 => SseOpcode::Andps,
1562 types::F64X2 => SseOpcode::Andpd,
1563 _ => SseOpcode::Pand,
1564 },
1565 Opcode::Bor => match ty {
1566 types::F32X4 => SseOpcode::Orps,
1567 types::F64X2 => SseOpcode::Orpd,
1568 _ => SseOpcode::Por,
1569 },
1570 Opcode::Bxor => match ty {
1571 types::F32X4 => SseOpcode::Xorps,
1572 types::F64X2 => SseOpcode::Xorpd,
1573 _ => SseOpcode::Pxor,
1574 },
1575 _ => panic!("Unsupported packed instruction: {}", op),
1576 };
1577 let lhs = put_input_in_reg(ctx, inputs[0]);
1578 let rhs = input_to_reg_mem(ctx, inputs[1]);
1579 let dst = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
1580
1581 // Move the `lhs` to the same register as `dst`.
1582 ctx.emit(Inst::gen_move(dst, lhs, ty));
1583 ctx.emit(Inst::xmm_rm_r(sse_op, rhs, dst));
1584 } else if ty == types::I128 || ty == types::B128 {
1585 let alu_ops = match op {
1586 Opcode::Iadd => (AluRmiROpcode::Add, AluRmiROpcode::Adc),
1587 Opcode::Isub => (AluRmiROpcode::Sub, AluRmiROpcode::Sbb),
1588 Opcode::Band => (AluRmiROpcode::And, AluRmiROpcode::And),
1589 Opcode::Bor => (AluRmiROpcode::Or, AluRmiROpcode::Or),
1590 Opcode::Bxor => (AluRmiROpcode::Xor, AluRmiROpcode::Xor),
1591 _ => panic!("Unsupported opcode with 128-bit integers: {:?}", op),
1592 };
1593 let lhs = put_input_in_regs(ctx, inputs[0]);
1594 let rhs = put_input_in_regs(ctx, inputs[1]);
1595 let dst = get_output_reg(ctx, outputs[0]);
1596 assert_eq!(lhs.len(), 2);
1597 assert_eq!(rhs.len(), 2);
1598 assert_eq!(dst.len(), 2);
1599
1600 // For add, sub, and, or, xor: just do ops on lower then upper
1601 // half. Carry-flag propagation is implicit (add/adc, sub/sbb).
1602 ctx.emit(Inst::gen_move(dst.regs()[0], lhs.regs()[0], types::I64));
1603 ctx.emit(Inst::gen_move(dst.regs()[1], lhs.regs()[1], types::I64));
1604 ctx.emit(Inst::alu_rmi_r(
1605 OperandSize::Size64,
1606 alu_ops.0,
1607 RegMemImm::reg(rhs.regs()[0]),
1608 dst.regs()[0],
1609 ));
1610 ctx.emit(Inst::alu_rmi_r(
1611 OperandSize::Size64,
1612 alu_ops.1,
1613 RegMemImm::reg(rhs.regs()[1]),
1614 dst.regs()[1],
1615 ));
1616 } else {
1617 let size = if ty == types::I64 {
1618 OperandSize::Size64
1619 } else {
1620 OperandSize::Size32
1621 };
1622 let alu_op = match op {
1623 Opcode::Iadd | Opcode::IaddIfcout => AluRmiROpcode::Add,
1624 Opcode::Isub => AluRmiROpcode::Sub,
1625 Opcode::Band => AluRmiROpcode::And,
1626 Opcode::Bor => AluRmiROpcode::Or,
1627 Opcode::Bxor => AluRmiROpcode::Xor,
1628 _ => unreachable!(),
1629 };
1630
1631 let (lhs, rhs) = match op {
1632 Opcode::Iadd
1633 | Opcode::IaddIfcout
1634 | Opcode::Band
1635 | Opcode::Bor
1636 | Opcode::Bxor => {
1637 // For commutative operations, try to commute operands if one is an
1638 // immediate or direct memory reference. Do so by converting LHS to RMI; if
1639 // reg, then always convert RHS to RMI; else, use LHS as RMI and convert
1640 // RHS to reg.
1641 let lhs = input_to_reg_mem_imm(ctx, inputs[0]);
1642 if let RegMemImm::Reg { reg: lhs_reg } = lhs {
1643 let rhs = input_to_reg_mem_imm(ctx, inputs[1]);
1644 (lhs_reg, rhs)
1645 } else {
1646 let rhs_reg = put_input_in_reg(ctx, inputs[1]);
1647 (rhs_reg, lhs)
1648 }
1649 }
1650 Opcode::Isub => (
1651 put_input_in_reg(ctx, inputs[0]),
1652 input_to_reg_mem_imm(ctx, inputs[1]),
1653 ),
1654 _ => unreachable!(),
1655 };
1656
1657 let dst = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
1658 ctx.emit(Inst::mov_r_r(OperandSize::Size64, lhs, dst));
1659 ctx.emit(Inst::alu_rmi_r(size, alu_op, rhs, dst));
1660 }
1661 }
1662
1663 Opcode::Imul => {
1664 let ty = ty.unwrap();
1665 if ty == types::I64X2 {
1666 // Eventually one of these should be `input_to_reg_mem` (TODO).
1667 let lhs = put_input_in_reg(ctx, inputs[0]);
1668 let rhs = put_input_in_reg(ctx, inputs[1]);
1669 let dst = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
1670
1671 if isa_flags.use_avx512f_simd() || isa_flags.use_avx512vl_simd() {
1672 // With the right AVX512 features (VL, DQ) this operation
1673 // can lower to a single operation.
1674 ctx.emit(Inst::xmm_rm_r_evex(
1675 Avx512Opcode::Vpmullq,
1676 RegMem::reg(rhs),
1677 lhs,
1678 dst,
1679 ));
1680 } else {
1681 // Otherwise, for I64X2 multiplication we describe a lane A as being
1682 // composed of a 32-bit upper half "Ah" and a 32-bit lower half
1683 // "Al". The 32-bit long hand multiplication can then be written
1684 // as:
1685 // Ah Al
1686 // * Bh Bl
1687 // -----
1688 // Al * Bl
1689 // + (Ah * Bl) << 32
1690 // + (Al * Bh) << 32
1691 //
1692 // So for each lane we will compute:
1693 // A * B = (Al * Bl) + ((Ah * Bl) + (Al * Bh)) << 32
1694 //
1695 // Note, the algorithm will use pmuldq which operates directly
1696 // on the lower 32-bit (Al or Bl) of a lane and writes the
1697 // result to the full 64-bits of the lane of the destination.
1698 // For this reason we don't need shifts to isolate the lower
1699 // 32-bits, however, we will need to use shifts to isolate the
1700 // high 32-bits when doing calculations, i.e., Ah == A >> 32.
1701 //
1702 // The full sequence then is as follows:
1703 // A' = A
1704 // A' = A' >> 32
1705 // A' = Ah' * Bl
1706 // B' = B
1707 // B' = B' >> 32
1708 // B' = Bh' * Al
1709 // B' = B' + A'
1710 // B' = B' << 32
1711 // A' = A
1712 // A' = Al' * Bl
1713 // A' = A' + B'
1714 // dst = A'
1715
1716 // A' = A
1717 let rhs_1 = ctx.alloc_tmp(types::I64X2).only_reg().unwrap();
1718 ctx.emit(Inst::gen_move(rhs_1, rhs, ty));
1719
1720 // A' = A' >> 32
1721 // A' = Ah' * Bl
1722 ctx.emit(Inst::xmm_rmi_reg(
1723 SseOpcode::Psrlq,
1724 RegMemImm::imm(32),
1725 rhs_1,
1726 ));
1727 ctx.emit(Inst::xmm_rm_r(
1728 SseOpcode::Pmuludq,
1729 RegMem::reg(lhs.clone()),
1730 rhs_1,
1731 ));
1732
1733 // B' = B
1734 let lhs_1 = ctx.alloc_tmp(types::I64X2).only_reg().unwrap();
1735 ctx.emit(Inst::gen_move(lhs_1, lhs, ty));
1736
1737 // B' = B' >> 32
1738 // B' = Bh' * Al
1739 ctx.emit(Inst::xmm_rmi_reg(
1740 SseOpcode::Psrlq,
1741 RegMemImm::imm(32),
1742 lhs_1,
1743 ));
1744 ctx.emit(Inst::xmm_rm_r(SseOpcode::Pmuludq, RegMem::reg(rhs), lhs_1));
1745
1746 // B' = B' + A'
1747 // B' = B' << 32
1748 ctx.emit(Inst::xmm_rm_r(
1749 SseOpcode::Paddq,
1750 RegMem::reg(rhs_1.to_reg()),
1751 lhs_1,
1752 ));
1753 ctx.emit(Inst::xmm_rmi_reg(
1754 SseOpcode::Psllq,
1755 RegMemImm::imm(32),
1756 lhs_1,
1757 ));
1758
1759 // A' = A
1760 // A' = Al' * Bl
1761 // A' = A' + B'
1762 // dst = A'
1763 ctx.emit(Inst::gen_move(rhs_1, rhs, ty));
1764 ctx.emit(Inst::xmm_rm_r(
1765 SseOpcode::Pmuludq,
1766 RegMem::reg(lhs.clone()),
1767 rhs_1,
1768 ));
1769 ctx.emit(Inst::xmm_rm_r(
1770 SseOpcode::Paddq,
1771 RegMem::reg(lhs_1.to_reg()),
1772 rhs_1,
1773 ));
1774 ctx.emit(Inst::gen_move(dst, rhs_1.to_reg(), ty));
1775 }
1776 } else if ty.lane_count() > 1 {
1777 // Emit single instruction lowerings for the remaining vector
1778 // multiplications.
1779 let sse_op = match ty {
1780 types::I16X8 => SseOpcode::Pmullw,
1781 types::I32X4 => SseOpcode::Pmulld,
1782 _ => panic!("Unsupported type for packed imul instruction: {}", ty),
1783 };
1784 let lhs = put_input_in_reg(ctx, inputs[0]);
1785 let rhs = input_to_reg_mem(ctx, inputs[1]);
1786 let dst = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
1787
1788 // Move the `lhs` to the same register as `dst`.
1789 ctx.emit(Inst::gen_move(dst, lhs, ty));
1790 ctx.emit(Inst::xmm_rm_r(sse_op, rhs, dst));
1791 } else if ty == types::I128 || ty == types::B128 {
1792 // Handle 128-bit multiplications.
1793 let lhs = put_input_in_regs(ctx, inputs[0]);
1794 let rhs = put_input_in_regs(ctx, inputs[1]);
1795 let dst = get_output_reg(ctx, outputs[0]);
1796 assert_eq!(lhs.len(), 2);
1797 assert_eq!(rhs.len(), 2);
1798 assert_eq!(dst.len(), 2);
1799
1800 // mul:
1801 // dst_lo = lhs_lo * rhs_lo
1802 // dst_hi = umulhi(lhs_lo, rhs_lo) + lhs_lo * rhs_hi + lhs_hi * rhs_lo
1803 //
                // so we emit:
                //     mov dst_lo, lhs_lo
                //     mul dst_lo, rhs_lo
                //     mov dst_hi, lhs_lo
                //     mul dst_hi, rhs_hi
                //     mov tmp, lhs_hi
                //     mul tmp, rhs_lo
                //     add dst_hi, tmp
                //     mov rax, lhs_lo
                //     umulhi rhs_lo // implicit rax arg; high half written to rdx
                //     add dst_hi, rdx
                let tmp = ctx.alloc_tmp(types::I64).only_reg().unwrap();
                ctx.emit(Inst::gen_move(dst.regs()[0], lhs.regs()[0], types::I64));
                ctx.emit(Inst::alu_rmi_r(
                    OperandSize::Size64,
                    AluRmiROpcode::Mul,
                    RegMemImm::reg(rhs.regs()[0]),
                    dst.regs()[0],
                ));
                ctx.emit(Inst::gen_move(dst.regs()[1], lhs.regs()[0], types::I64));
                ctx.emit(Inst::alu_rmi_r(
                    OperandSize::Size64,
                    AluRmiROpcode::Mul,
                    RegMemImm::reg(rhs.regs()[1]),
                    dst.regs()[1],
                ));
                ctx.emit(Inst::gen_move(tmp, lhs.regs()[1], types::I64));
                ctx.emit(Inst::alu_rmi_r(
                    OperandSize::Size64,
                    AluRmiROpcode::Mul,
                    RegMemImm::reg(rhs.regs()[0]),
                    tmp,
                ));
                ctx.emit(Inst::alu_rmi_r(
                    OperandSize::Size64,
                    AluRmiROpcode::Add,
                    RegMemImm::reg(tmp.to_reg()),
                    dst.regs()[1],
                ));
                ctx.emit(Inst::gen_move(
                    Writable::from_reg(regs::rax()),
                    lhs.regs()[0],
                    types::I64,
                ));
                ctx.emit(Inst::mul_hi(
                    OperandSize::Size64,
                    /* signed = */ false,
                    RegMem::reg(rhs.regs()[0]),
                ));
                ctx.emit(Inst::alu_rmi_r(
                    OperandSize::Size64,
                    AluRmiROpcode::Add,
                    RegMemImm::reg(regs::rdx()),
                    dst.regs()[1],
                ));
            } else {
                let size = if ty == types::I64 {
                    OperandSize::Size64
                } else {
                    OperandSize::Size32
                };
                let alu_op = AluRmiROpcode::Mul;

                // Multiplication is commutative, so we can try to commute operands
                // when one of them is an immediate or a direct memory reference:
                // convert the LHS to an RMI operand first; if it turned out to be a
                // plain register, convert the RHS to RMI instead; otherwise put the
                // RHS in a register and use the LHS as the RMI operand.
                let lhs = input_to_reg_mem_imm(ctx, inputs[0]);
                let (lhs, rhs) = if let RegMemImm::Reg { reg: lhs_reg } = lhs {
                    let rhs = input_to_reg_mem_imm(ctx, inputs[1]);
                    (lhs_reg, rhs)
                } else {
                    let rhs_reg = put_input_in_reg(ctx, inputs[1]);
                    (rhs_reg, lhs)
                };

                let dst = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
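                // A full 64-bit move is safe here regardless of `ty`: the multiply
                // below only reads the low `size` bits of `dst`.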
                ctx.emit(Inst::mov_r_r(OperandSize::Size64, lhs, dst));
                ctx.emit(Inst::alu_rmi_r(size, alu_op, rhs, dst));
            }
        }

        Opcode::BandNot => {
            let ty = ty.unwrap();
            debug_assert!(ty.is_vector() && ty.bytes() == 16);
            let lhs = input_to_reg_mem(ctx, inputs[0]);
            let rhs = put_input_in_reg(ctx, inputs[1]);
            let dst = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
            let sse_op = match ty {
                types::F32X4 => SseOpcode::Andnps,
                types::F64X2 => SseOpcode::Andnpd,
                _ => SseOpcode::Pandn,
            };
            // Note the flipping of operands: the `rhs` operand is used as the destination instead
            // of the `lhs` as in the other bit operations above (e.g. `band`).
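            // (The and-not instructions compute `!dst & src`, so with `rhs` in `dst`
            // the result is `lhs & !rhs`, which is exactly `band_not(lhs, rhs)`.)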
            ctx.emit(Inst::gen_move(dst, rhs, ty));
            ctx.emit(Inst::xmm_rm_r(sse_op, lhs, dst));
        }

        Opcode::Iabs => {
            let src = input_to_reg_mem(ctx, inputs[0]);
            let dst = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
            let ty = ty.unwrap();
            if ty == types::I64X2 {
                if isa_flags.use_avx512f_simd() || isa_flags.use_avx512vl_simd() {
                    ctx.emit(Inst::xmm_unary_rm_r_evex(Avx512Opcode::Vpabsq, src, dst));
                } else {
                    // If `VPABSQ` from AVX512 is unavailable, we compute `0 - src` into a
                    // separate register, `tmp`, and then use `BLENDVPD` with `tmp` as the
                    // mask to choose, per lane, between `src` and its negation: a set MSB
                    // in `tmp` means `tmp` is negative, i.e. `src` was originally positive.

                    // Emit all 0s into the `tmp` register.
                    let tmp = ctx.alloc_tmp(ty).only_reg().unwrap();
                    ctx.emit(Inst::xmm_rm_r(SseOpcode::Pxor, RegMem::from(tmp), tmp));
                    // Subtract the lanes from 0 and set up `dst`.
                    ctx.emit(Inst::xmm_rm_r(SseOpcode::Psubq, src.clone(), tmp));
                    ctx.emit(Inst::gen_move(dst, tmp.to_reg(), ty));
                    // Blend the original lanes back in where `tmp` has an MSB of 1 (i.e.
                    // where `src` was positive), keeping the negated lanes elsewhere.
                    // BLENDVPD's semantics require the "choice" mask to be in XMM0.
                    ctx.emit(Inst::gen_move(
                        Writable::from_reg(regs::xmm0()),
                        tmp.to_reg(),
                        ty,
                    ));
                    ctx.emit(Inst::xmm_rm_r(SseOpcode::Blendvpd, src, dst));
                }
            } else if ty.is_vector() {
                let opcode = match ty {
                    types::I8X16 => SseOpcode::Pabsb,
                    types::I16X8 => SseOpcode::Pabsw,
                    types::I32X4 => SseOpcode::Pabsd,
                    _ => panic!("Unsupported type for packed iabs instruction: {}", ty),
                };
                ctx.emit(Inst::xmm_unary_rm_r(opcode, src, dst));
            } else {
                unimplemented!("iabs is unimplemented for non-vector type: {}", ty);
            }
        }

        Opcode::Imax | Opcode::Umax | Opcode::Imin | Opcode::Umin => {
            let lhs = put_input_in_reg(ctx, inputs[0]);
            let rhs = input_to_reg_mem(ctx, inputs[1]);
            let dst = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
            let ty = ty.unwrap();
            if ty.is_vector() {
                let sse_op = match op {
                    Opcode::Imax => match ty {
                        types::I8X16 => SseOpcode::Pmaxsb,
                        types::I16X8 => SseOpcode::Pmaxsw,
                        types::I32X4 => SseOpcode::Pmaxsd,
                        _ => panic!("Unsupported type for packed {} instruction: {}", op, ty),
                    },
                    Opcode::Umax => match ty {
                        types::I8X16 => SseOpcode::Pmaxub,
                        types::I16X8 => SseOpcode::Pmaxuw,
                        types::I32X4 => SseOpcode::Pmaxud,
                        _ => panic!("Unsupported type for packed {} instruction: {}", op, ty),
                    },
                    Opcode::Imin => match ty {
                        types::I8X16 => SseOpcode::Pminsb,
                        types::I16X8 => SseOpcode::Pminsw,
                        types::I32X4 => SseOpcode::Pminsd,
                        _ => panic!("Unsupported type for packed {} instruction: {}", op, ty),
                    },
                    Opcode::Umin => match ty {
                        types::I8X16 => SseOpcode::Pminub,
                        types::I16X8 => SseOpcode::Pminuw,
                        types::I32X4 => SseOpcode::Pminud,
                        _ => panic!("Unsupported type for packed {} instruction: {}", op, ty),
                    },
                    _ => unreachable!("This is a bug: the external and internal `match op` should be over the same opcodes."),
                };

                // Move the `lhs` to the same register as `dst`.
                ctx.emit(Inst::gen_move(dst, lhs, ty));
                ctx.emit(Inst::xmm_rm_r(sse_op, rhs, dst));
            } else {
                panic!("Unsupported type for {} instruction: {}", op, ty);
            }
        }

        Opcode::Bnot => {
            let ty = ty.unwrap();

            if ty.is_vector() {
                let src = put_input_in_reg(ctx, inputs[0]);
                let dst = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
                ctx.emit(Inst::gen_move(dst, src, ty));
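                // Invert by XOR'ing with all 1s: comparing `tmp` for equality with
                // itself below sets every bit of `tmp`, and the XOR then flips every
                // bit of `dst`.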
                let tmp = ctx.alloc_tmp(ty).only_reg().unwrap();
                ctx.emit(Inst::equals(ty, RegMem::from(tmp), tmp));
                ctx.emit(Inst::xor(ty, RegMem::from(tmp), dst));
            } else if ty == types::I128 || ty == types::B128 {
                let src = put_input_in_regs(ctx, inputs[0]);
                let dst = get_output_reg(ctx, outputs[0]);
                ctx.emit(Inst::gen_move(dst.regs()[0], src.regs()[0], types::I64));
                ctx.emit(Inst::not(OperandSize::Size64, dst.regs()[0]));
                ctx.emit(Inst::gen_move(dst.regs()[1], src.regs()[1], types::I64));
                ctx.emit(Inst::not(OperandSize::Size64, dst.regs()[1]));
            } else if ty.is_bool() {
                unimplemented!("bool bnot")
            } else {
                let src = put_input_in_reg(ctx, inputs[0]);
                let dst = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
                ctx.emit(Inst::gen_move(dst, src, ty));
                ctx.emit(Inst::not(OperandSize::from_ty(ty), dst));
            }
        }

        Opcode::Bitselect => {
            let ty = ty.unwrap();
            let condition = put_input_in_reg(ctx, inputs[0]);
            let if_true = put_input_in_reg(ctx, inputs[1]);
            let if_false = input_to_reg_mem(ctx, inputs[2]);
            let dst = get_output_reg(ctx, outputs[0]).only_reg().unwrap();

            if ty.is_vector() {
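                // Lower `bitselect` as `dst = (if_true & condition) | (if_false & !condition)`,
                // computing the two halves in `tmp1` and `tmp2` and OR'ing them together.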
                let tmp1 = ctx.alloc_tmp(ty).only_reg().unwrap();
                ctx.emit(Inst::gen_move(tmp1, if_true, ty));
                ctx.emit(Inst::and(ty, RegMem::reg(condition.clone()), tmp1));

                let tmp2 = ctx.alloc_tmp(ty).only_reg().unwrap();
                ctx.emit(Inst::gen_move(tmp2, condition, ty));
                ctx.emit(Inst::and_not(ty, if_false, tmp2));

                ctx.emit(Inst::gen_move(dst, tmp2.to_reg(), ty));
                ctx.emit(Inst::or(ty, RegMem::from(tmp1), dst));
            } else {
                unimplemented!("no lowering for scalar bitselect instruction")
            }
        }

        Opcode::Vselect => {
            let ty = ty.unwrap();
            let condition = put_input_in_reg(ctx, inputs[0]);
            let condition_ty = ctx.input_ty(insn, 0);
            let if_true = input_to_reg_mem(ctx, inputs[1]);
            let if_false = put_input_in_reg(ctx, inputs[2]);
            let dst = get_output_reg(ctx, outputs[0]).only_reg().unwrap();

            if ty.is_vector() {
                // `vselect` relies on the bit representation of the condition:
                // vector boolean types are defined in Cranelift to be all 1s or
                // all 0s. This lowering relies on that fact to use x86's
                // variable blend instructions, which look at the _high bit_ of
                // the condition mask. All the bits of vector booleans will
                // match (all 1s or all 0s), so we can just use the high bit.
                assert!(condition_ty.lane_type().is_bool());

                // Variable blend instructions expect the condition mask to be
                // in XMM0.
                let xmm0 = Writable::from_reg(regs::xmm0());
                ctx.emit(Inst::gen_move(xmm0, condition, ty));

                // Match up the source and destination registers for regalloc.
                ctx.emit(Inst::gen_move(dst, if_false, ty));

                // Technically PBLENDVB would work in all cases (since the bytes
                // inside the mask will be all 1s or 0s we can blend
                // byte-by-byte instead of word-by-word, e.g.) but
                // type-specialized versions are included here for clarity when
                // troubleshooting and due to slight improvements in
                // latency/throughput on certain processor families.
                let opcode = match condition_ty {
                    types::B64X2 => SseOpcode::Blendvpd,
                    types::B32X4 => SseOpcode::Blendvps,
                    types::B16X8 | types::B8X16 => SseOpcode::Pblendvb,
                    _ => unimplemented!("unable to lower vselect for type: {}", condition_ty),
                };
                ctx.emit(Inst::xmm_rm_r(opcode, if_true, dst));
            } else {
                unimplemented!("no lowering for scalar vselect instruction")
            }
        }

        Opcode::Ishl | Opcode::Ushr | Opcode::Sshr | Opcode::Rotl | Opcode::Rotr => {
            let dst_ty = ctx.output_ty(insn, 0);
            debug_assert_eq!(ctx.input_ty(insn, 0), dst_ty);

            if !dst_ty.is_vector() && dst_ty.bits() <= 64 {
                // Scalar shifts on x86 have various encodings:
                // - shift by one bit, e.g. `SAL r/m8, 1` (not used here)
                // - shift by an immediate amount, e.g. `SAL r/m8, imm8`
                // - shift by a dynamic amount but only from the CL register, e.g. `SAL r/m8, CL`.
                // This implementation uses the last two encoding methods.
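                // For i8/i16 right shifts, the value must first be extended to 32 bits:
                // a right shift pulls bits in from above the narrow value, so the upper
                // bits must hold zeroes (ushr) or copies of the sign bit (sshr). Left
                // shifts and rotates never read those upper bits, so a plain move
                // suffices there.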
                let (size, lhs) = match dst_ty {
                    types::I8 | types::I16 => match op {
                        Opcode::Ishl => (OperandSize::Size32, put_input_in_reg(ctx, inputs[0])),
                        Opcode::Ushr => (
                            OperandSize::Size32,
                            extend_input_to_reg(ctx, inputs[0], ExtSpec::ZeroExtendTo32),
                        ),
                        Opcode::Sshr => (
                            OperandSize::Size32,
                            extend_input_to_reg(ctx, inputs[0], ExtSpec::SignExtendTo32),
                        ),
                        Opcode::Rotl | Opcode::Rotr => (
                            OperandSize::from_ty(dst_ty),
                            put_input_in_reg(ctx, inputs[0]),
                        ),
                        _ => unreachable!(),
                    },
                    types::I32 | types::I64 => (
                        OperandSize::from_ty(dst_ty),
                        put_input_in_reg(ctx, inputs[0]),
                    ),
                    _ => unreachable!("unhandled output type for shift/rotates: {}", dst_ty),
                };

                let (count, rhs) =
                    if let Some(cst) = ctx.get_input_as_source_or_const(insn, 1).constant {
                        // Mask count, according to Cranelift's semantics.
                        let cst = (cst as u8) & (dst_ty.bits() as u8 - 1);
                        (Some(cst), None)
                    } else {
                        // We can ignore the upper registers of a multi-reg shift amount,
                        // because the shift amount is taken modulo the bit width of the
                        // first operand anyway.
                        (None, Some(put_input_in_regs(ctx, inputs[1]).regs()[0]))
                    };

                let dst = get_output_reg(ctx, outputs[0]).only_reg().unwrap();

                let shift_kind = match op {
                    Opcode::Ishl => ShiftKind::ShiftLeft,
                    Opcode::Ushr => ShiftKind::ShiftRightLogical,
                    Opcode::Sshr => ShiftKind::ShiftRightArithmetic,
                    Opcode::Rotl => ShiftKind::RotateLeft,
                    Opcode::Rotr => ShiftKind::RotateRight,
                    _ => unreachable!(),
                };

                let w_rcx = Writable::from_reg(regs::rcx());
                ctx.emit(Inst::mov_r_r(OperandSize::Size64, lhs, dst));
                if count.is_none() {
                    ctx.emit(Inst::mov_r_r(OperandSize::Size64, rhs.unwrap(), w_rcx));
                }
                ctx.emit(Inst::shift_r(size, shift_kind, count, dst));
            } else if dst_ty == types::I128 {
                let amt_src = put_input_in_regs(ctx, inputs[1]).regs()[0];
                let src = put_input_in_regs(ctx, inputs[0]);
                let dst = get_output_reg(ctx, outputs[0]);

                match op {
                    Opcode::Ishl => {
                        emit_shl_i128(ctx, src, dst, amt_src);
                    }
                    Opcode::Ushr => {
                        emit_shr_i128(ctx, src, dst, amt_src, /* is_signed = */ false);
                    }
                    Opcode::Sshr => {
                        emit_shr_i128(ctx, src, dst, amt_src, /* is_signed = */ true);
                    }
                    Opcode::Rotl => {
                        // (mov tmp, src)
                        // (shl.i128 tmp, amt)
                        // (mov dst, src)
                        // (ushr.i128 dst, 128-amt)
                        // (or dst, tmp)
                        let tmp = ctx.alloc_tmp(types::I128);
                        emit_shl_i128(ctx, src, tmp, amt_src);
                        let inv_amt = ctx.alloc_tmp(types::I64).only_reg().unwrap();
                        ctx.emit(Inst::imm(OperandSize::Size64, 128, inv_amt));
                        ctx.emit(Inst::alu_rmi_r(
                            OperandSize::Size64,
                            AluRmiROpcode::Sub,
                            RegMemImm::reg(amt_src),
                            inv_amt,
                        ));
                        emit_shr_i128(
                            ctx,
                            src,
                            dst,
                            inv_amt.to_reg(),
                            /* is_signed = */ false,
                        );
                        ctx.emit(Inst::alu_rmi_r(
                            OperandSize::Size64,
                            AluRmiROpcode::Or,
                            RegMemImm::reg(tmp.regs()[0].to_reg()),
                            dst.regs()[0],
                        ));
                        ctx.emit(Inst::alu_rmi_r(
                            OperandSize::Size64,
                            AluRmiROpcode::Or,
                            RegMemImm::reg(tmp.regs()[1].to_reg()),
                            dst.regs()[1],
                        ));
                    }
                    Opcode::Rotr => {
                        // (mov tmp, src)
                        // (ushr.i128 tmp, amt)
                        // (mov dst, src)
                        // (shl.i128 dst, 128-amt)
                        // (or dst, tmp)
                        let tmp = ctx.alloc_tmp(types::I128);
                        emit_shr_i128(ctx, src, tmp, amt_src, /* is_signed = */ false);
                        let inv_amt = ctx.alloc_tmp(types::I64).only_reg().unwrap();
                        ctx.emit(Inst::imm(OperandSize::Size64, 128, inv_amt));
                        ctx.emit(Inst::alu_rmi_r(
                            OperandSize::Size64,
                            AluRmiROpcode::Sub,
                            RegMemImm::reg(amt_src),
                            inv_amt,
                        ));
                        emit_shl_i128(ctx, src, dst, inv_amt.to_reg());
                        ctx.emit(Inst::alu_rmi_r(
                            OperandSize::Size64,
                            AluRmiROpcode::Or,
                            RegMemImm::reg(tmp.regs()[0].to_reg()),
                            dst.regs()[0],
                        ));
                        ctx.emit(Inst::alu_rmi_r(
                            OperandSize::Size64,
                            AluRmiROpcode::Or,
                            RegMemImm::reg(tmp.regs()[1].to_reg()),
                            dst.regs()[1],
                        ));
                    }
                    _ => unreachable!(),
                }
            } else if dst_ty == types::I8X16 && (op == Opcode::Ishl || op == Opcode::Ushr) {
                // Since the x86 instruction set does not have any 8x16 shift instructions (even in
                // higher feature sets like AVX), we lower the `ishl.i8x16` and `ushr.i8x16` to a
                // sequence of instructions. The basic idea, whether the `shift_by` amount is an
                // immediate or not, is to use a 16x8 shift and then mask off the incorrect bits to
                // 0s (see below for handling signs in `sshr.i8x16`).
                let src = put_input_in_reg(ctx, inputs[0]);
                let shift_by = input_to_reg_mem_imm(ctx, inputs[1]);
                let dst = get_output_reg(ctx, outputs[0]).only_reg().unwrap();

                // If necessary, move the shift index into the lowest bits of a vector register.
                let shift_by_moved = match &shift_by {
                    RegMemImm::Imm { .. } => shift_by.clone(),
                    RegMemImm::Reg { reg } => {
                        let tmp_shift_by = ctx.alloc_tmp(dst_ty).only_reg().unwrap();
                        ctx.emit(Inst::gpr_to_xmm(
                            SseOpcode::Movd,
                            RegMem::reg(*reg),
                            OperandSize::Size32,
                            tmp_shift_by,
                        ));
                        RegMemImm::reg(tmp_shift_by.to_reg())
                    }
                    RegMemImm::Mem { .. } => unimplemented!("load shift amount to XMM register"),
                };

                // Shift `src` using 16x8. Unfortunately, a 16x8 shift will only be correct for
                // half of the lanes; the others must be fixed up with the mask below.
                let shift_opcode = match op {
                    Opcode::Ishl => SseOpcode::Psllw,
                    Opcode::Ushr => SseOpcode::Psrlw,
                    _ => unimplemented!("{} is not implemented for type {}", op, dst_ty),
                };
                ctx.emit(Inst::gen_move(dst, src, dst_ty));
                ctx.emit(Inst::xmm_rmi_reg(shift_opcode, shift_by_moved, dst));

                // Choose which mask to use to fixup the shifted lanes. Since we must use a 16x8
                // shift, we need to fix up the bits that migrate from one half of the lane to the
                // other. Each 16-byte mask (which rustfmt forces to multiple lines) is indexed by
                // the shift amount: e.g. if we shift right by 0 (no movement), we want to retain
                // all the bits so we mask with `0xff`; if we shift right by 1, we want to retain
                // all bits except the MSB so we mask with `0x7f`; etc.
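                // For example, a 16x8 logical right shift by 1 leaks the LSB of each
                // lane's high byte into the MSB of its low byte; AND'ing every byte
                // with 0x7f (the mask entry for a shift amount of 1) clears exactly
                // that leaked bit.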
                const USHR_MASKS: [u8; 128] = [
                    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
                    0xff, 0xff, 0xff, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f,
                    0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f,
                    0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x1f, 0x1f, 0x1f, 0x1f,
                    0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x0f,
                    0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f,
                    0x0f, 0x0f, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07,
                    0x07, 0x07, 0x07, 0x07, 0x07, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03,
                    0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x01, 0x01, 0x01, 0x01, 0x01,
                    0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01,
                ];
                const SHL_MASKS: [u8; 128] = [
                    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
                    0xff, 0xff, 0xff, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe,
                    0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfc, 0xfc, 0xfc, 0xfc, 0xfc, 0xfc, 0xfc,
                    0xfc, 0xfc, 0xfc, 0xfc, 0xfc, 0xfc, 0xfc, 0xfc, 0xfc, 0xf8, 0xf8, 0xf8, 0xf8,
                    0xf8, 0xf8, 0xf8, 0xf8, 0xf8, 0xf8, 0xf8, 0xf8, 0xf8, 0xf8, 0xf8, 0xf8, 0xf0,
                    0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0,
                    0xf0, 0xf0, 0xe0, 0xe0, 0xe0, 0xe0, 0xe0, 0xe0, 0xe0, 0xe0, 0xe0, 0xe0, 0xe0,
                    0xe0, 0xe0, 0xe0, 0xe0, 0xe0, 0xc0, 0xc0, 0xc0, 0xc0, 0xc0, 0xc0, 0xc0, 0xc0,
                    0xc0, 0xc0, 0xc0, 0xc0, 0xc0, 0xc0, 0xc0, 0xc0, 0x80, 0x80, 0x80, 0x80, 0x80,
                    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
                ];
                let mask = match op {
                    Opcode::Ishl => &SHL_MASKS,
                    Opcode::Ushr => &USHR_MASKS,
                    _ => unimplemented!("{} is not implemented for type {}", op, dst_ty),
                };

                // Figure out the address of the shift mask.
                let mask_address = match shift_by {
                    RegMemImm::Imm { simm32 } => {
                        // When the shift amount is known, we can statically (i.e. at compile time)
                        // determine the mask to use and only emit that.
                        debug_assert!(simm32 < 8);
                        let mask_offset = simm32 as usize * 16;
                        let mask_constant = ctx.use_constant(VCodeConstantData::WellKnown(
                            &mask[mask_offset..mask_offset + 16],
                        ));
                        SyntheticAmode::ConstantOffset(mask_constant)
                    }
                    RegMemImm::Reg { reg } => {
                        // Otherwise, we must emit the entire mask table and find the correct mask
                        // offset in the table dynamically (i.e. at run time). We do this using LEA
                        // to find the base address of the mask table, and then complex addressing
                        // to offset to the right mask: `base_address + shift_by * 16` (each mask
                        // is 16 bytes long, hence the shift-left by 4 below).
                        let base_mask_address = ctx.alloc_tmp(types::I64).only_reg().unwrap();
                        let mask_offset = ctx.alloc_tmp(types::I64).only_reg().unwrap();
                        let mask_constant = ctx.use_constant(VCodeConstantData::WellKnown(mask));
                        ctx.emit(Inst::lea(
                            SyntheticAmode::ConstantOffset(mask_constant),
                            base_mask_address,
                        ));
                        ctx.emit(Inst::gen_move(mask_offset, reg, types::I64));
                        ctx.emit(Inst::shift_r(
                            OperandSize::Size64,
                            ShiftKind::ShiftLeft,
                            Some(4),
                            mask_offset,
                        ));
                        Amode::imm_reg_reg_shift(
                            0,
                            base_mask_address.to_reg(),
                            mask_offset.to_reg(),
                            0,
                        )
                        .into()
                    }
                    RegMemImm::Mem { addr: _ } => unimplemented!("load mask address"),
                };

                // Load the mask into a temporary register, `mask_value`.
                let mask_value = ctx.alloc_tmp(dst_ty).only_reg().unwrap();
                ctx.emit(Inst::load(dst_ty, mask_address, mask_value, ExtKind::None));

                // Remove the bits that would have disappeared in a true 8x16 shift. TODO in the
                // future, this AND instruction could be coalesced with the load above.
                let sse_op = match dst_ty {
                    types::F32X4 => SseOpcode::Andps,
                    types::F64X2 => SseOpcode::Andpd,
                    _ => SseOpcode::Pand,
                };
                ctx.emit(Inst::xmm_rm_r(sse_op, RegMem::from(mask_value), dst));
            } else if dst_ty == types::I8X16 && op == Opcode::Sshr {
                // Since the x86 instruction set does not have an 8x16 shift instruction and the
                // approach used for `ishl` and `ushr` cannot be easily used (the masks do not
                // preserve the sign), we use a different approach here: separate the low and high
                // lanes, shift them separately, and merge them into the final result. Visually,
                // this looks like the following, where `src.i8x16 = [s0, s1, ..., s15]`:
                //   low.i16x8 = [(s0, s0), (s1, s1), ..., (s7, s7)]
                //   shifted_low.i16x8 = shift each lane of `low`
                //   high.i16x8 = [(s8, s8), (s9, s9), ..., (s15, s15)]
                //   shifted_high.i16x8 = shift each lane of `high`
                //   dst.i8x16 = [s0'', s1'', ..., s15'']
                let src = put_input_in_reg(ctx, inputs[0]);
                let shift_by = input_to_reg_mem_imm(ctx, inputs[1]);
                let shift_by_ty = ctx.input_ty(insn, 1);
                let dst = get_output_reg(ctx, outputs[0]).only_reg().unwrap();

                // In order for PACKSSWB later to only use the high byte of each 16x8 lane, we
                // shift right an extra 8 bits, relying on PSRAW to fill in the upper bits
                // appropriately.
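                // (Unpacking a byte `s` with itself produces a 16-bit lane holding
                // `s << 8 | s`; an arithmetic shift by `amt + 8` thus leaves the
                // sign-extended value of `s >> amt` in the lane, which fits in a byte
                // and passes through PACKSSWB's signed saturation unchanged.)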
                let bigger_shift_by = match shift_by {
                    // When we know the shift amount at compile time, we add the extra shift amount
                    // statically.
                    RegMemImm::Imm { simm32 } => RegMemImm::imm(simm32 + 8),
                    // Otherwise we add instructions to add the extra shift amount and move the
                    // value into an XMM register.
                    RegMemImm::Reg { reg } => {
                        let bigger_shift_by_gpr = ctx.alloc_tmp(shift_by_ty).only_reg().unwrap();
                        ctx.emit(Inst::mov_r_r(OperandSize::Size64, reg, bigger_shift_by_gpr));

                        let size = if shift_by_ty == types::I64 {
                            OperandSize::Size64
                        } else {
                            OperandSize::Size32
                        };
                        let imm = RegMemImm::imm(8);
                        ctx.emit(Inst::alu_rmi_r(
                            size,
                            AluRmiROpcode::Add,
                            imm,
                            bigger_shift_by_gpr,
                        ));

                        let bigger_shift_by_xmm = ctx.alloc_tmp(dst_ty).only_reg().unwrap();
                        ctx.emit(Inst::gpr_to_xmm(
                            SseOpcode::Movd,
                            RegMem::from(bigger_shift_by_gpr),
                            OperandSize::Size32,
                            bigger_shift_by_xmm,
                        ));
                        RegMemImm::reg(bigger_shift_by_xmm.to_reg())
                    }
                    RegMemImm::Mem { .. } => unimplemented!("load shift amount to XMM register"),
                };

                // Unpack and shift the lower lanes of `src` into the `dst` register.
                ctx.emit(Inst::gen_move(dst, src, dst_ty));
                ctx.emit(Inst::xmm_rm_r(SseOpcode::Punpcklbw, RegMem::from(dst), dst));
                ctx.emit(Inst::xmm_rmi_reg(
                    SseOpcode::Psraw,
                    bigger_shift_by.clone(),
                    dst,
                ));

                // Unpack and shift the upper lanes of `src` into a temporary register,
                // `upper_lanes`.
                let upper_lanes = ctx.alloc_tmp(dst_ty).only_reg().unwrap();
                ctx.emit(Inst::gen_move(upper_lanes, src, dst_ty));
                ctx.emit(Inst::xmm_rm_r(
                    SseOpcode::Punpckhbw,
                    RegMem::from(upper_lanes),
                    upper_lanes,
                ));
                ctx.emit(Inst::xmm_rmi_reg(
                    SseOpcode::Psraw,
                    bigger_shift_by,
                    upper_lanes,
                ));

                // Merge the upper and lower shifted lanes into `dst`.
                ctx.emit(Inst::xmm_rm_r(
                    SseOpcode::Packsswb,
                    RegMem::from(upper_lanes),
                    dst,
                ));
            } else if dst_ty == types::I64X2 && op == Opcode::Sshr {
                // The `sshr.i64x2` CLIF instruction has no single x86 instruction in the older
                // feature sets; newer ones like AVX512VL and AVX512F include VPSRAQ, a 128-bit
                // instruction that would fit here, but this backend does not currently have
                // support for EVEX encodings (TODO when EVEX support is available, add an
                // alternate lowering here). To remedy this, we extract each 64-bit lane to a GPR,
                // shift each using a scalar instruction, and insert the shifted values back in the
                // `dst` XMM register.
                let src = put_input_in_reg(ctx, inputs[0]);
                let dst = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
                ctx.emit(Inst::gen_move(dst, src, dst_ty));

                // Extract the upper and lower lanes into temporary GPRs.
                let lower_lane = ctx.alloc_tmp(types::I64).only_reg().unwrap();
                emit_extract_lane(ctx, src, lower_lane, 0, types::I64);
                let upper_lane = ctx.alloc_tmp(types::I64).only_reg().unwrap();
                emit_extract_lane(ctx, src, upper_lane, 1, types::I64);

                // Shift each value.
                let mut shift = |reg: Writable<Reg>| {
                    let kind = ShiftKind::ShiftRightArithmetic;
                    if let Some(shift_by) = ctx.get_input_as_source_or_const(insn, 1).constant {
                        // Mask the shift amount according to Cranelift's semantics.
                        let shift_by = (shift_by as u8) & (types::I64.bits() as u8 - 1);
                        ctx.emit(Inst::shift_r(
                            OperandSize::Size64,
                            kind,
                            Some(shift_by),
                            reg,
                        ));
                    } else {
                        let dynamic_shift_by = put_input_in_reg(ctx, inputs[1]);
                        let w_rcx = Writable::from_reg(regs::rcx());
                        ctx.emit(Inst::mov_r_r(OperandSize::Size64, dynamic_shift_by, w_rcx));
                        ctx.emit(Inst::shift_r(OperandSize::Size64, kind, None, reg));
                    }
                };
                shift(lower_lane);
                shift(upper_lane);

                // Insert the scalar values back into the `dst` vector.
                emit_insert_lane(ctx, RegMem::from(lower_lane), dst, 0, types::I64);
                emit_insert_lane(ctx, RegMem::from(upper_lane), dst, 1, types::I64);
            } else {
                // For the remaining packed shifts not covered above, x86 has implementations that
                // can either:
                // - shift using an immediate
                // - shift using a dynamic value given in the lower bits of another XMM register.
                let src = put_input_in_reg(ctx, inputs[0]);
                let shift_by = input_to_reg_mem_imm(ctx, inputs[1]);
                let dst = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
                let sse_op = match dst_ty {
                    types::I16X8 => match op {
                        Opcode::Ishl => SseOpcode::Psllw,
                        Opcode::Ushr => SseOpcode::Psrlw,
                        Opcode::Sshr => SseOpcode::Psraw,
                        _ => unimplemented!("{} is not implemented for type {}", op, dst_ty),
                    },
                    types::I32X4 => match op {
                        Opcode::Ishl => SseOpcode::Pslld,
                        Opcode::Ushr => SseOpcode::Psrld,
                        Opcode::Sshr => SseOpcode::Psrad,
                        _ => unimplemented!("{} is not implemented for type {}", op, dst_ty),
                    },
                    types::I64X2 => match op {
                        Opcode::Ishl => SseOpcode::Psllq,
                        Opcode::Ushr => SseOpcode::Psrlq,
                        _ => unimplemented!("{} is not implemented for type {}", op, dst_ty),
                    },
                    _ => unreachable!(),
                };

                // If necessary, move the shift index into the lowest bits of a vector register.
                let shift_by = match shift_by {
                    RegMemImm::Imm { .. } => shift_by,
                    RegMemImm::Reg { reg } => {
                        let tmp_shift_by = ctx.alloc_tmp(dst_ty).only_reg().unwrap();
                        ctx.emit(Inst::gpr_to_xmm(
                            SseOpcode::Movd,
                            RegMem::reg(reg),
                            OperandSize::Size32,
                            tmp_shift_by,
                        ));
                        RegMemImm::reg(tmp_shift_by.to_reg())
                    }
                    RegMemImm::Mem { .. } => unimplemented!("load shift amount to XMM register"),
                };

                // Move the `src` to the same register as `dst`.
                ctx.emit(Inst::gen_move(dst, src, dst_ty));

                ctx.emit(Inst::xmm_rmi_reg(sse_op, shift_by, dst));
            }
        }

        Opcode::Ineg => {
            let dst = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
            let ty = ty.unwrap();

            if ty.is_vector() {
                // Zero out a register and then do a packed subtraction
                // of the input from that register.

                let src = input_to_reg_mem(ctx, inputs[0]);
                let tmp = ctx.alloc_tmp(types::I32X4).only_reg().unwrap();

                let subtract_opcode = match ty {
                    types::I8X16 => SseOpcode::Psubb,
                    types::I16X8 => SseOpcode::Psubw,
                    types::I32X4 => SseOpcode::Psubd,
                    types::I64X2 => SseOpcode::Psubq,
                    _ => panic!("Unsupported type for Ineg instruction, found {}", ty),
                };

                // Note we must zero out a tmp instead of using the destination register since
                // the destination could be an alias for the source input register.
                ctx.emit(Inst::xmm_rm_r(
                    SseOpcode::Pxor,
                    RegMem::reg(tmp.to_reg()),
                    tmp,
                ));
                ctx.emit(Inst::xmm_rm_r(subtract_opcode, src, tmp));
                ctx.emit(Inst::xmm_unary_rm_r(
                    SseOpcode::Movapd,
                    RegMem::reg(tmp.to_reg()),
                    dst,
                ));
            } else {
                let src = put_input_in_reg(ctx, inputs[0]);
                ctx.emit(Inst::gen_move(dst, src, ty));
                ctx.emit(Inst::neg(OperandSize::from_ty(ty), dst));
            }
        }

        Opcode::Clz => {
            let orig_ty = ty.unwrap();

            if isa_flags.use_lzcnt() && (orig_ty == types::I32 || orig_ty == types::I64) {
                // We can use a plain lzcnt instruction here. Note no special handling is required
                // for zero inputs, because the machine instruction does what the CLIF expects for
                // zero, i.e. it returns the operand width in bits.
                let src = input_to_reg_mem(ctx, inputs[0]);
                let dst = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
                ctx.emit(Inst::unary_rm_r(
                    OperandSize::from_ty(orig_ty),
                    UnaryRmROpcode::Lzcnt,
                    src,
                    dst,
                ));
                return Ok(());
            }

            // General formula using bit-scan reverse (BSR):
            //     mov -1, %dst
            //     bsr %src, %tmp
            //     cmovz %dst, %tmp
            //     mov $(size_bits - 1), %dst
            //     sub %tmp, %dst
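            // (For a zero input, BSR leaves its destination undefined and sets ZF, so
            // the CMOVZ substitutes the -1; the final subtraction then yields
            // `size_bits`, which is what CLIF defines for `clz(0)`.)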

            if orig_ty == types::I128 {
                // clz upper, tmp1
                // clz lower, dst
                // add dst, 64
                // cmp tmp1, 64
                // cmovnz tmp1, dst
                let dsts = get_output_reg(ctx, outputs[0]);
                let dst = dsts.regs()[0];
                let tmp1 = ctx.alloc_tmp(types::I64).only_reg().unwrap();
                let srcs = put_input_in_regs(ctx, inputs[0]);
                let src_lo = srcs.regs()[0];
                let src_hi = srcs.regs()[1];
                emit_clz(ctx, types::I64, types::I64, src_hi, tmp1);
                emit_clz(ctx, types::I64, types::I64, src_lo, dst);
                ctx.emit(Inst::alu_rmi_r(
                    OperandSize::Size64,
                    AluRmiROpcode::Add,
                    RegMemImm::imm(64),
                    dst,
                ));
                ctx.emit(Inst::cmp_rmi_r(
                    OperandSize::Size64,
                    RegMemImm::imm(64),
                    tmp1.to_reg(),
                ));
                ctx.emit(Inst::cmove(
                    OperandSize::Size64,
                    CC::NZ,
                    RegMem::reg(tmp1.to_reg()),
                    dst,
                ));
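                // Zero the result's high half.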
                ctx.emit(Inst::alu_rmi_r(
                    OperandSize::Size64,
                    AluRmiROpcode::Xor,
                    RegMemImm::reg(dsts.regs()[1].to_reg()),
                    dsts.regs()[1],
                ));
            } else {
                let (ext_spec, ty) = match orig_ty {
                    types::I8 | types::I16 => (Some(ExtSpec::ZeroExtendTo32), types::I32),
                    a if a == types::I32 || a == types::I64 => (None, a),
                    _ => unreachable!(),
                };
                let src = if let Some(ext_spec) = ext_spec {
                    extend_input_to_reg(ctx, inputs[0], ext_spec)
                } else {
                    put_input_in_reg(ctx, inputs[0])
                };

                let dst = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
                emit_clz(ctx, orig_ty, ty, src, dst);
            }
        }

        Opcode::Ctz => {
            let orig_ty = ctx.input_ty(insn, 0);

            if isa_flags.use_bmi1() && (orig_ty == types::I32 || orig_ty == types::I64) {
                // We can use a plain tzcnt instruction here. Note no special handling is required
                // for zero inputs, because the machine instruction does what the CLIF expects for
                // zero, i.e. it returns the operand width in bits.
                let src = input_to_reg_mem(ctx, inputs[0]);
                let dst = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
                ctx.emit(Inst::unary_rm_r(
                    OperandSize::from_ty(orig_ty),
                    UnaryRmROpcode::Tzcnt,
                    src,
                    dst,
                ));
                return Ok(());
            }

            // General formula using bit-scan forward (BSF):
            //     bsf %src, %dst
            //     mov $(size_bits), %tmp
            //     cmovz %tmp, %dst
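            // (Like BSR, BSF sets ZF for a zero input, so the CMOVZ substitutes
            // `size_bits`, which is what CLIF defines for `ctz(0)`.)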
            if orig_ty == types::I128 {
                // ctz src_lo, dst
                // ctz src_hi, tmp1
                // add tmp1, 64
                // cmp dst, 64
                // cmovz tmp1, dst
                let dsts = get_output_reg(ctx, outputs[0]);
                let dst = dsts.regs()[0];
                let tmp1 = ctx.alloc_tmp(types::I64).only_reg().unwrap();
                let srcs = put_input_in_regs(ctx, inputs[0]);
                let src_lo = srcs.regs()[0];
                let src_hi = srcs.regs()[1];
                emit_ctz(ctx, types::I64, types::I64, src_lo, dst);
                emit_ctz(ctx, types::I64, types::I64, src_hi, tmp1);
                ctx.emit(Inst::alu_rmi_r(
                    OperandSize::Size64,
                    AluRmiROpcode::Add,
                    RegMemImm::imm(64),
                    tmp1,
                ));
                ctx.emit(Inst::cmp_rmi_r(
                    OperandSize::Size64,
                    RegMemImm::imm(64),
                    dst.to_reg(),
                ));
                ctx.emit(Inst::cmove(
                    OperandSize::Size64,
                    CC::Z,
                    RegMem::reg(tmp1.to_reg()),
                    dst,
                ));
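                // Zero the result's high half.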
                ctx.emit(Inst::alu_rmi_r(
                    OperandSize::Size64,
                    AluRmiROpcode::Xor,
                    RegMemImm::reg(dsts.regs()[1].to_reg()),
                    dsts.regs()[1],
                ));
            } else {
                let ty = if orig_ty.bits() < 32 {
                    types::I32
                } else {
                    orig_ty
                };
                debug_assert!(ty == types::I32 || ty == types::I64);

                let src = put_input_in_reg(ctx, inputs[0]);
                let dst = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
                emit_ctz(ctx, orig_ty, ty, src, dst);
            }
        }

        Opcode::Popcnt => {
            let (ext_spec, ty) = match ctx.input_ty(insn, 0) {
                types::I8 | types::I16 => (Some(ExtSpec::ZeroExtendTo32), types::I32),
                a if a == types::I32 || a == types::I64 || a == types::I128 => (None, a),
                _ => unreachable!(),
            };

            if isa_flags.use_popcnt() {
                match ty {
                    types::I32 | types::I64 => {
                        let src = input_to_reg_mem(ctx, inputs[0]);
                        let dst = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
                        ctx.emit(Inst::unary_rm_r(
                            OperandSize::from_ty(ty),
                            UnaryRmROpcode::Popcnt,
                            src,
                            dst,
                        ));
                        return Ok(());
                    }

                    types::I128 => {
                        // The number of ones in a 128-bit value is the plain sum of the number of
                        // ones in its low and high parts. No risk of overflow here.
                        let dsts = get_output_reg(ctx, outputs[0]);
                        let dst = dsts.regs()[0];
                        let tmp = ctx.alloc_tmp(types::I64).only_reg().unwrap();
                        let srcs = put_input_in_regs(ctx, inputs[0]);
                        let src_lo = srcs.regs()[0];
                        let src_hi = srcs.regs()[1];

                        ctx.emit(Inst::unary_rm_r(
                            OperandSize::Size64,
                            UnaryRmROpcode::Popcnt,
                            RegMem::reg(src_lo),
                            dst,
                        ));
                        ctx.emit(Inst::unary_rm_r(
                            OperandSize::Size64,
                            UnaryRmROpcode::Popcnt,
                            RegMem::reg(src_hi),
                            tmp,
                        ));
                        ctx.emit(Inst::alu_rmi_r(
                            OperandSize::Size64,
                            AluRmiROpcode::Add,
                            RegMemImm::reg(tmp.to_reg()),
                            dst,
                        ));

                        // Zero the result's high component.
                        ctx.emit(Inst::alu_rmi_r(
                            OperandSize::Size64,
                            AluRmiROpcode::Xor,
                            RegMemImm::reg(dsts.regs()[1].to_reg()),
                            dsts.regs()[1],
                        ));

                        return Ok(());
                    }
                    _ => {}
                }
            }

            let (srcs, ty): (SmallVec<[RegMem; 2]>, Type) = if let Some(ext_spec) = ext_spec {
                (
                    smallvec![RegMem::reg(extend_input_to_reg(ctx, inputs[0], ext_spec))],
                    ty,
                )
            } else if ty == types::I128 {
                let regs = put_input_in_regs(ctx, inputs[0]);
                (
                    smallvec![RegMem::reg(regs.regs()[0]), RegMem::reg(regs.regs()[1])],
                    types::I64,
                )
            } else {
                // N.B.: explicitly put input in a reg here because the width of the instruction
                // into which this RM op goes may not match the width of the input type (in fact,
                // it won't for i32.popcnt), and we don't want a larger than necessary load.
                (smallvec![RegMem::reg(put_input_in_reg(ctx, inputs[0]))], ty)
            };

            let mut dsts: SmallVec<[Reg; 2]> = smallvec![];
            for src in srcs {
                let dst = ctx.alloc_tmp(types::I64).only_reg().unwrap();
                dsts.push(dst.to_reg());
                if ty == types::I64 {
                    let tmp1 = ctx.alloc_tmp(types::I64).only_reg().unwrap();
                    let tmp2 = ctx.alloc_tmp(types::I64).only_reg().unwrap();
                    let cst = ctx.alloc_tmp(types::I64).only_reg().unwrap();

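                    // The sequence below is a branchless SWAR popcount. Subtracting
                    // the three successively shifted and masked copies leaves each
                    // nibble of `tmp2` holding the popcount of the corresponding
                    // nibble of `src`: for a 4-bit value n,
                    // n - (n>>1) - (n>>2) - (n>>3) == popcount(n), and the 0x7777...
                    // masks keep bits from leaking across nibble boundaries. The tail
                    // then folds nibble pairs into bytes and multiplies by
                    // 0x0101_0101_0101_0101 so the top byte accumulates the total,
                    // which the final shift extracts.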
                    // mov src, tmp1
                    ctx.emit(Inst::mov64_rm_r(src.clone(), tmp1));

                    // shr $1, tmp1
                    ctx.emit(Inst::shift_r(
                        OperandSize::Size64,
                        ShiftKind::ShiftRightLogical,
                        Some(1),
                        tmp1,
                    ));

                    // mov $0x7777_7777_7777_7777, cst
                    ctx.emit(Inst::imm(OperandSize::Size64, 0x7777777777777777, cst));

                    // and cst, tmp1
                    ctx.emit(Inst::alu_rmi_r(
                        OperandSize::Size64,
                        AluRmiROpcode::And,
                        RegMemImm::reg(cst.to_reg()),
                        tmp1,
                    ));

                    // mov src, tmp2
                    ctx.emit(Inst::mov64_rm_r(src, tmp2));

                    // sub tmp1, tmp2
                    ctx.emit(Inst::alu_rmi_r(
                        OperandSize::Size64,
                        AluRmiROpcode::Sub,
                        RegMemImm::reg(tmp1.to_reg()),
                        tmp2,
                    ));

                    // shr $1, tmp1
                    ctx.emit(Inst::shift_r(
                        OperandSize::Size64,
                        ShiftKind::ShiftRightLogical,
                        Some(1),
                        tmp1,
                    ));

                    // and cst, tmp1
                    ctx.emit(Inst::alu_rmi_r(
                        OperandSize::Size64,
                        AluRmiROpcode::And,
                        RegMemImm::reg(cst.to_reg()),
                        tmp1,
                    ));

                    // sub tmp1, tmp2
                    ctx.emit(Inst::alu_rmi_r(
                        OperandSize::Size64,
                        AluRmiROpcode::Sub,
                        RegMemImm::reg(tmp1.to_reg()),
                        tmp2,
                    ));

                    // shr $1, tmp1
                    ctx.emit(Inst::shift_r(
                        OperandSize::Size64,
                        ShiftKind::ShiftRightLogical,
                        Some(1),
                        tmp1,
                    ));

                    // and cst, tmp1
                    ctx.emit(Inst::alu_rmi_r(
                        OperandSize::Size64,
                        AluRmiROpcode::And,
                        RegMemImm::reg(cst.to_reg()),
                        tmp1,
                    ));

                    // sub tmp1, tmp2
                    ctx.emit(Inst::alu_rmi_r(
                        OperandSize::Size64,
                        AluRmiROpcode::Sub,
                        RegMemImm::reg(tmp1.to_reg()),
                        tmp2,
                    ));

                    // mov tmp2, dst
                    ctx.emit(Inst::mov64_rm_r(RegMem::reg(tmp2.to_reg()), dst));

                    // shr $4, dst
                    ctx.emit(Inst::shift_r(
                        OperandSize::Size64,
                        ShiftKind::ShiftRightLogical,
                        Some(4),
                        dst,
                    ));

                    // add tmp2, dst
                    ctx.emit(Inst::alu_rmi_r(
                        OperandSize::Size64,
                        AluRmiROpcode::Add,
                        RegMemImm::reg(tmp2.to_reg()),
                        dst,
                    ));

                    // mov $0x0F0F_0F0F_0F0F_0F0F, cst
                    ctx.emit(Inst::imm(OperandSize::Size64, 0x0F0F0F0F0F0F0F0F, cst));

                    // and cst, dst
                    ctx.emit(Inst::alu_rmi_r(
                        OperandSize::Size64,
                        AluRmiROpcode::And,
                        RegMemImm::reg(cst.to_reg()),
                        dst,
                    ));

                    // mov $0x0101_0101_0101_0101, cst
                    ctx.emit(Inst::imm(OperandSize::Size64, 0x0101010101010101, cst));

                    // mul cst, dst
                    ctx.emit(Inst::alu_rmi_r(
                        OperandSize::Size64,
                        AluRmiROpcode::Mul,
                        RegMemImm::reg(cst.to_reg()),
                        dst,
                    ));

                    // shr $56, dst
                    ctx.emit(Inst::shift_r(
                        OperandSize::Size64,
                        ShiftKind::ShiftRightLogical,
                        Some(56),
                        dst,
                    ));
                } else {
                    assert_eq!(ty, types::I32);

                    let tmp1 = ctx.alloc_tmp(types::I64).only_reg().unwrap();
                    let tmp2 = ctx.alloc_tmp(types::I64).only_reg().unwrap();

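                    // Same SWAR scheme as the 64-bit case above, with 32-bit immediate
                    // masks and a final byte-summing shift of 24.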
                    // mov src, tmp1
                    ctx.emit(Inst::mov64_rm_r(src.clone(), tmp1));

                    // shr $1, tmp1
                    ctx.emit(Inst::shift_r(
                        OperandSize::Size32,
                        ShiftKind::ShiftRightLogical,
                        Some(1),
                        tmp1,
                    ));

                    // and $0x7777_7777, tmp1
                    ctx.emit(Inst::alu_rmi_r(
                        OperandSize::Size32,
                        AluRmiROpcode::And,
                        RegMemImm::imm(0x77777777),
                        tmp1,
                    ));

                    // mov src, tmp2
                    ctx.emit(Inst::mov64_rm_r(src, tmp2));

                    // sub tmp1, tmp2
                    ctx.emit(Inst::alu_rmi_r(
                        OperandSize::Size32,
                        AluRmiROpcode::Sub,
                        RegMemImm::reg(tmp1.to_reg()),
                        tmp2,
                    ));

                    // shr $1, tmp1
                    ctx.emit(Inst::shift_r(
                        OperandSize::Size32,
                        ShiftKind::ShiftRightLogical,
                        Some(1),
                        tmp1,
                    ));

                    // and $0x7777_7777, tmp1
                    ctx.emit(Inst::alu_rmi_r(
                        OperandSize::Size32,
                        AluRmiROpcode::And,
                        RegMemImm::imm(0x77777777),
                        tmp1,
                    ));

                    // sub tmp1, tmp2
                    ctx.emit(Inst::alu_rmi_r(
                        OperandSize::Size32,
                        AluRmiROpcode::Sub,
                        RegMemImm::reg(tmp1.to_reg()),
                        tmp2,
                    ));

                    // shr $1, tmp1
                    ctx.emit(Inst::shift_r(
                        OperandSize::Size32,
                        ShiftKind::ShiftRightLogical,
                        Some(1),
                        tmp1,
                    ));

                    // and $0x7777_7777, tmp1
                    ctx.emit(Inst::alu_rmi_r(
                        OperandSize::Size32,
                        AluRmiROpcode::And,
                        RegMemImm::imm(0x77777777),
                        tmp1,
                    ));

                    // sub tmp1, tmp2
                    ctx.emit(Inst::alu_rmi_r(
                        OperandSize::Size32,
                        AluRmiROpcode::Sub,
                        RegMemImm::reg(tmp1.to_reg()),
                        tmp2,
                    ));

                    // mov tmp2, dst
                    ctx.emit(Inst::mov64_rm_r(RegMem::reg(tmp2.to_reg()), dst));

                    // shr $4, dst
                    ctx.emit(Inst::shift_r(
                        OperandSize::Size32,
                        ShiftKind::ShiftRightLogical,
                        Some(4),
                        dst,
                    ));

                    // add tmp2, dst
                    ctx.emit(Inst::alu_rmi_r(
                        OperandSize::Size32,
                        AluRmiROpcode::Add,
                        RegMemImm::reg(tmp2.to_reg()),
                        dst,
                    ));

                    // and $0x0F0F_0F0F, dst
                    ctx.emit(Inst::alu_rmi_r(
                        OperandSize::Size32,
                        AluRmiROpcode::And,
                        RegMemImm::imm(0x0F0F0F0F),
                        dst,
                    ));

                    // mul $0x0101_0101, dst
                    ctx.emit(Inst::alu_rmi_r(
                        OperandSize::Size32,
                        AluRmiROpcode::Mul,
                        RegMemImm::imm(0x01010101),
                        dst,
                    ));

                    // shr $24, dst
                    ctx.emit(Inst::shift_r(
                        OperandSize::Size32,
                        ShiftKind::ShiftRightLogical,
                        Some(24),
                        dst,
                    ));
                }
            }

            if dsts.len() == 1 {
                let final_dst = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
                ctx.emit(Inst::gen_move(final_dst, dsts[0], types::I64));
            } else {
                assert!(dsts.len() == 2);
                let final_dst = get_output_reg(ctx, outputs[0]);
                ctx.emit(Inst::gen_move(final_dst.regs()[0], dsts[0], types::I64));
                ctx.emit(Inst::alu_rmi_r(
                    OperandSize::Size64,
                    AluRmiROpcode::Add,
                    RegMemImm::reg(dsts[1]),
                    final_dst.regs()[0],
                ));
                ctx.emit(Inst::alu_rmi_r(
                    OperandSize::Size64,
                    AluRmiROpcode::Xor,
                    RegMemImm::reg(final_dst.regs()[1].to_reg()),
                    final_dst.regs()[1],
                ));
            }
        }

        Opcode::Bitrev => {
            let ty = ctx.input_ty(insn, 0);
            assert!(
                ty == types::I8
                    || ty == types::I16
                    || ty == types::I32
                    || ty == types::I64
                    || ty == types::I128
            );

            if ty == types::I128 {
                let src = put_input_in_regs(ctx, inputs[0]);
                let dst = get_output_reg(ctx, outputs[0]);
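                // Reversing 128 bits swaps the two 64-bit halves: the reversed low
                // half becomes the new high half, and vice versa.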
                emit_bitrev(ctx, src.regs()[0], dst.regs()[1], types::I64);
                emit_bitrev(ctx, src.regs()[1], dst.regs()[0], types::I64);
            } else {
                let src = put_input_in_reg(ctx, inputs[0]);
                let dst = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
                emit_bitrev(ctx, src, dst, ty);
            }
        }

        Opcode::IsNull | Opcode::IsInvalid => {
            // Null references are represented by the constant value 0; invalid references are
            // represented by the constant value -1. See `define_reftypes()` in
            // `meta/src/isa/x86/encodings.rs` to confirm.
            let src = put_input_in_reg(ctx, inputs[0]);
            let dst = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
            let ty = ctx.input_ty(insn, 0);
            let imm = match op {
                Opcode::IsNull => {
                    // TODO could use tst src, src for IsNull
                    0
                }
                Opcode::IsInvalid => {
                    // We can do a 32-bit comparison even in 64-bit mode, as the constant is then
                    // sign-extended.
                    0xffffffff
                }
                _ => unreachable!(),
            };
            ctx.emit(Inst::cmp_rmi_r(
                OperandSize::from_ty(ty),
                RegMemImm::imm(imm),
                src,
            ));
            ctx.emit(Inst::setcc(CC::Z, dst));
        }

        Opcode::Uextend
        | Opcode::Sextend
        | Opcode::Bint
        | Opcode::Breduce
        | Opcode::Bextend
        | Opcode::Ireduce => {
            let src_ty = ctx.input_ty(insn, 0);
            let dst_ty = ctx.output_ty(insn, 0);

            if src_ty == types::I128 {
                assert!(dst_ty.bits() <= 64);
                assert!(op == Opcode::Ireduce);
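                // An `ireduce` from i128 only needs the value's low half: narrower
                // integers are stored with undefined high-order bits (see the note on
                // `Ireduce` below), so a single 64-bit move suffices.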
                let src = put_input_in_regs(ctx, inputs[0]);
                let dst = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
                ctx.emit(Inst::gen_move(dst, src.regs()[0], types::I64));
            } else if dst_ty == types::I128 {
                assert!(src_ty.bits() <= 64);
                let src = put_input_in_reg(ctx, inputs[0]);
                let dst = get_output_reg(ctx, outputs[0]);
                assert!(op == Opcode::Uextend || op == Opcode::Sextend || op == Opcode::Bint);

                // Extend to 64 bits first.
                let ext_mode = ExtMode::new(src_ty.bits(), /* dst bits = */ 64);
                if let Some(ext_mode) = ext_mode {
                    if op == Opcode::Sextend {
                        ctx.emit(Inst::movsx_rm_r(ext_mode, RegMem::reg(src), dst.regs()[0]));
                    } else {
                        ctx.emit(Inst::movzx_rm_r(ext_mode, RegMem::reg(src), dst.regs()[0]));
                    }
                } else {
                    ctx.emit(Inst::mov64_rm_r(RegMem::reg(src), dst.regs()[0]));
                }

                // Now generate the top 64 bits.
                if op == Opcode::Sextend {
                    // Sign-extend: move dst[0] into dst[1] and arithmetic-shift right by 63 bits
                    // to spread the sign bit across all bits.
                    ctx.emit(Inst::gen_move(
                        dst.regs()[1],
                        dst.regs()[0].to_reg(),
                        types::I64,
                    ));
                    ctx.emit(Inst::shift_r(
                        OperandSize::Size64,
                        ShiftKind::ShiftRightArithmetic,
                        Some(63),
                        dst.regs()[1],
                    ));
                } else {
                    // Zero-extend: just zero the top word.
                    ctx.emit(Inst::alu_rmi_r(
                        OperandSize::Size64,
                        AluRmiROpcode::Xor,
                        RegMemImm::reg(dst.regs()[1].to_reg()),
                        dst.regs()[1],
                    ));
                }
            } else {
                // Sextend requires a sign-extended move, but all the other opcodes are simply a move
                // from a zero-extended source. Here is why this works, in each case:
                //
                // - Bint: Bool-to-int. We always represent a bool as a 0 or 1, so we merely need to
                //   zero-extend here.
                //
                // - Breduce, Bextend: changing width of a boolean. We represent a bool as a 0 or 1, so
                //   again, this is a zero-extend / no-op.
                //
                // - Ireduce: changing width of an integer. Smaller ints are stored with undefined
                //   high-order bits, so we can simply do a copy.
                if src_ty == types::I32 && dst_ty == types::I64 && op != Opcode::Sextend {
                    // As a particular x64 extra-pattern matching opportunity: all the ALU opcodes
                    // on 32 bits already zero-extend the upper 32 bits, so in this case we don't
                    // even need to generate a zero-extending move.
                    // TODO add loads and shifts here.
                    if matches_input_any(
                        ctx,
                        inputs[0],
                        &[
                            Opcode::Iadd,
                            Opcode::IaddIfcout,
                            Opcode::Isub,
                            Opcode::Imul,
                            Opcode::Band,
                            Opcode::Bor,
                            Opcode::Bxor,
                        ],
                    )
                    .is_some()
                    {
                        let src = put_input_in_reg(ctx, inputs[0]);
                        let dst = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
                        ctx.emit(Inst::gen_move(dst, src, types::I64));
                        return Ok(());
                    }
                }

                let src = input_to_reg_mem(ctx, inputs[0]);
                let dst = get_output_reg(ctx, outputs[0]).only_reg().unwrap();

                let ext_mode = ExtMode::new(src_ty.bits(), dst_ty.bits());
                assert_eq!(
                    src_ty.bits() < dst_ty.bits(),
                    ext_mode.is_some(),
                    "unexpected extension: {} -> {}",
                    src_ty,
                    dst_ty
                );

                if let Some(ext_mode) = ext_mode {
                    if op == Opcode::Sextend {
                        ctx.emit(Inst::movsx_rm_r(ext_mode, src, dst));
                    } else {
                        ctx.emit(Inst::movzx_rm_r(ext_mode, src, dst));
                    }
                } else {
                    ctx.emit(Inst::mov64_rm_r(src, dst));
                }
            }
        }

        Opcode::Icmp => {
            let condcode = ctx.data(insn).cond_code().unwrap();
            let dst = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
            let ty = ctx.input_ty(insn, 0);
            if !ty.is_vector() {
                let condcode = emit_cmp(ctx, insn, condcode);
                let cc = CC::from_intcc(condcode);
                ctx.emit(Inst::setcc(cc, dst));
            } else {
                assert_eq!(ty.bits(), 128);
                let eq = |ty| match ty {
                    types::I8X16 => SseOpcode::Pcmpeqb,
                    types::I16X8 => SseOpcode::Pcmpeqw,
                    types::I32X4 => SseOpcode::Pcmpeqd,
                    types::I64X2 => SseOpcode::Pcmpeqq,
                    _ => panic!(
                        "Unable to find an instruction for {} for type: {}",
                        condcode, ty
                    ),
                };
                let gt = |ty| match ty {
                    types::I8X16 => SseOpcode::Pcmpgtb,
                    types::I16X8 => SseOpcode::Pcmpgtw,
                    types::I32X4 => SseOpcode::Pcmpgtd,
                    types::I64X2 => SseOpcode::Pcmpgtq,
                    _ => panic!(
                        "Unable to find an instruction for {} for type: {}",
                        condcode, ty
                    ),
                };
                let maxu = |ty| match ty {
                    types::I8X16 => SseOpcode::Pmaxub,
                    types::I16X8 => SseOpcode::Pmaxuw,
                    types::I32X4 => SseOpcode::Pmaxud,
                    _ => panic!(
                        "Unable to find an instruction for {} for type: {}",
                        condcode, ty
                    ),
                };
                let mins = |ty| match ty {
                    types::I8X16 => SseOpcode::Pminsb,
                    types::I16X8 => SseOpcode::Pminsw,
                    types::I32X4 => SseOpcode::Pminsd,
                    _ => panic!(
                        "Unable to find an instruction for {} for type: {}",
                        condcode, ty
                    ),
                };
                let minu = |ty| match ty {
                    types::I8X16 => SseOpcode::Pminub,
                    types::I16X8 => SseOpcode::Pminuw,
                    types::I32X4 => SseOpcode::Pminud,
                    _ => panic!(
                        "Unable to find an instruction for {} for type: {}",
                        condcode, ty
                    ),
                };

                // Here we decide which operand to use as the read/write `dst` (ModRM reg field)
                // and which to use as the read `input` (ModRM r/m field). In the normal case we
                // use Cranelift's first operand, the `lhs`, as `dst` but we flip the operands for
                // the less-than cases so that we can reuse the greater-than implementation.
                //
                // In a surprising twist, the operands for i64x2 `gte`/`sle` must also be flipped
                // from the normal order because of the special-case lowering for these
                // instructions (i.e. we use PCMPGTQ with flipped operands and negate the result).
                let input = match condcode {
                    IntCC::SignedLessThanOrEqual if ty == types::I64X2 => {
                        let lhs = put_input_in_reg(ctx, inputs[0]);
                        let rhs = input_to_reg_mem(ctx, inputs[1]);
                        ctx.emit(Inst::gen_move(dst, lhs, ty));
                        rhs
                    }
                    IntCC::SignedGreaterThanOrEqual if ty == types::I64X2 => {
                        let lhs = input_to_reg_mem(ctx, inputs[0]);
                        let rhs = put_input_in_reg(ctx, inputs[1]);
                        ctx.emit(Inst::gen_move(dst, rhs, ty));
                        lhs
                    }
                    IntCC::SignedLessThan
                    | IntCC::SignedLessThanOrEqual
                    | IntCC::UnsignedLessThan
                    | IntCC::UnsignedLessThanOrEqual => {
                        let lhs = input_to_reg_mem(ctx, inputs[0]);
                        let rhs = put_input_in_reg(ctx, inputs[1]);
                        ctx.emit(Inst::gen_move(dst, rhs, ty));
                        lhs
                    }
                    _ => {
                        let lhs = put_input_in_reg(ctx, inputs[0]);
                        let rhs = input_to_reg_mem(ctx, inputs[1]);
                        ctx.emit(Inst::gen_move(dst, lhs, ty));
                        rhs
                    }
                };

                match condcode {
                    IntCC::Equal => ctx.emit(Inst::xmm_rm_r(eq(ty), input, dst)),
                    IntCC::NotEqual => {
                        ctx.emit(Inst::xmm_rm_r(eq(ty), input, dst));
                        // Emit all 1s into the `tmp` register.
                        let tmp = ctx.alloc_tmp(ty).only_reg().unwrap();
                        ctx.emit(Inst::xmm_rm_r(eq(ty), RegMem::from(tmp), tmp));
                        // Invert the result of the `PCMPEQ*`.
                        ctx.emit(Inst::xmm_rm_r(SseOpcode::Pxor, RegMem::from(tmp), dst));
                    }
                    IntCC::SignedGreaterThan | IntCC::SignedLessThan => {
                        ctx.emit(Inst::xmm_rm_r(gt(ty), input, dst))
                    }
                    IntCC::SignedGreaterThanOrEqual | IntCC::SignedLessThanOrEqual
                        if ty != types::I64X2 =>
                    {
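                        // Use the identity `min(x, y) == y` exactly when `y <= x`: take
                        // the element-wise minimum of `dst` and `input`, then compare
                        // it for equality against `input`.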
                        ctx.emit(Inst::xmm_rm_r(mins(ty), input.clone(), dst));
                        ctx.emit(Inst::xmm_rm_r(eq(ty), input, dst))
                    }
                    IntCC::SignedGreaterThanOrEqual | IntCC::SignedLessThanOrEqual
                        if ty == types::I64X2 =>
                    {
                        // The 64x2 `PMINSQ` instruction is only available with AVX512VL/F, so we
                        // must instead compare with flipped operands and negate the result
                        // (emitting one more instruction).
                        ctx.emit(Inst::xmm_rm_r(gt(ty), input, dst));
                        // Emit all 1s into the `tmp` register.
                        let tmp = ctx.alloc_tmp(ty).only_reg().unwrap();
                        ctx.emit(Inst::xmm_rm_r(eq(ty), RegMem::from(tmp), tmp));
                        // Invert the result of the `PCMPGT*`.
                        ctx.emit(Inst::xmm_rm_r(SseOpcode::Pxor, RegMem::from(tmp), dst));
                    }
                    IntCC::UnsignedGreaterThan | IntCC::UnsignedLessThan => {
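                        // SSE has no packed unsigned strict comparison, so compute
                        // `max(dst, input) == input` (an unsigned `dst <= input`) and
                        // then invert the result.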
                        ctx.emit(Inst::xmm_rm_r(maxu(ty), input.clone(), dst));
                        ctx.emit(Inst::xmm_rm_r(eq(ty), input, dst));
                        // Emit all 1s into the `tmp` register.
                        let tmp = ctx.alloc_tmp(ty).only_reg().unwrap();
                        ctx.emit(Inst::xmm_rm_r(eq(ty), RegMem::from(tmp), tmp));
                        // Invert the result of the `PCMPEQ*`.
                        ctx.emit(Inst::xmm_rm_r(SseOpcode::Pxor, RegMem::from(tmp), dst));
                    }
                    IntCC::UnsignedGreaterThanOrEqual | IntCC::UnsignedLessThanOrEqual => {
                        ctx.emit(Inst::xmm_rm_r(minu(ty), input.clone(), dst));
                        ctx.emit(Inst::xmm_rm_r(eq(ty), input, dst))
                    }
                    _ => unimplemented!("Unimplemented comparison code for icmp: {}", condcode),
                }
            }
        }

        Opcode::Fcmp => {
            let cond_code = ctx.data(insn).fp_cond_code().unwrap();
            let input_ty = ctx.input_ty(insn, 0);
            if !input_ty.is_vector() {
                // Unordered is returned by setting ZF, PF, CF <- 111
                // Greater than by ZF, PF, CF <- 000
                // Less than by ZF, PF, CF <- 001
                // Equal by ZF, PF, CF <- 100
                //
                // Checking the result of comiss is somewhat annoying because you don't have setcc
                // instructions that explicitly check simultaneously for the condition (i.e. eq,
                // le, gt, etc) *and* orderedness.
                //
                // So that might mean we need more than one setcc check and then a logical "and" or
                // "or" to determine both, in some cases. However, knowing that a set parity bit
                // means the result was unordered, and that an unordered result also sets both ZF
                // and CF, we can get away with a single setcc for most condition codes.
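                // For example, `FloatCC::Equal` cannot use a single setcc: ZF is also
                // set for unordered inputs, so equality must additionally check PF=0,
                // which is what the `AndConditions` case below handles.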

                let dst = get_output_reg(ctx, outputs[0]).only_reg().unwrap();

                match emit_fcmp(ctx, insn, cond_code, FcmpSpec::Normal) {
                    FcmpCondResult::Condition(cc) => {
                        ctx.emit(Inst::setcc(cc, dst));
                    }
                    FcmpCondResult::AndConditions(cc1, cc2) => {
                        let tmp = ctx.alloc_tmp(types::I32).only_reg().unwrap();
                        ctx.emit(Inst::setcc(cc1, tmp));
                        ctx.emit(Inst::setcc(cc2, dst));
                        ctx.emit(Inst::alu_rmi_r(
                            OperandSize::Size32,
                            AluRmiROpcode::And,
                            RegMemImm::reg(tmp.to_reg()),
                            dst,
                        ));
                    }
                    FcmpCondResult::OrConditions(cc1, cc2) => {
                        let tmp = ctx.alloc_tmp(types::I32).only_reg().unwrap();
                        ctx.emit(Inst::setcc(cc1, tmp));
                        ctx.emit(Inst::setcc(cc2, dst));
                        ctx.emit(Inst::alu_rmi_r(
                            OperandSize::Size32,
                            AluRmiROpcode::Or,
                            RegMemImm::reg(tmp.to_reg()),
                            dst,
                        ));
                    }
                    FcmpCondResult::InvertedEqualOrConditions(_, _) => unreachable!(),
                }
3443 } else {
3444 let op = match input_ty {
3445 types::F32X4 => SseOpcode::Cmpps,
3446 types::F64X2 => SseOpcode::Cmppd,
3447 _ => panic!("Bad input type to fcmp: {}", input_ty),
3448 };
3449
3450 // Since some packed comparisons are not available, some of the condition codes
3451 // must be inverted, with a corresponding `flip` of the operands.
3452 let (imm, flip) = match cond_code {
3453 FloatCC::GreaterThan => (FcmpImm::LessThan, true),
3454 FloatCC::GreaterThanOrEqual => (FcmpImm::LessThanOrEqual, true),
3455 FloatCC::UnorderedOrLessThan => (FcmpImm::UnorderedOrGreaterThan, true),
3456 FloatCC::UnorderedOrLessThanOrEqual => {
3457 (FcmpImm::UnorderedOrGreaterThanOrEqual, true)
3458 }
3459 FloatCC::OrderedNotEqual | FloatCC::UnorderedOrEqual => {
3460 panic!("unsupported float condition code: {}", cond_code)
3461 }
3462 _ => (FcmpImm::from(cond_code), false),
3463 };
3464
3465 // Determine the operands of the comparison, possibly by flipping them.
3466 let (lhs, rhs) = if flip {
3467 (
3468 put_input_in_reg(ctx, inputs[1]),
3469 input_to_reg_mem(ctx, inputs[0]),
3470 )
3471 } else {
3472 (
3473 put_input_in_reg(ctx, inputs[0]),
3474 input_to_reg_mem(ctx, inputs[1]),
3475 )
3476 };
3477
3478 // Move the `lhs` to the same register as `dst`; this may not emit an actual move
3479 // but ensures that the registers are the same to match x86's read-write operand
3480 // encoding.
3481 let dst = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
3482 ctx.emit(Inst::gen_move(dst, lhs, input_ty));
3483
3484 // Emit the comparison.
3485 ctx.emit(Inst::xmm_rm_r_imm(
3486 op,
3487 rhs,
3488 dst,
3489 imm.encode(),
3490 OperandSize::Size32,
3491 ));
3492 }
3493 }
3494
3495 Opcode::FallthroughReturn | Opcode::Return => {
3496 for i in 0..ctx.num_inputs(insn) {
3497 let src_reg = put_input_in_regs(ctx, inputs[i]);
3498 let retval_reg = ctx.retval(i);
3499 let ty = ctx.input_ty(insn, i);
3500 assert!(src_reg.len() == retval_reg.len());
3501 let (_, tys) = Inst::rc_for_type(ty)?;
3502 for ((&src, &dst), &ty) in src_reg
3503 .regs()
3504 .iter()
3505 .zip(retval_reg.regs().iter())
3506 .zip(tys.iter())
3507 {
3508 ctx.emit(Inst::gen_move(dst, src, ty));
3509 }
3510 }
3511 // N.B.: the Ret itself is generated by the ABI.
3512 }
3513
3514 Opcode::Call | Opcode::CallIndirect => {
3515 let caller_conv = ctx.abi().call_conv();
3516 let (mut abi, inputs) = match op {
3517 Opcode::Call => {
3518 let (extname, dist) = ctx.call_target(insn).unwrap();
3519 let sig = ctx.call_sig(insn).unwrap();
3520 assert_eq!(inputs.len(), sig.params.len());
3521 assert_eq!(outputs.len(), sig.returns.len());
3522 (
3523 X64ABICaller::from_func(sig, &extname, dist, caller_conv, flags)?,
3524 &inputs[..],
3525 )
3526 }
3527
3528 Opcode::CallIndirect => {
3529 let ptr = put_input_in_reg(ctx, inputs[0]);
3530 let sig = ctx.call_sig(insn).unwrap();
3531 assert_eq!(inputs.len() - 1, sig.params.len());
3532 assert_eq!(outputs.len(), sig.returns.len());
3533 (
3534 X64ABICaller::from_ptr(sig, ptr, op, caller_conv, flags)?,
3535 &inputs[1..],
3536 )
3537 }
3538
3539 _ => unreachable!(),
3540 };
3541
3542 abi.emit_stack_pre_adjust(ctx);
3543 assert_eq!(inputs.len(), abi.num_args());
3544 for i in abi.get_copy_to_arg_order() {
3545 let input = inputs[i];
3546 let arg_regs = put_input_in_regs(ctx, input);
3547 abi.emit_copy_regs_to_arg(ctx, i, arg_regs);
3548 }
3549 abi.emit_call(ctx);
3550 for (i, output) in outputs.iter().enumerate() {
3551 let retval_regs = get_output_reg(ctx, *output);
3552 abi.emit_copy_retval_to_regs(ctx, i, retval_regs);
3553 }
3554 abi.emit_stack_post_adjust(ctx);
3555 }
3556
3557 Opcode::Debugtrap => {
3558 ctx.emit(Inst::Hlt);
3559 }
3560
3561 Opcode::Trap | Opcode::ResumableTrap => {
3562 let trap_code = ctx.data(insn).trap_code().unwrap();
3563 ctx.emit_safepoint(Inst::Ud2 { trap_code });
3564 }
3565
3566 Opcode::Trapif | Opcode::Trapff => {
3567 let trap_code = ctx.data(insn).trap_code().unwrap();
3568
3569 if matches_input(ctx, inputs[0], Opcode::IaddIfcout).is_some() {
3570 let cond_code = ctx.data(insn).cond_code().unwrap();
3571 // The flags must not have been clobbered by any other instruction between the
3572 // iadd_ifcout and this instruction, as verified by the CLIF validator; so we can
3573 // simply use the flags here.
3574 let cc = CC::from_intcc(cond_code);
3575
3576 ctx.emit_safepoint(Inst::TrapIf { trap_code, cc });
3577 } else if op == Opcode::Trapif {
3578 let cond_code = ctx.data(insn).cond_code().unwrap();
3579
3580 // Verification ensures that the input is always a single-def ifcmp.
3581 let ifcmp = matches_input(ctx, inputs[0], Opcode::Ifcmp).unwrap();
3582 let cond_code = emit_cmp(ctx, ifcmp, cond_code);
3583 let cc = CC::from_intcc(cond_code);
3584
3585 ctx.emit_safepoint(Inst::TrapIf { trap_code, cc });
3586 } else {
3587 let cond_code = ctx.data(insn).fp_cond_code().unwrap();
3588
3589 // Verification ensures that the input is always a single-def ffcmp.
3590 let ffcmp = matches_input(ctx, inputs[0], Opcode::Ffcmp).unwrap();
3591
3592 match emit_fcmp(ctx, ffcmp, cond_code, FcmpSpec::Normal) {
3593 FcmpCondResult::Condition(cc) => {
3594 ctx.emit_safepoint(Inst::TrapIf { trap_code, cc })
3595 }
3596 FcmpCondResult::AndConditions(cc1, cc2) => {
3597 // A bit unfortunate, but materialize the flags in their own register, and
3598 // check against this.
3599 let tmp = ctx.alloc_tmp(types::I32).only_reg().unwrap();
3600 let tmp2 = ctx.alloc_tmp(types::I32).only_reg().unwrap();
3601 ctx.emit(Inst::setcc(cc1, tmp));
3602 ctx.emit(Inst::setcc(cc2, tmp2));
3603 ctx.emit(Inst::alu_rmi_r(
3604 OperandSize::Size32,
3605 AluRmiROpcode::And,
3606 RegMemImm::reg(tmp.to_reg()),
3607 tmp2,
3608 ));
3609 ctx.emit_safepoint(Inst::TrapIf {
3610 trap_code,
3611 cc: CC::NZ,
3612 });
3613 }
3614 FcmpCondResult::OrConditions(cc1, cc2) => {
3615 ctx.emit_safepoint(Inst::TrapIf { trap_code, cc: cc1 });
3616 ctx.emit_safepoint(Inst::TrapIf { trap_code, cc: cc2 });
3617 }
3618 FcmpCondResult::InvertedEqualOrConditions(_, _) => unreachable!(),
3619 };
3620 };
3621 }
3622
3623 Opcode::F64const => {
3624 // TODO use cmpeqpd for all 1s.
3625 let value = ctx.get_constant(insn).unwrap();
3626 let dst = get_output_reg(ctx, outputs[0]);
3627 for inst in Inst::gen_constant(dst, value as u128, types::F64, |ty| {
3628 ctx.alloc_tmp(ty).only_reg().unwrap()
3629 }) {
3630 ctx.emit(inst);
3631 }
3632 }
3633
3634 Opcode::F32const => {
3635 // TODO use cmpeqps for all 1s.
3636 let value = ctx.get_constant(insn).unwrap();
3637 let dst = get_output_reg(ctx, outputs[0]);
3638 for inst in Inst::gen_constant(dst, value as u128, types::F32, |ty| {
3639 ctx.alloc_tmp(ty).only_reg().unwrap()
3640 }) {
3641 ctx.emit(inst);
3642 }
3643 }
3644
3645 Opcode::WideningPairwiseDotProductS => {
3646 let lhs = put_input_in_reg(ctx, inputs[0]);
3647 let rhs = input_to_reg_mem(ctx, inputs[1]);
3648 let dst = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
3649 let ty = ty.unwrap();
3650
3651 ctx.emit(Inst::gen_move(dst, lhs, ty));
3652
3653 if ty == types::I32X4 {
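// `pmaddwd` multiplies the corresponding signed 16-bit lanes, then adds each
// adjacent pair of 32-bit products, which is exactly the widening pairwise
// dot product.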
3654 ctx.emit(Inst::xmm_rm_r(SseOpcode::Pmaddwd, rhs, dst));
3655 } else {
3656 panic!(
3657 "Opcode::WideningPairwiseDotProductS: unsupported laneage: {:?}",
3658 ty
3659 );
3660 }
3661 }
3662
3663 Opcode::Fadd | Opcode::Fsub | Opcode::Fmul | Opcode::Fdiv => {
3664 let lhs = put_input_in_reg(ctx, inputs[0]);
3665 // We can't guarantee the RHS (if a load) is 128-bit aligned, so we
3666 // must avoid merging a load here.
3667 let rhs = RegMem::reg(put_input_in_reg(ctx, inputs[1]));
3668 let dst = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
3669 let ty = ty.unwrap();
3670
3671 // Move the `lhs` to the same register as `dst`; this may not emit an actual move
3672 // but ensures that the registers are the same to match x86's read-write operand
3673 // encoding.
3674 ctx.emit(Inst::gen_move(dst, lhs, ty));
3675
3676 // Note: min and max can't be handled here, because of the way Cranelift defines them:
3677 // if any operand is a NaN, they must return the NaN operand, while the x86 machine
3678 // instruction will return the second operand if either operand is a NaN.
3679 let sse_op = match ty {
3680 types::F32 => match op {
3681 Opcode::Fadd => SseOpcode::Addss,
3682 Opcode::Fsub => SseOpcode::Subss,
3683 Opcode::Fmul => SseOpcode::Mulss,
3684 Opcode::Fdiv => SseOpcode::Divss,
3685 _ => unreachable!(),
3686 },
3687 types::F64 => match op {
3688 Opcode::Fadd => SseOpcode::Addsd,
3689 Opcode::Fsub => SseOpcode::Subsd,
3690 Opcode::Fmul => SseOpcode::Mulsd,
3691 Opcode::Fdiv => SseOpcode::Divsd,
3692 _ => unreachable!(),
3693 },
3694 types::F32X4 => match op {
3695 Opcode::Fadd => SseOpcode::Addps,
3696 Opcode::Fsub => SseOpcode::Subps,
3697 Opcode::Fmul => SseOpcode::Mulps,
3698 Opcode::Fdiv => SseOpcode::Divps,
3699 _ => unreachable!(),
3700 },
3701 types::F64X2 => match op {
3702 Opcode::Fadd => SseOpcode::Addpd,
3703 Opcode::Fsub => SseOpcode::Subpd,
3704 Opcode::Fmul => SseOpcode::Mulpd,
3705 Opcode::Fdiv => SseOpcode::Divpd,
3706 _ => unreachable!(),
3707 },
3708 _ => panic!(
3709 "invalid type: expected one of [F32, F64, F32X4, F64X2], found {}",
3710 ty
3711 ),
3712 };
3713 ctx.emit(Inst::xmm_rm_r(sse_op, rhs, dst));
3714 }
3715
3716 Opcode::Fmin | Opcode::Fmax => {
3717 let lhs = put_input_in_reg(ctx, inputs[0]);
3718 let rhs = put_input_in_reg(ctx, inputs[1]);
3719 let dst = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
3720 let is_min = op == Opcode::Fmin;
3721 let output_ty = ty.unwrap();
3722 ctx.emit(Inst::gen_move(dst, rhs, output_ty));
3723 if !output_ty.is_vector() {
3724 let op_size = match output_ty {
3725 types::F32 => OperandSize::Size32,
3726 types::F64 => OperandSize::Size64,
3727 _ => panic!("unexpected type {:?} for fmin/fmax", output_ty),
3728 };
3729 ctx.emit(Inst::xmm_min_max_seq(op_size, is_min, lhs, dst));
3730 } else {
3731 // X64's implementation of floating point min and floating point max does not
3732 // propagate NaNs and +0's in a way that is friendly to the SIMD spec. For the
3733 // scalar approach we use jumps to handle cases where NaN and +0 propagation is
3734 // not consistent with what is needed. However for packed floating point min and
3735 // floating point max we implement a different approach to avoid the sequence
3736 // of jumps that would be required on a per lane basis. Because we do not need to
3737 // lower labels and jumps but do need ctx for creating temporaries we implement
3738 // the lowering here in lower.rs instead of emit.rs as is done in the case for scalars.
// The outline of the approach is as follows:
//
// First we perform the Min/Max in both directions. This is because if a lane of
// either operand contains a NaN, or if the corresponding lanes of the two operands
// contain 0s with mismatched signs, x64 returns the second operand regardless of
// its contents. So in order to capture NaNs and normalize NaNs and 0 values, we
// perform the operation in both directions and merge the results. Then we
// normalize the results through operations that create a mask for the lanes
// containing NaNs; we use that mask to adjust NaNs to quiet NaNs and to
// normalize 0s.
//
// The following sequence is generated for min:
//
// movap{s,d} %lhs, %tmp
// minp{s,d} %dst, %tmp
// minp{s,d} %lhs, %dst
// orp{s,d} %dst, %tmp
// cmpp{s,d} %tmp, %dst, $3
// orp{s,d} %dst, %tmp
// psrl{d,q} {$10, $13}, %dst
// andnp{s,d} %tmp, %dst
//
// and for max the sequence is:
//
// movap{s,d} %lhs, %tmp
// maxp{s,d} %dst, %tmp
// maxp{s,d} %lhs, %dst
// xorp{s,d} %tmp, %dst
// orp{s,d} %dst, %tmp
// subp{s,d} %dst, %tmp
// cmpp{s,d} %tmp, %dst, $3
// psrl{d,q} {$10, $13}, %dst
// andnp{s,d} %tmp, %dst
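// As an illustrative example for min: with lanes lhs = +0.0 and rhs = -0.0,
// minps returns its second operand in both directions (+0.0 one way, -0.0 the
// other), and OR-ing the two results keeps the sign bit, yielding the -0.0
// that Cranelift's fmin semantics require.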
3772
3773 if is_min {
3774 let (mov_op, min_op, or_op, cmp_op, shift_op, shift_by, andn_op) =
3775 match output_ty {
3776 types::F32X4 => (
3777 SseOpcode::Movaps,
3778 SseOpcode::Minps,
3779 SseOpcode::Orps,
3780 SseOpcode::Cmpps,
3781 SseOpcode::Psrld,
3782 10,
3783 SseOpcode::Andnps,
3784 ),
3785 types::F64X2 => (
3786 SseOpcode::Movapd,
3787 SseOpcode::Minpd,
3788 SseOpcode::Orpd,
3789 SseOpcode::Cmppd,
3790 SseOpcode::Psrlq,
3791 13,
3792 SseOpcode::Andnpd,
3793 ),
3794 _ => unimplemented!("unsupported op type {:?}", output_ty),
3795 };
3796
3797 // Copy lhs into tmp
3798 let tmp_xmm1 = ctx.alloc_tmp(output_ty).only_reg().unwrap();
3799 ctx.emit(Inst::xmm_mov(mov_op, RegMem::reg(lhs), tmp_xmm1));
3800
3801 // Perform min in reverse direction
3802 ctx.emit(Inst::xmm_rm_r(min_op, RegMem::from(dst), tmp_xmm1));
3803
3804 // Perform min in original direction
3805 ctx.emit(Inst::xmm_rm_r(min_op, RegMem::reg(lhs), dst));
3806
// X64 handles propagation of -0's and NaNs differently between left and right
// operands. After doing the min in both directions, this OR guarantees
// that -0's and NaNs are captured in our tmp register.
3810 ctx.emit(Inst::xmm_rm_r(or_op, RegMem::from(dst), tmp_xmm1));
3811
3812 // Compare unordered to create mask for lanes containing NaNs and then use
3813 // that mask to saturate the NaN containing lanes in the tmp register with 1s.
3814 // TODO: Would a check for NaN and then a jump be better here in the
3815 // common case than continuing on to normalize NaNs that might not exist?
3816 let cond = FcmpImm::from(FloatCC::Unordered);
3817 ctx.emit(Inst::xmm_rm_r_imm(
3818 cmp_op,
3819 RegMem::reg(tmp_xmm1.to_reg()),
3820 dst,
3821 cond.encode(),
3822 OperandSize::Size32,
3823 ));
3824 ctx.emit(Inst::xmm_rm_r(or_op, RegMem::reg(dst.to_reg()), tmp_xmm1));
3825
3826 // The dst register holds a mask for lanes containing NaNs.
3827 // We take that mask and shift in preparation for creating a different mask
// to normalize NaNs (create a quiet NaN) by zeroing out the appropriate
// number of least significant bits. We shift right each lane by 10 bits
// (1 sign + 8 exp. + 1 MSB sig.) for F32X4 and by 13 bits (1 sign +
// 11 exp. + 1 MSB sig.) for F64X2.
3832 ctx.emit(Inst::xmm_rmi_reg(shift_op, RegMemImm::imm(shift_by), dst));
3833
3834 // Finally we do a nand with the tmp register to produce the final results
3835 // in the dst.
3836 ctx.emit(Inst::xmm_rm_r(andn_op, RegMem::reg(tmp_xmm1.to_reg()), dst));
3837 } else {
3838 let (
3839 mov_op,
3840 max_op,
3841 xor_op,
3842 or_op,
3843 sub_op,
3844 cmp_op,
3845 shift_op,
3846 shift_by,
3847 andn_op,
3848 ) = match output_ty {
3849 types::F32X4 => (
3850 SseOpcode::Movaps,
3851 SseOpcode::Maxps,
3852 SseOpcode::Xorps,
3853 SseOpcode::Orps,
3854 SseOpcode::Subps,
3855 SseOpcode::Cmpps,
3856 SseOpcode::Psrld,
3857 10,
3858 SseOpcode::Andnps,
3859 ),
3860 types::F64X2 => (
3861 SseOpcode::Movapd,
3862 SseOpcode::Maxpd,
3863 SseOpcode::Xorpd,
3864 SseOpcode::Orpd,
3865 SseOpcode::Subpd,
3866 SseOpcode::Cmppd,
3867 SseOpcode::Psrlq,
3868 13,
3869 SseOpcode::Andnpd,
3870 ),
3871 _ => unimplemented!("unsupported op type {:?}", output_ty),
3872 };
3873
3874 // Copy lhs into tmp.
3875 let tmp_xmm1 = ctx.alloc_tmp(types::F32).only_reg().unwrap();
3876 ctx.emit(Inst::xmm_mov(mov_op, RegMem::reg(lhs), tmp_xmm1));
3877
3878 // Perform max in reverse direction.
3879 ctx.emit(Inst::xmm_rm_r(max_op, RegMem::reg(dst.to_reg()), tmp_xmm1));
3880
3881 // Perform max in original direction.
3882 ctx.emit(Inst::xmm_rm_r(max_op, RegMem::reg(lhs), dst));
3883
3884 // Get the difference between the two results and store in tmp.
3885 // Max uses a different approach than min to account for potential
3886 // discrepancies with plus/minus 0.
3887 ctx.emit(Inst::xmm_rm_r(xor_op, RegMem::reg(tmp_xmm1.to_reg()), dst));
3888
// X64 handles propagation of -0's and NaNs differently between left and right
// operands. After doing the max in both directions, this OR guarantees
// that 0's and NaNs are captured in our tmp register.
3892 ctx.emit(Inst::xmm_rm_r(or_op, RegMem::reg(dst.to_reg()), tmp_xmm1));
3893
3894 // Capture NaNs and sign discrepancies.
3895 ctx.emit(Inst::xmm_rm_r(sub_op, RegMem::reg(dst.to_reg()), tmp_xmm1));
3896
3897 // Compare unordered to create mask for lanes containing NaNs and then use
3898 // that mask to saturate the NaN containing lanes in the tmp register with 1s.
3899 let cond = FcmpImm::from(FloatCC::Unordered);
3900 ctx.emit(Inst::xmm_rm_r_imm(
3901 cmp_op,
3902 RegMem::reg(tmp_xmm1.to_reg()),
3903 dst,
3904 cond.encode(),
3905 OperandSize::Size32,
3906 ));
3907
3908 // The dst register holds a mask for lanes containing NaNs.
3909 // We take that mask and shift in preparation for creating a different mask
// to normalize NaNs (create a quiet NaN) by zeroing out the appropriate
// number of least significant bits. We shift right each lane by 10 bits
3912 // (1 sign + 8 exp. + 1 MSB sig.) for F32X4 and by 13 bits (1 sign +
3913 // 11 exp. + 1 MSB sig.) for F64X2.
3914 ctx.emit(Inst::xmm_rmi_reg(shift_op, RegMemImm::imm(shift_by), dst));
3915
3916 // Finally we do a nand with the tmp register to produce the final results
3917 // in the dst.
3918 ctx.emit(Inst::xmm_rm_r(andn_op, RegMem::reg(tmp_xmm1.to_reg()), dst));
3919 }
3920 }
3921 }
3922
3923 Opcode::FminPseudo | Opcode::FmaxPseudo => {
3924 // We can't guarantee the RHS (if a load) is 128-bit aligned, so we
3925 // must avoid merging a load here.
3926 let lhs = RegMem::reg(put_input_in_reg(ctx, inputs[0]));
3927 let rhs = put_input_in_reg(ctx, inputs[1]);
3928 let dst = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
3929 let ty = ty.unwrap();
3930 ctx.emit(Inst::gen_move(dst, rhs, ty));
3931 let sse_opcode = match (ty, op) {
3932 (types::F32X4, Opcode::FminPseudo) => SseOpcode::Minps,
3933 (types::F32X4, Opcode::FmaxPseudo) => SseOpcode::Maxps,
3934 (types::F64X2, Opcode::FminPseudo) => SseOpcode::Minpd,
3935 (types::F64X2, Opcode::FmaxPseudo) => SseOpcode::Maxpd,
3936 _ => unimplemented!("unsupported type {} for {}", ty, op),
3937 };
3938 ctx.emit(Inst::xmm_rm_r(sse_opcode, lhs, dst));
3939 }
3940
3941 Opcode::Sqrt => {
3942 // We can't guarantee the RHS (if a load) is 128-bit aligned, so we
3943 // must avoid merging a load here.
3944 let src = RegMem::reg(put_input_in_reg(ctx, inputs[0]));
3945 let dst = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
3946 let ty = ty.unwrap();
3947
3948 let sse_op = match ty {
3949 types::F32 => SseOpcode::Sqrtss,
3950 types::F64 => SseOpcode::Sqrtsd,
3951 types::F32X4 => SseOpcode::Sqrtps,
3952 types::F64X2 => SseOpcode::Sqrtpd,
3953 _ => panic!(
3954 "invalid type: expected one of [F32, F64, F32X4, F64X2], found {}",
3955 ty
3956 ),
3957 };
3958
3959 ctx.emit(Inst::xmm_unary_rm_r(sse_op, src, dst));
3960 }
3961
3962 Opcode::Fpromote => {
3963 // We can't guarantee the RHS (if a load) is 128-bit aligned, so we
3964 // must avoid merging a load here.
3965 let src = RegMem::reg(put_input_in_reg(ctx, inputs[0]));
3966 let dst = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
3967 ctx.emit(Inst::xmm_unary_rm_r(SseOpcode::Cvtss2sd, src, dst));
3968 }
3969
3970 Opcode::Fdemote => {
3971 // We can't guarantee the RHS (if a load) is 128-bit aligned, so we
3972 // must avoid merging a load here.
3973 let src = RegMem::reg(put_input_in_reg(ctx, inputs[0]));
3974 let dst = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
3975 ctx.emit(Inst::xmm_unary_rm_r(SseOpcode::Cvtsd2ss, src, dst));
3976 }
3977
3978 Opcode::FcvtFromSint => {
3979 let output_ty = ty.unwrap();
3980 if !output_ty.is_vector() {
3981 let (ext_spec, src_size) = match ctx.input_ty(insn, 0) {
3982 types::I8 | types::I16 => (Some(ExtSpec::SignExtendTo32), OperandSize::Size32),
3983 types::I32 => (None, OperandSize::Size32),
3984 types::I64 => (None, OperandSize::Size64),
3985 _ => unreachable!(),
3986 };
3987
3988 let src = match ext_spec {
3989 Some(ext_spec) => RegMem::reg(extend_input_to_reg(ctx, inputs[0], ext_spec)),
3990 None => RegMem::reg(put_input_in_reg(ctx, inputs[0])),
3991 };
3992
3993 let opcode = if output_ty == types::F32 {
3994 SseOpcode::Cvtsi2ss
3995 } else {
3996 assert_eq!(output_ty, types::F64);
3997 SseOpcode::Cvtsi2sd
3998 };
3999 let dst = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
4000 ctx.emit(Inst::gpr_to_xmm(opcode, src, src_size, dst));
4001 } else {
4002 let ty = ty.unwrap();
4003 let src = put_input_in_reg(ctx, inputs[0]);
4004 let dst = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
4005 let opcode = match ctx.input_ty(insn, 0) {
4006 types::I32X4 => SseOpcode::Cvtdq2ps,
4007 _ => {
4008 unimplemented!("unable to use type {} for op {}", ctx.input_ty(insn, 0), op)
4009 }
4010 };
4011 ctx.emit(Inst::gen_move(dst, src, ty));
4012 ctx.emit(Inst::xmm_rm_r(opcode, RegMem::from(dst), dst));
4013 }
4014 }
4015 Opcode::FcvtLowFromSint => {
let src = RegMem::reg(put_input_in_reg(ctx, inputs[0]));
let dst = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
ctx.emit(Inst::xmm_unary_rm_r(SseOpcode::Cvtdq2pd, src, dst));
4023 }
4024 Opcode::FcvtFromUint => {
4025 let dst = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
4026 let ty = ty.unwrap();
4027
4028 let input_ty = ctx.input_ty(insn, 0);
4029 if !ty.is_vector() {
4030 match input_ty {
4031 types::I8 | types::I16 | types::I32 => {
4032 // Conversion from an unsigned int smaller than 64-bit is easy: zero-extend +
4033 // do a signed conversion (which won't overflow).
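// E.g. a 32-bit input of 0xFFFF_FFFF zero-extends to 0x0000_0000_FFFF_FFFF,
// which is positive as a signed 64-bit value, so cvtsi2s{s,d} converts it
// without overflow.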
4034 let opcode = if ty == types::F32 {
4035 SseOpcode::Cvtsi2ss
4036 } else {
4037 assert_eq!(ty, types::F64);
4038 SseOpcode::Cvtsi2sd
4039 };
4040
4041 let src = RegMem::reg(extend_input_to_reg(
4042 ctx,
4043 inputs[0],
4044 ExtSpec::ZeroExtendTo64,
4045 ));
4046 ctx.emit(Inst::gpr_to_xmm(opcode, src, OperandSize::Size64, dst));
4047 }
4048
4049 types::I64 => {
4050 let src = put_input_in_reg(ctx, inputs[0]);
4051
4052 let src_copy = ctx.alloc_tmp(types::I64).only_reg().unwrap();
4053 ctx.emit(Inst::gen_move(src_copy, src, types::I64));
4054
4055 let tmp_gpr1 = ctx.alloc_tmp(types::I64).only_reg().unwrap();
4056 let tmp_gpr2 = ctx.alloc_tmp(types::I64).only_reg().unwrap();
4057 ctx.emit(Inst::cvt_u64_to_float_seq(
4058 if ty == types::F64 {
4059 OperandSize::Size64
4060 } else {
4061 OperandSize::Size32
4062 },
4063 src_copy,
4064 tmp_gpr1,
4065 tmp_gpr2,
4066 dst,
4067 ));
4068 }
4069 _ => panic!("unexpected input type for FcvtFromUint: {:?}", input_ty),
4070 };
4071 } else {
4072 assert_eq!(ctx.input_ty(insn, 0), types::I32X4);
4073 let src = put_input_in_reg(ctx, inputs[0]);
4074 let dst = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
4075
4076 if isa_flags.use_avx512f_simd() || isa_flags.use_avx512vl_simd() {
4077 // When either AVX512VL or AVX512F are available,
4078 // `fcvt_from_uint` can be lowered to a single instruction.
4079 ctx.emit(Inst::xmm_unary_rm_r_evex(
4080 Avx512Opcode::Vcvtudq2ps,
4081 RegMem::reg(src),
4082 dst,
4083 ));
4084 } else {
4085 // Converting packed unsigned integers to packed floats
4086 // requires a few steps. There is no single instruction
4087 // lowering for converting unsigned floats but there is for
4088 // converting packed signed integers to float (cvtdq2ps). In
4089 // the steps below we isolate the upper half (16 bits) and
4090 // lower half (16 bits) of each lane and then we convert
4091 // each half separately using cvtdq2ps meant for signed
// integers. For this to work on the upper half, we must first
// shift those bits right by 1 (divide by 2) so that the most
// significant bit is 0 and the value is not treated as negative,
// and then double the value after the conversion.
4096 // Finally we add the converted values where addition will
4097 // correctly round.
4098 //
4099 // Sequence:
4100 // -> A = 0xffffffff
4101 // -> Ah = 0xffff0000
4102 // -> Al = 0x0000ffff
4103 // -> Convert(Al) // Convert int to float
// -> Ah = Ah >> 1 // Shift right 1 to ensure Ah isn't converted as a negative value
// -> Convert(Ah) // Convert, with no loss of significant digits from the previous shift
// -> Ah = Ah + Ah // Double Ah to account for the shift right before the conversion.
4107 // -> dst = Ah + Al // Add the two floats together
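// As a worked example, consider the lane value 0xFFFF_FFFF (u32::MAX):
// Al = 0x0000_FFFF converts exactly to 65535.0; Ah = 0xFFFF_0000 >> 1 =
// 0x7FFF_8000 converts exactly to 2147450880.0, and doubling it gives
// 4294901760.0. The final addition 4294901760.0 + 65535.0 rounds to
// 4294967296.0, the nearest f32 to u32::MAX.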
4108
4109 // Create a temporary register
4110 let tmp = ctx.alloc_tmp(types::I32X4).only_reg().unwrap();
4111 ctx.emit(Inst::xmm_unary_rm_r(
4112 SseOpcode::Movapd,
4113 RegMem::reg(src),
4114 tmp,
4115 ));
4116 ctx.emit(Inst::gen_move(dst, src, ty));
4117
4118 // Get the low 16 bits
4119 ctx.emit(Inst::xmm_rmi_reg(SseOpcode::Pslld, RegMemImm::imm(16), tmp));
4120 ctx.emit(Inst::xmm_rmi_reg(SseOpcode::Psrld, RegMemImm::imm(16), tmp));
4121
4122 // Get the high 16 bits
4123 ctx.emit(Inst::xmm_rm_r(SseOpcode::Psubd, RegMem::from(tmp), dst));
4124
4125 // Convert the low 16 bits
4126 ctx.emit(Inst::xmm_rm_r(SseOpcode::Cvtdq2ps, RegMem::from(tmp), tmp));
4127
4128 // Shift the high bits by 1, convert, and double to get the correct value.
4129 ctx.emit(Inst::xmm_rmi_reg(SseOpcode::Psrld, RegMemImm::imm(1), dst));
4130 ctx.emit(Inst::xmm_rm_r(SseOpcode::Cvtdq2ps, RegMem::from(dst), dst));
4131 ctx.emit(Inst::xmm_rm_r(
4132 SseOpcode::Addps,
4133 RegMem::reg(dst.to_reg()),
4134 dst,
4135 ));
4136
4137 // Add together the two converted values.
4138 ctx.emit(Inst::xmm_rm_r(
4139 SseOpcode::Addps,
4140 RegMem::reg(tmp.to_reg()),
4141 dst,
4142 ));
4143 }
4144 }
4145 }
4146
4147 Opcode::FcvtToUint | Opcode::FcvtToUintSat | Opcode::FcvtToSint | Opcode::FcvtToSintSat => {
4148 let src = put_input_in_reg(ctx, inputs[0]);
4149 let dst = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
4150
4151 let input_ty = ctx.input_ty(insn, 0);
4152 if !input_ty.is_vector() {
4153 let src_size = if input_ty == types::F32 {
4154 OperandSize::Size32
4155 } else {
4156 assert_eq!(input_ty, types::F64);
4157 OperandSize::Size64
4158 };
4159
4160 let output_ty = ty.unwrap();
4161 let dst_size = if output_ty == types::I32 {
4162 OperandSize::Size32
4163 } else {
4164 assert_eq!(output_ty, types::I64);
4165 OperandSize::Size64
4166 };
4167
4168 let to_signed = op == Opcode::FcvtToSint || op == Opcode::FcvtToSintSat;
4169 let is_sat = op == Opcode::FcvtToUintSat || op == Opcode::FcvtToSintSat;
4170
4171 let src_copy = ctx.alloc_tmp(input_ty).only_reg().unwrap();
4172 ctx.emit(Inst::gen_move(src_copy, src, input_ty));
4173
4174 let tmp_xmm = ctx.alloc_tmp(input_ty).only_reg().unwrap();
4175 let tmp_gpr = ctx.alloc_tmp(output_ty).only_reg().unwrap();
4176
4177 if to_signed {
4178 ctx.emit(Inst::cvt_float_to_sint_seq(
4179 src_size, dst_size, is_sat, src_copy, dst, tmp_gpr, tmp_xmm,
4180 ));
4181 } else {
4182 ctx.emit(Inst::cvt_float_to_uint_seq(
4183 src_size, dst_size, is_sat, src_copy, dst, tmp_gpr, tmp_xmm,
4184 ));
4185 }
4186 } else {
4187 if op == Opcode::FcvtToSintSat {
4188 // Sets destination to zero if float is NaN
4189 assert_eq!(types::F32X4, ctx.input_ty(insn, 0));
4190 let tmp = ctx.alloc_tmp(types::I32X4).only_reg().unwrap();
4191 ctx.emit(Inst::xmm_unary_rm_r(
4192 SseOpcode::Movapd,
4193 RegMem::reg(src),
4194 tmp,
4195 ));
4196 ctx.emit(Inst::gen_move(dst, src, input_ty));
4197 let cond = FcmpImm::from(FloatCC::Equal);
4198 ctx.emit(Inst::xmm_rm_r_imm(
4199 SseOpcode::Cmpps,
4200 RegMem::reg(tmp.to_reg()),
4201 tmp,
4202 cond.encode(),
4203 OperandSize::Size32,
4204 ));
4205 ctx.emit(Inst::xmm_rm_r(
4206 SseOpcode::Andps,
4207 RegMem::reg(tmp.to_reg()),
4208 dst,
4209 ));
4210
4211 // Sets top bit of tmp if float is positive
4212 // Setting up to set top bit on negative float values
4213 ctx.emit(Inst::xmm_rm_r(
4214 SseOpcode::Pxor,
4215 RegMem::reg(dst.to_reg()),
4216 tmp,
4217 ));
4218
4219 // Convert the packed float to packed doubleword.
4220 ctx.emit(Inst::xmm_rm_r(
4221 SseOpcode::Cvttps2dq,
4222 RegMem::reg(dst.to_reg()),
4223 dst,
4224 ));
4225
4226 // Set top bit only if < 0
4227 // Saturate lane with sign (top) bit.
4228 ctx.emit(Inst::xmm_rm_r(
4229 SseOpcode::Pand,
4230 RegMem::reg(dst.to_reg()),
4231 tmp,
4232 ));
4233 ctx.emit(Inst::xmm_rmi_reg(SseOpcode::Psrad, RegMemImm::imm(31), tmp));
4234
4235 // On overflow 0x80000000 is returned to a lane.
4236 // Below sets positive overflow lanes to 0x7FFFFFFF
4237 // Keeps negative overflow lanes as is.
4238 ctx.emit(Inst::xmm_rm_r(
4239 SseOpcode::Pxor,
4240 RegMem::reg(tmp.to_reg()),
4241 dst,
4242 ));
4243 } else if op == Opcode::FcvtToUintSat {
4244 // The algorithm for converting floats to unsigned ints is a little tricky. The
4245 // complication arises because we are converting from a signed 64-bit int with a positive
4246 // integer range from 1..INT_MAX (0x1..0x7FFFFFFF) to an unsigned integer with an extended
4247 // range from (INT_MAX+1)..UINT_MAX. It's this range from (INT_MAX+1)..UINT_MAX
4248 // (0x80000000..0xFFFFFFFF) that needs to be accounted for as a special case since our
4249 // conversion instruction (cvttps2dq) only converts as high as INT_MAX (0x7FFFFFFF), but
// which conveniently sets underflows and overflows (smaller than MIN_INT or larger than
// MAX_INT) to INT_MAX+1 (0x80000000). Noting that the range (INT_MAX+1)..UINT_MAX contains
// precisely INT_MAX values, we can correctly account for and convert every value in this range
// if we simply subtract INT_MAX+1 before doing the cvttps2dq conversion. After the subtraction,
// every value originally in (INT_MAX+1)..UINT_MAX is now in the range (0..INT_MAX).
4255 // After the conversion we add INT_MAX+1 back to this converted value, noting again that
4256 // values we are trying to account for were already set to INT_MAX+1 during the original conversion.
4257 // We simply have to create a mask and make sure we are adding together only the lanes that need
4258 // to be accounted for. Digesting it all the steps then are:
4259 //
4260 // Step 1 - Account for NaN and negative floats by setting these src values to zero.
4261 // Step 2 - Make a copy (tmp1) of the src value since we need to convert twice for
4262 // reasons described above.
4263 // Step 3 - Convert the original src values. This will convert properly all floats up to INT_MAX
4264 // Step 4 - Subtract INT_MAX from the copy set (tmp1). Note, all zero and negative values are those
4265 // values that were originally in the range (0..INT_MAX). This will come in handy during
4266 // step 7 when we zero negative lanes.
4267 // Step 5 - Create a bit mask for tmp1 that will correspond to all lanes originally less than
4268 // UINT_MAX that are now less than INT_MAX thanks to the subtraction.
4269 // Step 6 - Convert the second set of values (tmp1)
4270 // Step 7 - Prep the converted second set by zeroing out negative lanes (these have already been
4271 // converted correctly with the first set) and by setting overflow lanes to 0x7FFFFFFF
4272 // as this will allow us to properly saturate overflow lanes when adding to 0x80000000
// Step 8 - Add the original converted src and the converted tmp1: float values originally less
// than or equal to INT_MAX are unchanged, float values originally between INT_MAX+1 and
// UINT_MAX add together as (INT_MAX) + (SRC - INT_MAX), and float values originally
// greater than UINT_MAX saturate to UINT_MAX (0xFFFFFFFF) after adding (0x80000000 + 0x7FFFFFFF).
4277 //
4278 //
4279 // The table below illustrates the result after each step where it matters for the converted set.
4280 // Note the original value range (original src set) is the final dst in Step 8:
4281 //
4282 // Original src set:
4283 // | Original Value Range | Step 1 | Step 3 | Step 8 |
4284 // | -FLT_MIN..FLT_MAX | 0.0..FLT_MAX | 0..INT_MAX(w/overflow) | 0..UINT_MAX(w/saturation) |
4285 //
4286 // Copied src set (tmp1):
4287 // | Step 2 | Step 4 |
4288 // | 0.0..FLT_MAX | (0.0-(INT_MAX+1))..(FLT_MAX-(INT_MAX+1)) |
4289 //
4290 // | Step 6 | Step 7 |
4291 // | (0-(INT_MAX+1))..(UINT_MAX-(INT_MAX+1))(w/overflow) | ((INT_MAX+1)-(INT_MAX+1))..(INT_MAX+1) |
4292
4293 // Create temporaries
4294 assert_eq!(types::F32X4, ctx.input_ty(insn, 0));
4295 let tmp1 = ctx.alloc_tmp(types::I32X4).only_reg().unwrap();
4296 let tmp2 = ctx.alloc_tmp(types::I32X4).only_reg().unwrap();
4297
4298 // Converting to unsigned int so if float src is negative or NaN
4299 // will first set to zero.
4300 ctx.emit(Inst::xmm_rm_r(SseOpcode::Pxor, RegMem::from(tmp2), tmp2));
4301 ctx.emit(Inst::gen_move(dst, src, input_ty));
4302 ctx.emit(Inst::xmm_rm_r(SseOpcode::Maxps, RegMem::from(tmp2), dst));
4303
// Set tmp2 to INT_MAX+1. It may look like we are only converting INT_MAX
// (0x7FFFFFFF) here, but single-precision IEEE-754 floats can only
// accurately represent contiguous integers up to 2^23; outside of that
// range, conversion rounds to the closest integer that can be represented.
// In the case of INT_MAX, the value gets represented as 0x4f000000, which
// is the float 2147483648.0, i.e. the integer value INT_MAX+1.
4310
4311 ctx.emit(Inst::xmm_rm_r(SseOpcode::Pcmpeqd, RegMem::from(tmp2), tmp2));
4312 ctx.emit(Inst::xmm_rmi_reg(SseOpcode::Psrld, RegMemImm::imm(1), tmp2));
4313 ctx.emit(Inst::xmm_rm_r(
4314 SseOpcode::Cvtdq2ps,
4315 RegMem::from(tmp2),
4316 tmp2,
4317 ));
4318
4319 // Make a copy of these lanes and then do the first conversion.
4320 // Overflow lanes greater than the maximum allowed signed value will
4321 // set to 0x80000000. Negative and NaN lanes will be 0x0
4322 ctx.emit(Inst::xmm_mov(SseOpcode::Movaps, RegMem::from(dst), tmp1));
4323 ctx.emit(Inst::xmm_rm_r(SseOpcode::Cvttps2dq, RegMem::from(dst), dst));
4324
4325 // Set lanes to src - max_signed_int
4326 ctx.emit(Inst::xmm_rm_r(SseOpcode::Subps, RegMem::from(tmp2), tmp1));
4327
// Create a mask for all positive lanes that must saturate (i.e. are greater
// than or equal to the maximum allowable unsigned int).
4330 let cond = FcmpImm::from(FloatCC::LessThanOrEqual);
4331 ctx.emit(Inst::xmm_rm_r_imm(
4332 SseOpcode::Cmpps,
4333 RegMem::from(tmp1),
4334 tmp2,
4335 cond.encode(),
4336 OperandSize::Size32,
4337 ));
4338
4339 // Convert those set of lanes that have the max_signed_int factored out.
4340 ctx.emit(Inst::xmm_rm_r(
4341 SseOpcode::Cvttps2dq,
4342 RegMem::from(tmp1),
4343 tmp1,
4344 ));
4345
4346 // Prepare converted lanes by zeroing negative lanes and prepping lanes
4347 // that have positive overflow (based on the mask) by setting these lanes
4348 // to 0x7FFFFFFF
4349 ctx.emit(Inst::xmm_rm_r(SseOpcode::Pxor, RegMem::from(tmp2), tmp1));
4350 ctx.emit(Inst::xmm_rm_r(SseOpcode::Pxor, RegMem::from(tmp2), tmp2));
4351 ctx.emit(Inst::xmm_rm_r(SseOpcode::Pmaxsd, RegMem::from(tmp2), tmp1));
4352
4353 // Add this second set of converted lanes to the original to properly handle
4354 // values greater than max signed int.
4355 ctx.emit(Inst::xmm_rm_r(SseOpcode::Paddd, RegMem::from(tmp1), dst));
4356 } else {
// Since this branch is also guarded by a check for vector types,
// neither Opcode::FcvtToUint nor Opcode::FcvtToSint can reach here,
// as vector variants of those opcodes do not exist. The first two
// branches cover all reachable cases.
4361 unreachable!();
4362 }
4363 }
4364 }
4365 Opcode::UwidenHigh | Opcode::UwidenLow | Opcode::SwidenHigh | Opcode::SwidenLow => {
4366 let input_ty = ctx.input_ty(insn, 0);
4367 let output_ty = ctx.output_ty(insn, 0);
4368 let src = put_input_in_reg(ctx, inputs[0]);
4369 let dst = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
4370 if output_ty.is_vector() {
4371 match op {
4372 Opcode::SwidenLow => match (input_ty, output_ty) {
4373 (types::I8X16, types::I16X8) => {
4374 ctx.emit(Inst::xmm_mov(SseOpcode::Pmovsxbw, RegMem::reg(src), dst));
4375 }
4376 (types::I16X8, types::I32X4) => {
4377 ctx.emit(Inst::xmm_mov(SseOpcode::Pmovsxwd, RegMem::reg(src), dst));
4378 }
4379 _ => unreachable!(),
4380 },
4381 Opcode::SwidenHigh => match (input_ty, output_ty) {
4382 (types::I8X16, types::I16X8) => {
4383 ctx.emit(Inst::gen_move(dst, src, output_ty));
4384 ctx.emit(Inst::xmm_rm_r_imm(
4385 SseOpcode::Palignr,
4386 RegMem::reg(src),
4387 dst,
4388 8,
4389 OperandSize::Size32,
4390 ));
4391 ctx.emit(Inst::xmm_mov(SseOpcode::Pmovsxbw, RegMem::from(dst), dst));
4392 }
4393 (types::I16X8, types::I32X4) => {
4394 ctx.emit(Inst::gen_move(dst, src, output_ty));
4395 ctx.emit(Inst::xmm_rm_r_imm(
4396 SseOpcode::Palignr,
4397 RegMem::reg(src),
4398 dst,
4399 8,
4400 OperandSize::Size32,
4401 ));
4402 ctx.emit(Inst::xmm_mov(SseOpcode::Pmovsxwd, RegMem::from(dst), dst));
4403 }
4404 _ => unreachable!(),
4405 },
4406 Opcode::UwidenLow => match (input_ty, output_ty) {
4407 (types::I8X16, types::I16X8) => {
4408 ctx.emit(Inst::xmm_mov(SseOpcode::Pmovzxbw, RegMem::reg(src), dst));
4409 }
4410 (types::I16X8, types::I32X4) => {
4411 ctx.emit(Inst::xmm_mov(SseOpcode::Pmovzxwd, RegMem::reg(src), dst));
4412 }
4413 _ => unreachable!(),
4414 },
4415 Opcode::UwidenHigh => match (input_ty, output_ty) {
4416 (types::I8X16, types::I16X8) => {
4417 ctx.emit(Inst::gen_move(dst, src, output_ty));
4418 ctx.emit(Inst::xmm_rm_r_imm(
4419 SseOpcode::Palignr,
4420 RegMem::reg(src),
4421 dst,
4422 8,
4423 OperandSize::Size32,
4424 ));
4425 ctx.emit(Inst::xmm_mov(SseOpcode::Pmovzxbw, RegMem::from(dst), dst));
4426 }
4427 (types::I16X8, types::I32X4) => {
4428 ctx.emit(Inst::gen_move(dst, src, output_ty));
4429 ctx.emit(Inst::xmm_rm_r_imm(
4430 SseOpcode::Palignr,
4431 RegMem::reg(src),
4432 dst,
4433 8,
4434 OperandSize::Size32,
4435 ));
4436 ctx.emit(Inst::xmm_mov(SseOpcode::Pmovzxwd, RegMem::from(dst), dst));
4437 }
4438 _ => unreachable!(),
4439 },
4440 _ => unreachable!(),
4441 }
4442 } else {
4443 panic!("Unsupported non-vector type for widen instruction {:?}", ty);
4444 }
4445 }
4446 Opcode::Snarrow | Opcode::Unarrow => {
4447 let input_ty = ctx.input_ty(insn, 0);
4448 let output_ty = ctx.output_ty(insn, 0);
4449 let src1 = put_input_in_reg(ctx, inputs[0]);
4450 let src2 = put_input_in_reg(ctx, inputs[1]);
4451 let dst = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
4452 if output_ty.is_vector() {
4453 match op {
4454 Opcode::Snarrow => match (input_ty, output_ty) {
4455 (types::I16X8, types::I8X16) => {
4456 ctx.emit(Inst::gen_move(dst, src1, input_ty));
4457 ctx.emit(Inst::xmm_rm_r(SseOpcode::Packsswb, RegMem::reg(src2), dst));
4458 }
4459 (types::I32X4, types::I16X8) => {
4460 ctx.emit(Inst::gen_move(dst, src1, input_ty));
4461 ctx.emit(Inst::xmm_rm_r(SseOpcode::Packssdw, RegMem::reg(src2), dst));
4462 }
4463 _ => unreachable!(),
4464 },
4465 Opcode::Unarrow => match (input_ty, output_ty) {
4466 (types::I16X8, types::I8X16) => {
4467 ctx.emit(Inst::gen_move(dst, src1, input_ty));
4468 ctx.emit(Inst::xmm_rm_r(SseOpcode::Packuswb, RegMem::reg(src2), dst));
4469 }
4470 (types::I32X4, types::I16X8) => {
4471 ctx.emit(Inst::gen_move(dst, src1, input_ty));
4472 ctx.emit(Inst::xmm_rm_r(SseOpcode::Packusdw, RegMem::reg(src2), dst));
4473 }
4474 _ => unreachable!(),
4475 },
4476 _ => unreachable!(),
4477 }
4478 } else {
4479 panic!("Unsupported non-vector type for widen instruction {:?}", ty);
4480 }
4481 }
4482 Opcode::Bitcast => {
4483 let input_ty = ctx.input_ty(insn, 0);
4484 let output_ty = ctx.output_ty(insn, 0);
4485 match (input_ty, output_ty) {
4486 (types::F32, types::I32) => {
4487 let src = put_input_in_reg(ctx, inputs[0]);
4488 let dst = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
4489 ctx.emit(Inst::xmm_to_gpr(
4490 SseOpcode::Movd,
4491 src,
4492 dst,
4493 OperandSize::Size32,
4494 ));
4495 }
4496 (types::I32, types::F32) => {
4497 let src = input_to_reg_mem(ctx, inputs[0]);
4498 let dst = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
4499 ctx.emit(Inst::gpr_to_xmm(
4500 SseOpcode::Movd,
4501 src,
4502 OperandSize::Size32,
4503 dst,
4504 ));
4505 }
4506 (types::F64, types::I64) => {
4507 let src = put_input_in_reg(ctx, inputs[0]);
4508 let dst = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
4509 ctx.emit(Inst::xmm_to_gpr(
4510 SseOpcode::Movq,
4511 src,
4512 dst,
4513 OperandSize::Size64,
4514 ));
4515 }
4516 (types::I64, types::F64) => {
4517 let src = input_to_reg_mem(ctx, inputs[0]);
4518 let dst = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
4519 ctx.emit(Inst::gpr_to_xmm(
4520 SseOpcode::Movq,
4521 src,
4522 OperandSize::Size64,
4523 dst,
4524 ));
4525 }
4526 _ => unreachable!("invalid bitcast from {:?} to {:?}", input_ty, output_ty),
4527 }
4528 }
4529
4530 Opcode::Fabs | Opcode::Fneg => {
4531 let src = RegMem::reg(put_input_in_reg(ctx, inputs[0]));
4532 let dst = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
4533
4534 // In both cases, generate a constant and apply a single binary instruction:
4535 // - to compute the absolute value, set all bits to 1 but the MSB to 0, and bit-AND the
4536 // src with it.
4537 // - to compute the negated value, set all bits to 0 but the MSB to 1, and bit-XOR the
4538 // src with it.
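// For example, fabs on the F32 value -2.5 (0xC0200000) computes
// 0xC0200000 & 0x7FFFFFFF = 0x40200000, which is +2.5; fneg instead computes
// 0xC0200000 ^ 0x80000000 = 0x40200000, flipping only the sign bit.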
4539 let output_ty = ty.unwrap();
4540 if !output_ty.is_vector() {
4541 let (val, opcode): (u64, _) = match output_ty {
4542 types::F32 => match op {
4543 Opcode::Fabs => (0x7fffffff, SseOpcode::Andps),
4544 Opcode::Fneg => (0x80000000, SseOpcode::Xorps),
4545 _ => unreachable!(),
4546 },
4547 types::F64 => match op {
4548 Opcode::Fabs => (0x7fffffffffffffff, SseOpcode::Andpd),
4549 Opcode::Fneg => (0x8000000000000000, SseOpcode::Xorpd),
4550 _ => unreachable!(),
4551 },
4552 _ => panic!("unexpected type {:?} for Fabs", output_ty),
4553 };
4554
4555 for inst in Inst::gen_constant(ValueRegs::one(dst), val as u128, output_ty, |ty| {
4556 ctx.alloc_tmp(ty).only_reg().unwrap()
4557 }) {
4558 ctx.emit(inst);
4559 }
4560
4561 ctx.emit(Inst::xmm_rm_r(opcode, src, dst));
4562 } else {
4563 // Eventually vector constants should be available in `gen_constant` and this block
4564 // can be merged with the one above (TODO).
4565 if output_ty.bits() == 128 {
4566 // Move the `lhs` to the same register as `dst`; this may not emit an actual move
4567 // but ensures that the registers are the same to match x86's read-write operand
4568 // encoding.
4569 let src = put_input_in_reg(ctx, inputs[0]);
4570 ctx.emit(Inst::gen_move(dst, src, output_ty));
4571
// Generate an all-1s constant in an XMM register. This uses CMPPS, but could
// have used CMPPD with the same effect. Note that we zero the temp we allocate
// first because, if we didn't, the register could happen to hold a NaN, in
// which case the CMPPS equality test would fail since NaN != NaN.
4576 let tmp = ctx.alloc_tmp(output_ty).only_reg().unwrap();
4577 ctx.emit(Inst::xmm_rm_r(SseOpcode::Xorps, RegMem::from(tmp), tmp));
4578 let cond = FcmpImm::from(FloatCC::Equal);
4579 let cmpps = Inst::xmm_rm_r_imm(
4580 SseOpcode::Cmpps,
4581 RegMem::reg(tmp.to_reg()),
4582 tmp,
4583 cond.encode(),
4584 OperandSize::Size32,
4585 );
4586 ctx.emit(cmpps);
4587
4588 // Shift the all 1s constant to generate the mask.
4589 let lane_bits = output_ty.lane_bits();
4590 let (shift_opcode, opcode, shift_by) = match (op, lane_bits) {
4591 (Opcode::Fabs, 32) => (SseOpcode::Psrld, SseOpcode::Andps, 1),
4592 (Opcode::Fabs, 64) => (SseOpcode::Psrlq, SseOpcode::Andpd, 1),
4593 (Opcode::Fneg, 32) => (SseOpcode::Pslld, SseOpcode::Xorps, 31),
4594 (Opcode::Fneg, 64) => (SseOpcode::Psllq, SseOpcode::Xorpd, 63),
4595 _ => unreachable!(
4596 "unexpected opcode and lane size: {:?}, {} bits",
4597 op, lane_bits
4598 ),
4599 };
4600 let shift = Inst::xmm_rmi_reg(shift_opcode, RegMemImm::imm(shift_by), tmp);
4601 ctx.emit(shift);
4602
4603 // Apply shifted mask (XOR or AND).
4604 let mask = Inst::xmm_rm_r(opcode, RegMem::reg(tmp.to_reg()), dst);
4605 ctx.emit(mask);
4606 } else {
4607 panic!("unexpected type {:?} for Fabs", output_ty);
4608 }
4609 }
4610 }
4611
4612 Opcode::Fcopysign => {
4613 let dst = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
4614 let lhs = put_input_in_reg(ctx, inputs[0]);
4615 let rhs = put_input_in_reg(ctx, inputs[1]);
4616
4617 let ty = ty.unwrap();
4618
4619 // We're going to generate the following sequence:
4620 //
4621 // movabs $INT_MIN, tmp_gpr1
4622 // mov{d,q} tmp_gpr1, tmp_xmm1
4623 // movap{s,d} tmp_xmm1, dst
4624 // andnp{s,d} src_1, dst
4625 // movap{s,d} src_2, tmp_xmm2
4626 // andp{s,d} tmp_xmm1, tmp_xmm2
4627 // orp{s,d} tmp_xmm2, dst
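// For example, fcopysign(-1.5, +2.0) with F32: dst = !0x80000000 & 0xBFC00000
// = 0x3FC00000 (the magnitude of the first operand), tmp_xmm2 = 0x40000000 &
// 0x80000000 = 0 (the sign of the second), and OR-ing the two yields +1.5.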
4628
4629 let tmp_xmm1 = ctx.alloc_tmp(types::F32).only_reg().unwrap();
4630 let tmp_xmm2 = ctx.alloc_tmp(types::F32).only_reg().unwrap();
4631
4632 let (sign_bit_cst, mov_op, and_not_op, and_op, or_op) = match ty {
4633 types::F32 => (
4634 0x8000_0000,
4635 SseOpcode::Movaps,
4636 SseOpcode::Andnps,
4637 SseOpcode::Andps,
4638 SseOpcode::Orps,
4639 ),
4640 types::F64 => (
4641 0x8000_0000_0000_0000,
4642 SseOpcode::Movapd,
4643 SseOpcode::Andnpd,
4644 SseOpcode::Andpd,
4645 SseOpcode::Orpd,
4646 ),
4647 _ => {
4648 panic!("unexpected type {:?} for copysign", ty);
4649 }
4650 };
4651
4652 for inst in Inst::gen_constant(ValueRegs::one(tmp_xmm1), sign_bit_cst, ty, |ty| {
4653 ctx.alloc_tmp(ty).only_reg().unwrap()
4654 }) {
4655 ctx.emit(inst);
4656 }
4657 ctx.emit(Inst::xmm_mov(mov_op, RegMem::reg(tmp_xmm1.to_reg()), dst));
4658 ctx.emit(Inst::xmm_rm_r(and_not_op, RegMem::reg(lhs), dst));
4659 ctx.emit(Inst::xmm_mov(mov_op, RegMem::reg(rhs), tmp_xmm2));
4660 ctx.emit(Inst::xmm_rm_r(
4661 and_op,
4662 RegMem::reg(tmp_xmm1.to_reg()),
4663 tmp_xmm2,
4664 ));
4665 ctx.emit(Inst::xmm_rm_r(or_op, RegMem::reg(tmp_xmm2.to_reg()), dst));
4666 }
4667
4668 Opcode::Ceil | Opcode::Floor | Opcode::Nearest | Opcode::Trunc => {
4669 let ty = ty.unwrap();
4670 if isa_flags.use_sse41() {
4671 let mode = match op {
4672 Opcode::Ceil => RoundImm::RoundUp,
4673 Opcode::Floor => RoundImm::RoundDown,
4674 Opcode::Nearest => RoundImm::RoundNearest,
4675 Opcode::Trunc => RoundImm::RoundZero,
4676 _ => panic!("unexpected opcode {:?} in Ceil/Floor/Nearest/Trunc", op),
4677 };
4678 let op = match ty {
4679 types::F32 => SseOpcode::Roundss,
4680 types::F64 => SseOpcode::Roundsd,
4681 types::F32X4 => SseOpcode::Roundps,
4682 types::F64X2 => SseOpcode::Roundpd,
4683 _ => panic!("unexpected type {:?} in Ceil/Floor/Nearest/Trunc", ty),
4684 };
4685 let src = input_to_reg_mem(ctx, inputs[0]);
4686 let dst = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
4687 ctx.emit(Inst::xmm_rm_r_imm(
4688 op,
4689 src,
4690 dst,
4691 mode.encode(),
4692 OperandSize::Size32,
4693 ));
4694 } else {
// Lower to VM calls when there's no access to SSE4.1.
// Note that for vector types on platforms without SSE4.1 support,
// the lowering below will panic, since no libcall exists for them.
4698 let libcall = match (op, ty) {
4699 (Opcode::Ceil, types::F32) => LibCall::CeilF32,
4700 (Opcode::Ceil, types::F64) => LibCall::CeilF64,
4701 (Opcode::Floor, types::F32) => LibCall::FloorF32,
4702 (Opcode::Floor, types::F64) => LibCall::FloorF64,
4703 (Opcode::Nearest, types::F32) => LibCall::NearestF32,
4704 (Opcode::Nearest, types::F64) => LibCall::NearestF64,
4705 (Opcode::Trunc, types::F32) => LibCall::TruncF32,
4706 (Opcode::Trunc, types::F64) => LibCall::TruncF64,
4707 _ => panic!(
4708 "unexpected type/opcode {:?}/{:?} in Ceil/Floor/Nearest/Trunc",
4709 ty, op
4710 ),
4711 };
4712 emit_vm_call(ctx, flags, triple, libcall, insn, inputs, outputs)?;
4713 }
4714 }
4715
4716 Opcode::Load
4717 | Opcode::Uload8
4718 | Opcode::Sload8
4719 | Opcode::Uload16
4720 | Opcode::Sload16
4721 | Opcode::Uload32
4722 | Opcode::Sload32
4723 | Opcode::LoadComplex
4724 | Opcode::Uload8Complex
4725 | Opcode::Sload8Complex
4726 | Opcode::Uload16Complex
4727 | Opcode::Sload16Complex
4728 | Opcode::Uload32Complex
4729 | Opcode::Sload32Complex
4730 | Opcode::Sload8x8
4731 | Opcode::Uload8x8
4732 | Opcode::Sload16x4
4733 | Opcode::Uload16x4
4734 | Opcode::Sload32x2
4735 | Opcode::Uload32x2 => {
4736 let offset = ctx.data(insn).load_store_offset().unwrap();
4737
4738 let elem_ty = match op {
4739 Opcode::Sload8 | Opcode::Uload8 | Opcode::Sload8Complex | Opcode::Uload8Complex => {
4740 types::I8
4741 }
4742 Opcode::Sload16
4743 | Opcode::Uload16
4744 | Opcode::Sload16Complex
4745 | Opcode::Uload16Complex => types::I16,
4746 Opcode::Sload32
4747 | Opcode::Uload32
4748 | Opcode::Sload32Complex
4749 | Opcode::Uload32Complex => types::I32,
4750 Opcode::Sload8x8
4751 | Opcode::Uload8x8
4752 | Opcode::Sload8x8Complex
4753 | Opcode::Uload8x8Complex => types::I8X8,
4754 Opcode::Sload16x4
4755 | Opcode::Uload16x4
4756 | Opcode::Sload16x4Complex
4757 | Opcode::Uload16x4Complex => types::I16X4,
4758 Opcode::Sload32x2
4759 | Opcode::Uload32x2
4760 | Opcode::Sload32x2Complex
4761 | Opcode::Uload32x2Complex => types::I32X2,
4762 Opcode::Load | Opcode::LoadComplex => ctx.output_ty(insn, 0),
4763 _ => unimplemented!(),
4764 };
4765
4766 let ext_mode = ExtMode::new(elem_ty.bits(), 64);
4767
4768 let sign_extend = match op {
4769 Opcode::Sload8
4770 | Opcode::Sload8Complex
4771 | Opcode::Sload16
4772 | Opcode::Sload16Complex
4773 | Opcode::Sload32
4774 | Opcode::Sload32Complex
4775 | Opcode::Sload8x8
4776 | Opcode::Sload8x8Complex
4777 | Opcode::Sload16x4
4778 | Opcode::Sload16x4Complex
4779 | Opcode::Sload32x2
4780 | Opcode::Sload32x2Complex => true,
4781 _ => false,
4782 };
4783
4784 let amode = match op {
4785 Opcode::Load
4786 | Opcode::Uload8
4787 | Opcode::Sload8
4788 | Opcode::Uload16
4789 | Opcode::Sload16
4790 | Opcode::Uload32
4791 | Opcode::Sload32
4792 | Opcode::Sload8x8
4793 | Opcode::Uload8x8
4794 | Opcode::Sload16x4
4795 | Opcode::Uload16x4
4796 | Opcode::Sload32x2
4797 | Opcode::Uload32x2 => {
4798 assert_eq!(inputs.len(), 1, "only one input for load operands");
4799 lower_to_amode(ctx, inputs[0], offset)
4800 }
4801
4802 Opcode::LoadComplex
4803 | Opcode::Uload8Complex
4804 | Opcode::Sload8Complex
4805 | Opcode::Uload16Complex
4806 | Opcode::Sload16Complex
4807 | Opcode::Uload32Complex
4808 | Opcode::Sload32Complex
4809 | Opcode::Sload8x8Complex
4810 | Opcode::Uload8x8Complex
4811 | Opcode::Sload16x4Complex
4812 | Opcode::Uload16x4Complex
4813 | Opcode::Sload32x2Complex
4814 | Opcode::Uload32x2Complex => {
4815 assert_eq!(
4816 inputs.len(),
4817 2,
4818 "can't handle more than two inputs in complex load"
4819 );
4820 let base = put_input_in_reg(ctx, inputs[0]);
4821 let index = put_input_in_reg(ctx, inputs[1]);
4822 let shift = 0;
4823 let flags = ctx.memflags(insn).expect("load should have memflags");
4824 Amode::imm_reg_reg_shift(offset as u32, base, index, shift).with_flags(flags)
4825 }
4826 _ => unreachable!(),
4827 };
4828
4829 if elem_ty == types::I128 {
4830 let dsts = get_output_reg(ctx, outputs[0]);
4831 ctx.emit(Inst::mov64_m_r(amode.clone(), dsts.regs()[0]));
4832 ctx.emit(Inst::mov64_m_r(amode.offset(8), dsts.regs()[1]));
4833 } else {
4834 let dst = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
4835 let is_xmm = elem_ty.is_float() || elem_ty.is_vector();
4836 match (sign_extend, is_xmm) {
4837 (true, false) => {
4838 // The load is sign-extended only when the output size is lower than 64 bits,
4839 // so ext-mode is defined in this case.
4840 ctx.emit(Inst::movsx_rm_r(ext_mode.unwrap(), RegMem::mem(amode), dst));
4841 }
4842 (false, false) => {
4843 if elem_ty.bytes() == 8 {
4844 // Use a plain load.
4845 ctx.emit(Inst::mov64_m_r(amode, dst))
4846 } else {
4847 // Use a zero-extended load.
4848 ctx.emit(Inst::movzx_rm_r(ext_mode.unwrap(), RegMem::mem(amode), dst))
4849 }
4850 }
4851 (_, true) => {
4852 ctx.emit(match elem_ty {
4853 types::F32 => Inst::xmm_mov(SseOpcode::Movss, RegMem::mem(amode), dst),
4854 types::F64 => Inst::xmm_mov(SseOpcode::Movsd, RegMem::mem(amode), dst),
4855 types::I8X8 => {
if sign_extend {
4857 Inst::xmm_mov(SseOpcode::Pmovsxbw, RegMem::mem(amode), dst)
4858 } else {
4859 Inst::xmm_mov(SseOpcode::Pmovzxbw, RegMem::mem(amode), dst)
4860 }
4861 }
4862 types::I16X4 => {
if sign_extend {
4864 Inst::xmm_mov(SseOpcode::Pmovsxwd, RegMem::mem(amode), dst)
4865 } else {
4866 Inst::xmm_mov(SseOpcode::Pmovzxwd, RegMem::mem(amode), dst)
4867 }
4868 }
4869 types::I32X2 => {
if sign_extend {
4871 Inst::xmm_mov(SseOpcode::Pmovsxdq, RegMem::mem(amode), dst)
4872 } else {
4873 Inst::xmm_mov(SseOpcode::Pmovzxdq, RegMem::mem(amode), dst)
4874 }
4875 }
4876 _ if elem_ty.is_vector() && elem_ty.bits() == 128 => {
4877 Inst::xmm_mov(SseOpcode::Movups, RegMem::mem(amode), dst)
4878 }
4879 // TODO Specialize for different types: MOVUPD, MOVDQU
4880 _ => unreachable!(
4881 "unexpected type for load: {:?} - {:?}",
4882 elem_ty,
4883 elem_ty.bits()
4884 ),
4885 });
4886 }
4887 }
4888 }
4889 }
4890
4891 Opcode::Store
4892 | Opcode::Istore8
4893 | Opcode::Istore16
4894 | Opcode::Istore32
4895 | Opcode::StoreComplex
4896 | Opcode::Istore8Complex
4897 | Opcode::Istore16Complex
4898 | Opcode::Istore32Complex => {
4899 let offset = ctx.data(insn).load_store_offset().unwrap();
4900
4901 let elem_ty = match op {
4902 Opcode::Istore8 | Opcode::Istore8Complex => types::I8,
4903 Opcode::Istore16 | Opcode::Istore16Complex => types::I16,
4904 Opcode::Istore32 | Opcode::Istore32Complex => types::I32,
4905 Opcode::Store | Opcode::StoreComplex => ctx.input_ty(insn, 0),
4906 _ => unreachable!(),
4907 };
4908
4909 let addr = match op {
4910 Opcode::Store | Opcode::Istore8 | Opcode::Istore16 | Opcode::Istore32 => {
assert_eq!(inputs.len(), 2, "two inputs (value, address) for store operands");
4912 lower_to_amode(ctx, inputs[1], offset)
4913 }
4914
4915 Opcode::StoreComplex
4916 | Opcode::Istore8Complex
4917 | Opcode::Istore16Complex
4918 | Opcode::Istore32Complex => {
4919 assert_eq!(
4920 inputs.len(),
4921 3,
4922 "can't handle more than two inputs in complex store"
4923 );
4924 let base = put_input_in_reg(ctx, inputs[1]);
4925 let index = put_input_in_reg(ctx, inputs[2]);
4926 let shift = 0;
4927 let flags = ctx.memflags(insn).expect("store should have memflags");
4928 Amode::imm_reg_reg_shift(offset as u32, base, index, shift).with_flags(flags)
4929 }
4930
4931 _ => unreachable!(),
4932 };
4933
4934 if elem_ty == types::I128 {
4935 let srcs = put_input_in_regs(ctx, inputs[0]);
4936 ctx.emit(Inst::store(types::I64, srcs.regs()[0], addr.clone()));
4937 ctx.emit(Inst::store(types::I64, srcs.regs()[1], addr.offset(8)));
4938 } else {
4939 let src = put_input_in_reg(ctx, inputs[0]);
4940 ctx.emit(Inst::store(elem_ty, src, addr));
4941 }
4942 }
4943
4944 Opcode::AtomicRmw => {
4945 // This is a simple, general-case atomic update, based on a loop involving
4946 // `cmpxchg`. Note that we could do much better than this in the case where the old
4947 // value at the location (that is to say, the SSA `Value` computed by this CLIF
4948 // instruction) is not required. In that case, we could instead implement this
4949 // using a single `lock`-prefixed x64 read-modify-write instruction. Also, even in
4950 // the case where the old value is required, for the `add` and `sub` cases, we can
4951 // use the single instruction `lock xadd`. However, those improvements have been
4952 // left for another day.
4953 // TODO: filed as https://github.com/bytecodealliance/wasmtime/issues/2153
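// Roughly (a sketch, not the exact expansion), AtomicRmwSeq amounts to:
//
// mov (%r9), %rax
// retry:
// mov %rax, %tmp
// <op> %r10, %tmp
// lock cmpxchg %tmp, (%r9) ; compares (%r9) with %rax, stores %tmp on match
// jnz retry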
4954 let dst = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
4955 let mut addr = put_input_in_reg(ctx, inputs[0]);
4956 let mut arg2 = put_input_in_reg(ctx, inputs[1]);
4957 let ty_access = ty.unwrap();
4958 assert!(is_valid_atomic_transaction_ty(ty_access));
4959
4960 // Make sure that both args are in virtual regs, since in effect we have to do a
4961 // parallel copy to get them safely to the AtomicRmwSeq input regs, and that's not
4962 // guaranteed safe if either is in a real reg.
4963 addr = ctx.ensure_in_vreg(addr, types::I64);
4964 arg2 = ctx.ensure_in_vreg(arg2, types::I64);
4965
4966 // Move the args to the preordained AtomicRMW input regs. Note that `AtomicRmwSeq`
4967 // operates at whatever width is specified by `ty`, so there's no need to
4968 // zero-extend `arg2` in the case of `ty` being I8/I16/I32.
4969 ctx.emit(Inst::gen_move(
4970 Writable::from_reg(regs::r9()),
4971 addr,
4972 types::I64,
4973 ));
4974 ctx.emit(Inst::gen_move(
4975 Writable::from_reg(regs::r10()),
4976 arg2,
4977 types::I64,
4978 ));
4979
4980 // Now the AtomicRmwSeq (pseudo-) instruction itself
4981 let op = inst_common::AtomicRmwOp::from(ctx.data(insn).atomic_rmw_op().unwrap());
4982 ctx.emit(Inst::AtomicRmwSeq { ty: ty_access, op });
4983
4984 // And finally, copy the preordained AtomicRmwSeq output reg to its destination.
4985 ctx.emit(Inst::gen_move(dst, regs::rax(), types::I64));
4986 }
4987
4988 Opcode::AtomicCas => {
4989 // This is very similar to, but not identical to, the `AtomicRmw` case. As with
4990 // `AtomicRmw`, there's no need to zero-extend narrow values here.
4991 let dst = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
4992 let addr = lower_to_amode(ctx, inputs[0], 0);
4993 let expected = put_input_in_reg(ctx, inputs[1]);
4994 let replacement = put_input_in_reg(ctx, inputs[2]);
4995 let ty_access = ty.unwrap();
4996 assert!(is_valid_atomic_transaction_ty(ty_access));
4997
4998 // Move the expected value into %rax. Because there's only one fixed register on
4999 // the input side, we don't have to use `ensure_in_vreg`, as is necessary in the
5000 // `AtomicRmw` case.
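// `lock cmpxchg` compares %rax with the memory operand: on a match it stores
// the replacement and sets ZF; otherwise it loads the current memory value
// into %rax. Either way, %rax ends up holding the old value at the location,
// which is exactly what AtomicCas must return.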
5001 ctx.emit(Inst::gen_move(
5002 Writable::from_reg(regs::rax()),
5003 expected,
5004 types::I64,
5005 ));
5006 ctx.emit(Inst::LockCmpxchg {
5007 ty: ty_access,
5008 src: replacement,
5009 dst: addr.into(),
5010 });
5011 // And finally, copy the old value at the location to its destination reg.
5012 ctx.emit(Inst::gen_move(dst, regs::rax(), types::I64));
5013 }
5014
5015 Opcode::AtomicLoad => {
5016 // This is a normal load. The x86-TSO memory model provides sufficient sequencing
5017 // to satisfy the CLIF synchronisation requirements for `AtomicLoad` without the
5018 // need for any fence instructions.
5019 let data = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
5020 let addr = lower_to_amode(ctx, inputs[0], 0);
5021 let ty_access = ty.unwrap();
5022 assert!(is_valid_atomic_transaction_ty(ty_access));
5023
5024 let rm = RegMem::mem(addr);
5025 if ty_access == types::I64 {
5026 ctx.emit(Inst::mov64_rm_r(rm, data));
5027 } else {
5028 let ext_mode = ExtMode::new(ty_access.bits(), 64).expect(&format!(
5029 "invalid extension during AtomicLoad: {} -> {}",
5030 ty_access.bits(),
5031 64
5032 ));
5033 ctx.emit(Inst::movzx_rm_r(ext_mode, rm, data));
5034 }
5035 }
5036
5037 Opcode::AtomicStore => {
5038 // This is a normal store, followed by an `mfence` instruction.
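            // (The fence is needed because x86-TSO still permits an earlier store to be
            // reordered past a later load; the `mfence` rules that out, giving the
            // sequentially-consistent behaviour that CLIF atomics require.)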
            let data = put_input_in_reg(ctx, inputs[0]);
            let addr = lower_to_amode(ctx, inputs[1], 0);
            let ty_access = ctx.input_ty(insn, 0);
            assert!(is_valid_atomic_transaction_ty(ty_access));

            ctx.emit(Inst::store(ty_access, data, addr));
            ctx.emit(Inst::Fence {
                kind: FenceKind::MFence,
            });
        }

        Opcode::Fence => {
            ctx.emit(Inst::Fence {
                kind: FenceKind::MFence,
            });
        }

        Opcode::FuncAddr => {
            let dst = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
            let (extname, _) = ctx.call_target(insn).unwrap();
            let extname = extname.clone();
            ctx.emit(Inst::LoadExtName {
                dst,
                name: Box::new(extname),
                offset: 0,
            });
        }

        Opcode::SymbolValue => {
            let dst = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
            let (extname, _, offset) = ctx.symbol_value(insn).unwrap();
            let extname = extname.clone();
            ctx.emit(Inst::LoadExtName {
                dst,
                name: Box::new(extname),
                offset,
            });
        }

        Opcode::StackAddr => {
            let (stack_slot, offset) = match *ctx.data(insn) {
                InstructionData::StackLoad {
                    opcode: Opcode::StackAddr,
                    stack_slot,
                    offset,
                } => (stack_slot, offset),
                _ => unreachable!(),
            };
            let dst = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
            let offset: i32 = offset.into();
            let inst = ctx
                .abi()
                .stackslot_addr(stack_slot, u32::try_from(offset).unwrap(), dst);
            ctx.emit(inst);
        }

        Opcode::Select => {
            let flag_input = inputs[0];
            if let Some(fcmp) = matches_input(ctx, flag_input, Opcode::Fcmp) {
                let cond_code = ctx.data(fcmp).fp_cond_code().unwrap();

                // For equal, we flip the operands, because we can't test a conjunction of
                // CPU flags with a single cmove; see the InvertedEqualOrConditions doc comment.
                let (lhs_input, rhs_input) = match cond_code {
                    FloatCC::Equal => (inputs[2], inputs[1]),
                    _ => (inputs[1], inputs[2]),
                };

                let ty = ctx.output_ty(insn, 0);
                let rhs = put_input_in_regs(ctx, rhs_input);
                let dst = get_output_reg(ctx, outputs[0]);
                let lhs = put_input_in_regs(ctx, lhs_input);

                // We request inversion of Equal to NotEqual here: taking LHS if equal would
                // mean taking it if both CC::NP and CC::Z are set, a conjunction that can't be
                // modeled with a single cmov instruction. Instead, we'll swap LHS and RHS in
                // the select operation, and invert the equal to a not-equal here.
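                // Concretely, `v = select (fcmp eq x y), a, b` is handled as if it were
                // `v = select (fcmp ne x y), b, a`: we first move `a` into `dst`, then
                // conditionally overwrite it with `b` under the not-equal conditions
                // (CC::P or CC::NZ), each of which can be tested with its own cmov.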
                let fcmp_results = emit_fcmp(ctx, fcmp, cond_code, FcmpSpec::InvertEqual);

                if let FcmpCondResult::InvertedEqualOrConditions(_, _) = &fcmp_results {
                    // Keep this sync'd with the lowering of the select inputs above.
                    assert_eq!(cond_code, FloatCC::Equal);
                }

                emit_moves(ctx, dst, rhs, ty);

                let operand_size = if ty == types::F64 {
                    OperandSize::Size64
                } else {
                    OperandSize::Size32
                };
                match fcmp_results {
                    FcmpCondResult::Condition(cc) => {
                        if is_int_or_ref_ty(ty) || ty == types::I128 || ty == types::B128 {
                            let size = ty.bytes() as u8;
                            emit_cmoves(ctx, size, cc, lhs, dst);
                        } else {
                            ctx.emit(Inst::xmm_cmove(
                                operand_size,
                                cc,
                                RegMem::reg(lhs.only_reg().unwrap()),
                                dst.only_reg().unwrap(),
                            ));
                        }
                    }
                    FcmpCondResult::AndConditions(_, _) => {
                        unreachable!(
                            "can't AND with select; see above comment about inverting equal"
                        );
                    }
                    FcmpCondResult::InvertedEqualOrConditions(cc1, cc2)
                    | FcmpCondResult::OrConditions(cc1, cc2) => {
                        if is_int_or_ref_ty(ty) || ty == types::I128 {
                            let size = ty.bytes() as u8;
                            emit_cmoves(ctx, size, cc1, lhs.clone(), dst);
                            emit_cmoves(ctx, size, cc2, lhs, dst);
                        } else {
                            ctx.emit(Inst::xmm_cmove(
                                operand_size,
                                cc1,
                                RegMem::reg(lhs.only_reg().unwrap()),
                                dst.only_reg().unwrap(),
                            ));
                            ctx.emit(Inst::xmm_cmove(
                                operand_size,
                                cc2,
                                RegMem::reg(lhs.only_reg().unwrap()),
                                dst.only_reg().unwrap(),
                            ));
                        }
                    }
                }
            } else {
                let ty = ty.unwrap();

                let size = ty.bytes() as u8;
                let lhs = put_input_in_regs(ctx, inputs[1]);
                let rhs = put_input_in_regs(ctx, inputs[2]);
                let dst = get_output_reg(ctx, outputs[0]);

                let cc = if let Some(icmp) = matches_input(ctx, flag_input, Opcode::Icmp) {
                    let cond_code = ctx.data(icmp).cond_code().unwrap();
                    let cond_code = emit_cmp(ctx, icmp, cond_code);
                    CC::from_intcc(cond_code)
                } else {
                    let sel_ty = ctx.input_ty(insn, 0);
                    let size = OperandSize::from_ty(sel_ty);
                    let test = put_input_in_reg(ctx, flag_input);
                    let test_input = if sel_ty == types::B1 {
                        // The input is a boolean value; test the LSB for nonzero with:
                        //   test reg, 1
                        RegMemImm::imm(1)
                    } else {
                        // The input is an integer; test the whole value for
                        // nonzero with:
                        //   test reg, reg
                        //
                        // (It doesn't make sense to have a boolean wider than
                        // one bit here -- which bit would cause us to select an
                        // input?)
                        assert!(!is_bool_ty(sel_ty));
                        RegMemImm::reg(test)
                    };
                    ctx.emit(Inst::test_rmi_r(size, test_input, test));
                    CC::NZ
                };

                // This doesn't affect the flags.
                emit_moves(ctx, dst, rhs, ty);

                if is_int_or_ref_ty(ty) || ty == types::I128 {
                    emit_cmoves(ctx, size, cc, lhs, dst);
                } else {
                    debug_assert!(ty == types::F32 || ty == types::F64);
                    ctx.emit(Inst::xmm_cmove(
                        if ty == types::F64 {
                            OperandSize::Size64
                        } else {
                            OperandSize::Size32
                        },
                        cc,
                        RegMem::reg(lhs.only_reg().unwrap()),
                        dst.only_reg().unwrap(),
                    ));
                }
            }
        }

        Opcode::Selectif | Opcode::SelectifSpectreGuard => {
            let lhs = put_input_in_regs(ctx, inputs[1]);
            let rhs = put_input_in_regs(ctx, inputs[2]);
            let dst = get_output_reg(ctx, outputs[0]);
            let ty = ctx.output_ty(insn, 0);

            // Verification ensures that the input is always a single-def ifcmp.
            let cmp_insn = ctx
                .get_input_as_source_or_const(inputs[0].insn, inputs[0].input)
                .inst
                .unwrap()
                .0;
            debug_assert_eq!(ctx.data(cmp_insn).opcode(), Opcode::Ifcmp);
            let cond_code = ctx.data(insn).cond_code().unwrap();
            let cond_code = emit_cmp(ctx, cmp_insn, cond_code);

            let cc = CC::from_intcc(cond_code);

            if is_int_or_ref_ty(ty) || ty == types::I128 {
                let size = ty.bytes() as u8;
                emit_moves(ctx, dst, rhs, ty);
                emit_cmoves(ctx, size, cc, lhs, dst);
            } else {
                debug_assert!(ty == types::F32 || ty == types::F64);
                emit_moves(ctx, dst, rhs, ty);
                ctx.emit(Inst::xmm_cmove(
                    if ty == types::F64 {
                        OperandSize::Size64
                    } else {
                        OperandSize::Size32
                    },
                    cc,
                    RegMem::reg(lhs.only_reg().unwrap()),
                    dst.only_reg().unwrap(),
                ));
            }
        }

        Opcode::Udiv | Opcode::Urem | Opcode::Sdiv | Opcode::Srem => {
            let kind = match op {
                Opcode::Udiv => DivOrRemKind::UnsignedDiv,
                Opcode::Sdiv => DivOrRemKind::SignedDiv,
                Opcode::Urem => DivOrRemKind::UnsignedRem,
                Opcode::Srem => DivOrRemKind::SignedRem,
                _ => unreachable!(),
            };
            let is_div = kind.is_div();

            let input_ty = ctx.input_ty(insn, 0);
            let size = OperandSize::from_ty(input_ty);

            let dividend = put_input_in_reg(ctx, inputs[0]);
            let dst = get_output_reg(ctx, outputs[0]).only_reg().unwrap();

            ctx.emit(Inst::gen_move(
                Writable::from_reg(regs::rax()),
                dividend,
                input_ty,
            ));

            // Always do explicit checks for `srem`: otherwise, INT_MIN % -1 is not handled
            // properly.
            if flags.avoid_div_traps() || op == Opcode::Srem {
                // A vcode meta-instruction is used to lower the inline checks, since they embed
                // pc-relative offsets that must not change, thus requiring regalloc to not
                // interfere by introducing spills and reloads.
                //
                // Note it keeps the result in $rax (for divide) or $rdx (for rem), so that
                // regalloc is aware of the coalescing opportunity between rax/rdx and the
                // destination register.
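                //
                // As a rough sketch, the checked sequence tests the divisor for zero
                // (trapping with an integer-division-by-zero if so) and, for the signed
                // cases, additionally detects INT_MIN / -1, trapping on overflow for
                // `sdiv` and producing a zero remainder for `srem`, before performing
                // the actual division.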
                let divisor = put_input_in_reg(ctx, inputs[1]);

                let divisor_copy = ctx.alloc_tmp(types::I64).only_reg().unwrap();
                ctx.emit(Inst::gen_move(divisor_copy, divisor, types::I64));

                let tmp = if op == Opcode::Sdiv && size == OperandSize::Size64 {
                    Some(ctx.alloc_tmp(types::I64).only_reg().unwrap())
                } else {
                    None
                };
                // TODO use xor
                ctx.emit(Inst::imm(
                    OperandSize::Size32,
                    0,
                    Writable::from_reg(regs::rdx()),
                ));
                ctx.emit(Inst::checked_div_or_rem_seq(kind, size, divisor_copy, tmp));
            } else {
                // We don't want more than one trap record for a single instruction, so
                // let's not allow the "mem" case (load-op merging) here; force the divisor
                // into a register instead.
                let divisor = RegMem::reg(put_input_in_reg(ctx, inputs[1]));

                // Fill in the high parts:
                if kind.is_signed() {
                    // For signed opcodes, sign-extend AL into AH for 8-bit operands, or
                    // RAX into RDX otherwise.
                    ctx.emit(Inst::sign_extend_data(size));
                } else if input_ty == types::I8 {
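                    // An 8-bit DIV takes its dividend in AX (AH:AL), so zero-extending AL
                    // within RAX here also clears AH, which is the "high part" for this size.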
                    ctx.emit(Inst::movzx_rm_r(
                        ExtMode::BL,
                        RegMem::reg(regs::rax()),
                        Writable::from_reg(regs::rax()),
                    ));
                } else {
                    // Zero rdx for unsigned opcodes.
                    ctx.emit(Inst::imm(
                        OperandSize::Size64,
                        0,
                        Writable::from_reg(regs::rdx()),
                    ));
                }

                // Emit the actual idiv.
                ctx.emit(Inst::div(size, kind.is_signed(), divisor));
            }

            // Move the result back into the destination reg.
            if is_div {
                // The quotient is in rax.
                ctx.emit(Inst::gen_move(dst, regs::rax(), input_ty));
            } else {
                if size == OperandSize::Size8 {
                    // The remainder is in AH; right-shift by 8 bits, then move from rax.
                    ctx.emit(Inst::shift_r(
                        OperandSize::Size64,
                        ShiftKind::ShiftRightLogical,
                        Some(8),
                        Writable::from_reg(regs::rax()),
                    ));
                    ctx.emit(Inst::gen_move(dst, regs::rax(), input_ty));
                } else {
                    // The remainder is in rdx.
                    ctx.emit(Inst::gen_move(dst, regs::rdx(), input_ty));
                }
            }
        }

        Opcode::Umulhi | Opcode::Smulhi => {
            let input_ty = ctx.input_ty(insn, 0);

            let lhs = put_input_in_reg(ctx, inputs[0]);
            let rhs = input_to_reg_mem(ctx, inputs[1]);
            let dst = get_output_reg(ctx, outputs[0]).only_reg().unwrap();

            // Move lhs into %rax.
            ctx.emit(Inst::gen_move(
                Writable::from_reg(regs::rax()),
                lhs,
                input_ty,
            ));

            // Emit the actual mul or imul.
            let signed = op == Opcode::Smulhi;
            ctx.emit(Inst::mul_hi(OperandSize::from_ty(input_ty), signed, rhs));
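            // For 16-, 32- and 64-bit operands, the one-operand MUL/IMUL forms compute a
            // double-width product into rdx:rax, so the high half we want lands in rdx.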

            // Read the result from the high part (stored in %rdx).
            ctx.emit(Inst::gen_move(dst, regs::rdx(), input_ty));
        }

        Opcode::GetPinnedReg => {
            let dst = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
            ctx.emit(Inst::gen_move(dst, regs::pinned_reg(), types::I64));
        }

        Opcode::SetPinnedReg => {
            let src = put_input_in_reg(ctx, inputs[0]);
            ctx.emit(Inst::gen_move(
                Writable::from_reg(regs::pinned_reg()),
                src,
                types::I64,
            ));
        }

        Opcode::Vconst => {
            let used_constant = if let &InstructionData::UnaryConst {
                constant_handle, ..
            } = ctx.data(insn)
            {
                ctx.use_constant(VCodeConstantData::Pool(
                    constant_handle,
                    ctx.get_constant_data(constant_handle).clone(),
                ))
            } else {
                unreachable!("vconst should always have unary_const format")
            };
            // TODO use Inst::gen_constant() instead.
            let dst = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
            let ty = ty.unwrap();
            ctx.emit(Inst::xmm_load_const(used_constant, dst, ty));
        }

        Opcode::RawBitcast => {
            // A raw_bitcast is just a mechanism for correcting the type of V128 values (see
            // https://github.com/bytecodealliance/wasmtime/issues/1147). As such, this IR
            // instruction should emit no machine code, but a move is necessary to give the
            // register allocator a definition for the output virtual register.
            let src = put_input_in_reg(ctx, inputs[0]);
            let dst = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
            let ty = ty.unwrap();
            ctx.emit(Inst::gen_move(dst, src, ty));
        }

        Opcode::Shuffle => {
            let ty = ty.unwrap();
            let dst = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
            let lhs_ty = ctx.input_ty(insn, 0);
            let lhs = put_input_in_reg(ctx, inputs[0]);
            let rhs = put_input_in_reg(ctx, inputs[1]);
            let mask = match ctx.get_immediate(insn) {
                Some(DataValue::V128(bytes)) => bytes.to_vec(),
                _ => unreachable!("shuffle should always have a 16-byte immediate"),
            };

            // A mask-building helper: in 128-bit SIMD, 0-15 indicate which lane to read
            // from, and a 1 in the most significant bit zeroes the lane.
            let zero_unknown_lane_index = |b: u8| if b > 15 { 0b10000000 } else { b };
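            // For example, a mask byte of 3 selects lane 3, while a byte of 17 (out of
            // range for a single 16-byte vector) maps to 0b10000000, which PSHUFB
            // interprets as "write a zero to this lane".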

            ctx.emit(Inst::gen_move(dst, rhs, ty));
            if rhs == lhs {
                // If `lhs` and `rhs` are the same we can use a single PSHUFB to shuffle the XMM
                // register. We statically build `constructed_mask` to zero out any unknown lane
                // indices (this may not be strictly necessary: verification could reject
                // incorrect mask values) and fix the indexes to all point to the `dst` vector.
                let constructed_mask = mask
                    .iter()
                    // If the mask index is greater than 15 it may still refer to a lane in b.
                    .map(|&b| if b > 15 { b.wrapping_sub(16) } else { b })
                    .map(zero_unknown_lane_index)
                    .collect();
                let constant = ctx.use_constant(VCodeConstantData::Generated(constructed_mask));
                let tmp = ctx.alloc_tmp(types::I8X16).only_reg().unwrap();
                ctx.emit(Inst::xmm_load_const(constant, tmp, ty));
                // After loading the constructed mask in a temporary register, we use this to
                // shuffle the `dst` register (remember that, in this case, it is the same as
                // `src` so we disregard this register).
                ctx.emit(Inst::xmm_rm_r(SseOpcode::Pshufb, RegMem::from(tmp), dst));
            } else {
                // If `lhs` and `rhs` are different, we must shuffle each separately and then OR
                // them together. This is necessary due to PSHUFB semantics. As in the case above,
                // we build the `constructed_mask` for each case statically.

                // PSHUFB the `lhs` argument into `tmp0`, placing zeroes for unused lanes.
                let tmp0 = ctx.alloc_tmp(lhs_ty).only_reg().unwrap();
                ctx.emit(Inst::gen_move(tmp0, lhs, lhs_ty));
                let constructed_mask = mask.iter().cloned().map(zero_unknown_lane_index).collect();
                let constant = ctx.use_constant(VCodeConstantData::Generated(constructed_mask));
                let tmp1 = ctx.alloc_tmp(types::I8X16).only_reg().unwrap();
                ctx.emit(Inst::xmm_load_const(constant, tmp1, ty));
                ctx.emit(Inst::xmm_rm_r(SseOpcode::Pshufb, RegMem::from(tmp1), tmp0));

                // PSHUFB the second argument, placing zeroes for unused lanes.
                let constructed_mask = mask
                    .iter()
                    .map(|b| b.wrapping_sub(16))
                    .map(zero_unknown_lane_index)
                    .collect();
                let constant = ctx.use_constant(VCodeConstantData::Generated(constructed_mask));
                let tmp2 = ctx.alloc_tmp(types::I8X16).only_reg().unwrap();
                ctx.emit(Inst::xmm_load_const(constant, tmp2, ty));
                ctx.emit(Inst::xmm_rm_r(SseOpcode::Pshufb, RegMem::from(tmp2), dst));

                // OR the shuffled registers (the mechanism and lane-size for OR-ing the
                // registers is not important).
                ctx.emit(Inst::xmm_rm_r(SseOpcode::Orps, RegMem::from(tmp0), dst));

                // TODO: when AVX512 is enabled, we should replace this sequence with a single
                // VPERMB.
            }
        }

        Opcode::Swizzle => {
            // SIMD swizzle; the following inefficient implementation is due to the Wasm SIMD
            // spec requiring mask indexes greater than 15 to zero the corresponding output
            // lane. For the spec discussion, see https://github.com/WebAssembly/simd/issues/93.
            // The CLIF semantics match the Wasm SIMD semantics for this instruction.
            // The instruction format maps to variables like: %dst = swizzle %src, %mask
            let ty = ty.unwrap();
            let dst = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
            let src = put_input_in_reg(ctx, inputs[0]);
            let swizzle_mask = put_input_in_reg(ctx, inputs[1]);

            // Inform the register allocator that `src` and `dst` should be in the same
            // register.
            ctx.emit(Inst::gen_move(dst, src, ty));

            // Create a mask for zeroing out-of-bounds lanes of the swizzle mask.
            let zero_mask = ctx.alloc_tmp(types::I8X16).only_reg().unwrap();
            static ZERO_MASK_VALUE: [u8; 16] = [
                0x70, 0x70, 0x70, 0x70, 0x70, 0x70, 0x70, 0x70, 0x70, 0x70, 0x70, 0x70, 0x70,
                0x70, 0x70, 0x70,
            ];
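            // The 0x70 value exploits PSHUFB's semantics: a lane is zeroed when the top
            // bit of its mask byte is set, and otherwise only the low four bits are used.
            // Adding 0x70 with unsigned saturation (the PADDUSB below) leaves indices
            // 0..=15 with their top bit clear and low nibble intact (0x70..=0x7F), while
            // any index >= 16 lands at 0x80 or above and so selects a zeroed lane.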
            let constant = ctx.use_constant(VCodeConstantData::WellKnown(&ZERO_MASK_VALUE));
            ctx.emit(Inst::xmm_load_const(constant, zero_mask, ty));

            // Use the `zero_mask` on a writable `swizzle_mask`.
            let swizzle_mask = Writable::from_reg(swizzle_mask);
            ctx.emit(Inst::xmm_rm_r(
                SseOpcode::Paddusb,
                RegMem::from(zero_mask),
                swizzle_mask,
            ));

            // Shuffle `dst` using the fixed-up `swizzle_mask`.
            ctx.emit(Inst::xmm_rm_r(
                SseOpcode::Pshufb,
                RegMem::from(swizzle_mask),
                dst,
            ));
        }

        Opcode::Insertlane => {
            // The instruction format maps to variables like:
            //   %dst = insertlane %in_vec, %src, %lane
            let ty = ty.unwrap();
            let dst = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
            let in_vec = put_input_in_reg(ctx, inputs[0]);
            let src_ty = ctx.input_ty(insn, 1);
            debug_assert!(!src_ty.is_vector());
            let src = input_to_reg_mem(ctx, inputs[1]);
            let lane = if let InstructionData::TernaryImm8 { imm, .. } = ctx.data(insn) {
                *imm
            } else {
                unreachable!();
            };
            debug_assert!(lane < ty.lane_count() as u8);

            ctx.emit(Inst::gen_move(dst, in_vec, ty));
            emit_insert_lane(ctx, src, dst, lane, ty.lane_type());
        }

        Opcode::Extractlane => {
            // The instruction format maps to variables like: %dst = extractlane %src, %lane
            let ty = ty.unwrap();
            let dst = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
            let src_ty = ctx.input_ty(insn, 0);
            assert_eq!(src_ty.bits(), 128);
            let src = put_input_in_reg(ctx, inputs[0]);
            let lane = if let InstructionData::BinaryImm8 { imm, .. } = ctx.data(insn) {
                *imm
            } else {
                unreachable!();
            };
            debug_assert!(lane < src_ty.lane_count() as u8);

            emit_extract_lane(ctx, src, dst, lane, ty);
        }

        Opcode::ScalarToVector => {
            // When moving a scalar value to a vector register, we must handle several
            // situations:
            // 1. a scalar float is already in an XMM register, so we simply move it
            // 2. a scalar of any other type resides in a GPR register: MOVD moves the bits
            //    to an XMM register and zeroes the upper bits
            // 3. a scalar (float or otherwise) that has previously been loaded from memory
            //    (e.g. the default lowering of Wasm's `load[32|64]_zero`) can be lowered to a
            //    single MOVSS/MOVSD instruction; to do this, we rely on `input_to_reg_mem` to
            //    sink the unused load.
            let src = input_to_reg_mem(ctx, inputs[0]);
            let src_ty = ctx.input_ty(insn, 0);
            let dst = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
            let dst_ty = ty.unwrap();
            assert!(src_ty == dst_ty.lane_type() && dst_ty.bits() == 128);
            match src {
                RegMem::Reg { reg } => {
                    if src_ty.is_float() {
                        // Case 1: when moving a scalar float, we simply move from one XMM
                        // register to another, expecting the register allocator to elide this.
                        // Here we assume that the upper bits of a scalar float have not been
                        // tampered with (the same assumption the old backend makes).
                        ctx.emit(Inst::gen_move(dst, reg, dst_ty));
                    } else {
                        // Case 2: when moving a scalar value of any other type, use MOVD to
                        // zero the upper lanes.
                        let src_size = match src_ty.bits() {
                            32 => OperandSize::Size32,
                            64 => OperandSize::Size64,
                            _ => unimplemented!("invalid source size for type: {}", src_ty),
                        };
                        ctx.emit(Inst::gpr_to_xmm(SseOpcode::Movd, src, src_size, dst));
                    }
                }
                RegMem::Mem { .. } => {
                    // Case 3: when presented with `load + scalar_to_vector`, coalesce into a
                    // single MOVSS/MOVSD instruction.
                    let opcode = match src_ty.bits() {
                        32 => SseOpcode::Movss,
                        64 => SseOpcode::Movsd,
                        _ => unimplemented!("unable to move scalar to vector for type: {}", src_ty),
                    };
                    ctx.emit(Inst::xmm_mov(opcode, src, dst));
                }
            }
        }

        Opcode::Splat => {
            let ty = ty.unwrap();
            assert_eq!(ty.bits(), 128);
            let src_ty = ctx.input_ty(insn, 0);
            assert!(src_ty.bits() < 128);

            let src = input_to_reg_mem(ctx, inputs[0]);
            let dst = get_output_reg(ctx, outputs[0]).only_reg().unwrap();

            // We know that splat will overwrite all of the lanes of `dst`, but it takes
            // several instructions to do so. Because of the multiple instructions, there is
            // no good way to declare `dst` a `def` except with the following
            // pseudo-instruction.
            ctx.emit(Inst::xmm_uninit_value(dst));

            // TODO: eventually many of these sequences could be optimized with AVX's
            // VBROADCAST* and VPBROADCAST*.
            match ty.lane_bits() {
                8 => {
                    emit_insert_lane(ctx, src, dst, 0, ty.lane_type());
                    // Initialize a register with all 0s.
                    let tmp = ctx.alloc_tmp(ty).only_reg().unwrap();
                    ctx.emit(Inst::xmm_rm_r(SseOpcode::Pxor, RegMem::from(tmp), tmp));
                    // Shuffle the lowest byte lane to all other lanes.
                    ctx.emit(Inst::xmm_rm_r(SseOpcode::Pshufb, RegMem::from(tmp), dst))
                }
                16 => {
                    emit_insert_lane(ctx, src.clone(), dst, 0, ty.lane_type());
                    emit_insert_lane(ctx, src, dst, 1, ty.lane_type());
                    // Shuffle the lowest two lanes to all other lanes.
                    ctx.emit(Inst::xmm_rm_r_imm(
                        SseOpcode::Pshufd,
                        RegMem::from(dst),
                        dst,
                        0,
                        OperandSize::Size32,
                    ))
                }
                32 => {
                    emit_insert_lane(ctx, src, dst, 0, ty.lane_type());
                    // Shuffle the lowest lane to all other lanes.
                    ctx.emit(Inst::xmm_rm_r_imm(
                        SseOpcode::Pshufd,
                        RegMem::from(dst),
                        dst,
                        0,
                        OperandSize::Size32,
                    ))
                }
                64 => {
                    emit_insert_lane(ctx, src.clone(), dst, 0, ty.lane_type());
                    emit_insert_lane(ctx, src, dst, 1, ty.lane_type());
                }
                _ => panic!("Invalid type to splat: {}", ty),
            }
        }

        Opcode::VanyTrue => {
            let dst = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
            let src_ty = ctx.input_ty(insn, 0);
            assert_eq!(src_ty.bits(), 128);
            let src = put_input_in_reg(ctx, inputs[0]);
            // Set the ZF if the result is all zeroes.
            ctx.emit(Inst::xmm_cmp_rm_r(SseOpcode::Ptest, RegMem::reg(src), src));
            // If the ZF is not set, place a 1 in `dst`.
            ctx.emit(Inst::setcc(CC::NZ, dst));
        }

        Opcode::VallTrue => {
            let dst = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
            let src_ty = ctx.input_ty(insn, 0);
            assert_eq!(src_ty.bits(), 128);
            let src = input_to_reg_mem(ctx, inputs[0]);

            let eq = |ty: Type| match ty.lane_bits() {
                8 => SseOpcode::Pcmpeqb,
                16 => SseOpcode::Pcmpeqw,
                32 => SseOpcode::Pcmpeqd,
                64 => SseOpcode::Pcmpeqq,
                _ => panic!("Unable to find an instruction for {} for type: {}", op, ty),
            };

            // Initialize a register with all 0s.
            let tmp = ctx.alloc_tmp(src_ty).only_reg().unwrap();
            ctx.emit(Inst::xmm_rm_r(SseOpcode::Pxor, RegMem::from(tmp), tmp));
            // Compare against zero: lanes of `tmp` become all 1s exactly where the
            // corresponding lane of `src` was zero.
            ctx.emit(Inst::xmm_rm_r(eq(src_ty), src, tmp));
            // Set the ZF if the result is all zeroes (i.e. no lane of `src` was zero).
            ctx.emit(Inst::xmm_cmp_rm_r(
                SseOpcode::Ptest,
                RegMem::from(tmp),
                tmp.to_reg(),
            ));
            // If the ZF is set, place a 1 in `dst`.
            ctx.emit(Inst::setcc(CC::Z, dst));
        }

        Opcode::VhighBits => {
            let src = put_input_in_reg(ctx, inputs[0]);
            let src_ty = ctx.input_ty(insn, 0);
            debug_assert!(src_ty.is_vector() && src_ty.bits() == 128);
            let dst = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
            debug_assert!(dst.to_reg().get_class() == RegClass::I64);

            // The Intel specification allows using both 32-bit and 64-bit GPRs as destination
            // for the "move mask" instructions: "In 64-bit mode, the instruction can access
            // additional registers when used with a REX.R prefix. The default operand size is
            // 64-bit in 64-bit mode" (PMOVMSKB in IA Software Development Manual, vol. 2).
            // This being the case, we will always clear REX.W since its use is unnecessary
            // (`OperandSize` is used for setting/clearing REX.W).
            let size = OperandSize::Size32;

            match src_ty {
                types::I8X16 | types::B8X16 => {
                    ctx.emit(Inst::xmm_to_gpr(SseOpcode::Pmovmskb, src, dst, size))
                }
                types::I32X4 | types::B32X4 | types::F32X4 => {
                    ctx.emit(Inst::xmm_to_gpr(SseOpcode::Movmskps, src, dst, size))
                }
                types::I64X2 | types::B64X2 | types::F64X2 => {
                    ctx.emit(Inst::xmm_to_gpr(SseOpcode::Movmskpd, src, dst, size))
                }
                types::I16X8 | types::B16X8 => {
                    // There is no x86 instruction for extracting the high bit of 16-bit lanes,
                    // so here we:
                    // - duplicate the 16-bit lanes of `src` into 8-bit lanes:
                    //     PACKSSWB([x1, x2, ...], [x1, x2, ...]) = [x1', x2', ..., x1', x2', ...]
                    // - use PMOVMSKB to gather the high bits; now we have duplicates, though
                    // - shift away the bottom 8 high bits to remove the duplicates.
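                    // (PACKSSWB's signed saturation maps negative words to negative bytes
                    // and non-negative words to non-negative bytes, so each packed byte's
                    // sign bit equals the sign bit of the originating 16-bit lane.)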
                    let tmp = ctx.alloc_tmp(src_ty).only_reg().unwrap();
                    ctx.emit(Inst::gen_move(tmp, src, src_ty));
                    ctx.emit(Inst::xmm_rm_r(SseOpcode::Packsswb, RegMem::reg(src), tmp));
                    ctx.emit(Inst::xmm_to_gpr(
                        SseOpcode::Pmovmskb,
                        tmp.to_reg(),
                        dst,
                        size,
                    ));
                    ctx.emit(Inst::shift_r(
                        OperandSize::Size64,
                        ShiftKind::ShiftRightLogical,
                        Some(8),
                        dst,
                    ));
                }
                _ => unimplemented!("unknown input type {} for {}", src_ty, op),
            }
        }

        Opcode::Iconcat => {
            let ty = ctx.output_ty(insn, 0);
            assert_eq!(
                ty,
                types::I128,
                "Iconcat not expected to be used for non-128-bit type"
            );
            assert_eq!(ctx.input_ty(insn, 0), types::I64);
            assert_eq!(ctx.input_ty(insn, 1), types::I64);
            let lo = put_input_in_reg(ctx, inputs[0]);
            let hi = put_input_in_reg(ctx, inputs[1]);
            let dst = get_output_reg(ctx, outputs[0]);
            ctx.emit(Inst::gen_move(dst.regs()[0], lo, types::I64));
            ctx.emit(Inst::gen_move(dst.regs()[1], hi, types::I64));
        }

        Opcode::Isplit => {
            let ty = ctx.input_ty(insn, 0);
            assert_eq!(
                ty,
                types::I128,
                "Isplit not expected to be used for non-128-bit type"
            );
            assert_eq!(ctx.output_ty(insn, 0), types::I64);
            assert_eq!(ctx.output_ty(insn, 1), types::I64);
            let src = put_input_in_regs(ctx, inputs[0]);
            let dst_lo = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
            let dst_hi = get_output_reg(ctx, outputs[1]).only_reg().unwrap();
            ctx.emit(Inst::gen_move(dst_lo, src.regs()[0], types::I64));
            ctx.emit(Inst::gen_move(dst_hi, src.regs()[1], types::I64));
        }

        Opcode::TlsValue => match flags.tls_model() {
            TlsModel::ElfGd => {
                let dst = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
                let (name, _, _) = ctx.symbol_value(insn).unwrap();
                let symbol = name.clone();
                ctx.emit(Inst::ElfTlsGetAddr { symbol });
                ctx.emit(Inst::gen_move(dst, regs::rax(), types::I64));
            }
            TlsModel::Macho => {
                let dst = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
                let (name, _, _) = ctx.symbol_value(insn).unwrap();
                let symbol = name.clone();
                ctx.emit(Inst::MachOTlsGetAddr { symbol });
                ctx.emit(Inst::gen_move(dst, regs::rax(), types::I64));
            }
            _ => {
                todo!(
                    "Unimplemented TLS model in x64 backend: {:?}",
                    flags.tls_model()
                );
            }
        },

        Opcode::IaddImm
        | Opcode::ImulImm
        | Opcode::UdivImm
        | Opcode::SdivImm
        | Opcode::UremImm
        | Opcode::SremImm
        | Opcode::IrsubImm
        | Opcode::IaddCin
        | Opcode::IaddIfcin
        | Opcode::IaddCout
        | Opcode::IaddCarry
        | Opcode::IaddIfcarry
        | Opcode::IsubBin
        | Opcode::IsubIfbin
        | Opcode::IsubBout
        | Opcode::IsubIfbout
        | Opcode::IsubBorrow
        | Opcode::IsubIfborrow
        | Opcode::BandImm
        | Opcode::BorImm
        | Opcode::BxorImm
        | Opcode::RotlImm
        | Opcode::RotrImm
        | Opcode::IshlImm
        | Opcode::UshrImm
        | Opcode::SshrImm => {
            panic!("ALU+imm and ALU+carry ops should not appear here!");
        }

        _ => unimplemented!("unimplemented lowering for opcode {:?}", op),
    }

    Ok(())
}

//=============================================================================
// Lowering-backend trait implementation.

impl LowerBackend for X64Backend {
    type MInst = Inst;

    fn lower<C: LowerCtx<I = Inst>>(&self, ctx: &mut C, ir_inst: IRInst) -> CodegenResult<()> {
        lower_insn_to_regs(ctx, ir_inst, &self.flags, &self.x64_flags, &self.triple)
    }

    fn lower_branch_group<C: LowerCtx<I = Inst>>(
        &self,
        ctx: &mut C,
        branches: &[IRInst],
        targets: &[MachLabel],
    ) -> CodegenResult<()> {
        // A block should end with at most two branches. The first may be a
        // conditional branch; a conditional branch can be followed only by an
        // unconditional branch or fallthrough. Otherwise, if there is only one
        // branch, it may be an unconditional branch, a fallthrough, a return,
        // or a trap. These conditions are verified by `is_ebb_basic()` during
        // the verifier pass.
        assert!(branches.len() <= 2);

        if branches.len() == 2 {
            // Must be a conditional branch followed by an unconditional branch.
            let op0 = ctx.data(branches[0]).opcode();
            let op1 = ctx.data(branches[1]).opcode();

            trace!(
                "lowering two-branch group: opcodes are {:?} and {:?}",
                op0,
                op1
            );
            assert!(op1 == Opcode::Jump || op1 == Opcode::Fallthrough);

            let taken = targets[0];
            // The not_taken target is the target of the second branch, even if it is a
            // Fallthrough instruction: because we reorder blocks while we lower, the
            // fallthrough in the new order is not (necessarily) the same as the fallthrough
            // in CLIF. So we use the explicitly-provided target.
            let not_taken = targets[1];

            match op0 {
                Opcode::Brz | Opcode::Brnz => {
                    let flag_input = InsnInput {
                        insn: branches[0],
                        input: 0,
                    };

                    let src_ty = ctx.input_ty(branches[0], 0);

                    if let Some(icmp) = matches_input(ctx, flag_input, Opcode::Icmp) {
                        let cond_code = ctx.data(icmp).cond_code().unwrap();
                        let cond_code = emit_cmp(ctx, icmp, cond_code);

                        let cond_code = if op0 == Opcode::Brz {
                            cond_code.inverse()
                        } else {
                            cond_code
                        };

                        let cc = CC::from_intcc(cond_code);
                        ctx.emit(Inst::jmp_cond(cc, taken, not_taken));
                    } else if let Some(fcmp) = matches_input(ctx, flag_input, Opcode::Fcmp) {
                        let cond_code = ctx.data(fcmp).fp_cond_code().unwrap();
                        let cond_code = if op0 == Opcode::Brz {
                            cond_code.inverse()
                        } else {
                            cond_code
                        };
                        match emit_fcmp(ctx, fcmp, cond_code, FcmpSpec::Normal) {
                            FcmpCondResult::Condition(cc) => {
                                ctx.emit(Inst::jmp_cond(cc, taken, not_taken));
                            }
                            FcmpCondResult::AndConditions(cc1, cc2) => {
                                ctx.emit(Inst::jmp_if(cc1.invert(), not_taken));
                                ctx.emit(Inst::jmp_cond(cc2.invert(), not_taken, taken));
                            }
                            FcmpCondResult::OrConditions(cc1, cc2) => {
                                ctx.emit(Inst::jmp_if(cc1, taken));
                                ctx.emit(Inst::jmp_cond(cc2, taken, not_taken));
                            }
                            FcmpCondResult::InvertedEqualOrConditions(_, _) => unreachable!(),
                        }
                    } else if src_ty == types::I128 {
                        let src = put_input_in_regs(
                            ctx,
                            InsnInput {
                                insn: branches[0],
                                input: 0,
                            },
                        );
                        let (half_cc, comb_op) = match op0 {
                            Opcode::Brz => (CC::Z, AluRmiROpcode::And8),
                            Opcode::Brnz => (CC::NZ, AluRmiROpcode::Or8),
                            _ => unreachable!(),
                        };
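                        // For Brz ("branch if all 128 bits are zero") both halves must be
                        // zero, so AND together the per-half Z results; for Brnz ("branch
                        // if any bit is nonzero") OR together the per-half NZ results.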
                        let tmp1 = ctx.alloc_tmp(types::I64).only_reg().unwrap();
                        let tmp2 = ctx.alloc_tmp(types::I64).only_reg().unwrap();
                        ctx.emit(Inst::cmp_rmi_r(
                            OperandSize::Size64,
                            RegMemImm::imm(0),
                            src.regs()[0],
                        ));
                        ctx.emit(Inst::setcc(half_cc, tmp1));
                        ctx.emit(Inst::cmp_rmi_r(
                            OperandSize::Size64,
                            RegMemImm::imm(0),
                            src.regs()[1],
                        ));
                        ctx.emit(Inst::setcc(half_cc, tmp2));
                        ctx.emit(Inst::alu_rmi_r(
                            OperandSize::Size32,
                            comb_op,
                            RegMemImm::reg(tmp1.to_reg()),
                            tmp2,
                        ));
                        ctx.emit(Inst::jmp_cond(CC::NZ, taken, not_taken));
                    } else if is_int_or_ref_ty(src_ty) || is_bool_ty(src_ty) {
                        let src = put_input_in_reg(
                            ctx,
                            InsnInput {
                                insn: branches[0],
                                input: 0,
                            },
                        );
                        let cc = match op0 {
                            Opcode::Brz => CC::Z,
                            Opcode::Brnz => CC::NZ,
                            _ => unreachable!(),
                        };
                        // See the case for `Opcode::Select` above re: testing the
                        // boolean input.
                        let test_input = if src_ty == types::B1 {
                            // test src, 1
                            RegMemImm::imm(1)
                        } else {
                            assert!(!is_bool_ty(src_ty));
                            // test src, src
                            RegMemImm::reg(src)
                        };

                        ctx.emit(Inst::test_rmi_r(
                            OperandSize::from_ty(src_ty),
                            test_input,
                            src,
                        ));
                        ctx.emit(Inst::jmp_cond(cc, taken, not_taken));
                    } else {
                        unimplemented!("brz/brnz with non-int type {:?}", src_ty);
                    }
                }

                Opcode::BrIcmp => {
                    let src_ty = ctx.input_ty(branches[0], 0);
                    if is_int_or_ref_ty(src_ty) || is_bool_ty(src_ty) {
                        let lhs = put_input_in_reg(
                            ctx,
                            InsnInput {
                                insn: branches[0],
                                input: 0,
                            },
                        );
                        let rhs = input_to_reg_mem_imm(
                            ctx,
                            InsnInput {
                                insn: branches[0],
                                input: 1,
                            },
                        );
                        let cc = CC::from_intcc(ctx.data(branches[0]).cond_code().unwrap());
                        // Cranelift's icmp semantics want to compare lhs - rhs, while Intel
                        // gives us dst - src at the machine instruction level, so invert
                        // operands.
                        ctx.emit(Inst::cmp_rmi_r(OperandSize::from_ty(src_ty), rhs, lhs));
                        ctx.emit(Inst::jmp_cond(cc, taken, not_taken));
                    } else {
                        unimplemented!("bricmp with non-int type {:?}", src_ty);
                    }
                }

                Opcode::Brif => {
                    let flag_input = InsnInput {
                        insn: branches[0],
                        input: 0,
                    };

                    if let Some(ifcmp) = matches_input(ctx, flag_input, Opcode::Ifcmp) {
                        let cond_code = ctx.data(branches[0]).cond_code().unwrap();
                        let cond_code = emit_cmp(ctx, ifcmp, cond_code);
                        let cc = CC::from_intcc(cond_code);
                        ctx.emit(Inst::jmp_cond(cc, taken, not_taken));
                    } else if let Some(ifcmp_sp) = matches_input(ctx, flag_input, Opcode::IfcmpSp)
                    {
                        let operand = put_input_in_reg(
                            ctx,
                            InsnInput {
                                insn: ifcmp_sp,
                                input: 0,
                            },
                        );
                        let ty = ctx.input_ty(ifcmp_sp, 0);
                        ctx.emit(Inst::cmp_rmi_r(
                            OperandSize::from_ty(ty),
                            RegMemImm::reg(regs::rsp()),
                            operand,
                        ));
                        let cond_code = ctx.data(branches[0]).cond_code().unwrap();
                        let cc = CC::from_intcc(cond_code);
                        ctx.emit(Inst::jmp_cond(cc, taken, not_taken));
                    } else {
                        // Should be disallowed by flags checks in verifier.
                        unimplemented!("Brif with non-ifcmp input");
                    }
                }

                Opcode::Brff => {
                    let flag_input = InsnInput {
                        insn: branches[0],
                        input: 0,
                    };

                    if let Some(ffcmp) = matches_input(ctx, flag_input, Opcode::Ffcmp) {
                        let cond_code = ctx.data(branches[0]).fp_cond_code().unwrap();
                        match emit_fcmp(ctx, ffcmp, cond_code, FcmpSpec::Normal) {
                            FcmpCondResult::Condition(cc) => {
                                ctx.emit(Inst::jmp_cond(cc, taken, not_taken));
                            }
                            FcmpCondResult::AndConditions(cc1, cc2) => {
                                ctx.emit(Inst::jmp_if(cc1.invert(), not_taken));
                                ctx.emit(Inst::jmp_cond(cc2.invert(), not_taken, taken));
                            }
                            FcmpCondResult::OrConditions(cc1, cc2) => {
                                ctx.emit(Inst::jmp_if(cc1, taken));
                                ctx.emit(Inst::jmp_cond(cc2, taken, not_taken));
                            }
                            FcmpCondResult::InvertedEqualOrConditions(_, _) => unreachable!(),
                        }
                    } else {
                        // Should be disallowed by flags checks in verifier.
                        unimplemented!("Brff with input not from ffcmp");
                    }
                }

                _ => panic!("unexpected branch opcode: {:?}", op0),
            }
        } else {
            assert_eq!(branches.len(), 1);

            // Must be an unconditional branch or trap.
            let op = ctx.data(branches[0]).opcode();
            match op {
                Opcode::Jump | Opcode::Fallthrough => {
                    ctx.emit(Inst::jmp_known(targets[0]));
                }

                Opcode::BrTable => {
                    let jt_size = targets.len() - 1;
                    assert!(jt_size <= u32::max_value() as usize);
                    let jt_size = jt_size as u32;

                    let idx = extend_input_to_reg(
                        ctx,
                        InsnInput {
                            insn: branches[0],
                            input: 0,
                        },
                        ExtSpec::ZeroExtendTo32,
                    );

                    // Bounds-check (compute flags from idx - jt_size) and branch to default.
                    ctx.emit(Inst::cmp_rmi_r(
                        OperandSize::Size32,
                        RegMemImm::imm(jt_size),
                        idx,
                    ));

                    // Emit the compound instruction that does:
                    //
                    //   lea $jt, %rA
                    //   movsbl [%rA, %rIndex, 2], %rB
                    //   add %rB, %rA
                    //   j *%rA
                    //   [jt entries]
                    //
                    // This must be *one* instruction in the vcode because we cannot allow
                    // regalloc to insert any spills/fills in the middle of the sequence;
                    // otherwise, the lea PC-rel offset to the jumptable would be incorrect.
                    // (The alternative is to introduce a relocation pass for inlined
                    // jumptables, which is much worse.)

                    // This temporary is used as a signed integer of 64 bits (to hold
                    // addresses).
                    let tmp1 = ctx.alloc_tmp(types::I64).only_reg().unwrap();
                    // This temporary is used as a signed integer of 32 bits (for the
                    // wasm-table index) and then 64 bits (address addend). The small lie
                    // about the I64 type is benign, since the temporary is dead after this
                    // instruction (and its Cranelift type is thus unused).
                    let tmp2 = ctx.alloc_tmp(types::I64).only_reg().unwrap();

                    let targets_for_term: Vec<MachLabel> = targets.to_vec();
                    let default_target = targets[0];

                    let jt_targets: Vec<MachLabel> = targets.iter().skip(1).cloned().collect();

                    ctx.emit(Inst::JmpTableSeq {
                        idx,
                        tmp1,
                        tmp2,
                        default_target,
                        targets: jt_targets,
                        targets_for_term,
                    });
                }

                _ => panic!("Unknown branch type {:?}", op),
            }
        }

        Ok(())
    }

    fn maybe_pinned_reg(&self) -> Option<Reg> {
        Some(regs::pinned_reg())
    }
}