//! Lower a single Cranelift instruction into vcode.

use crate::binemit::CodeOffset;
use crate::ir::condcodes::FloatCC;
use crate::ir::types::*;
use crate::ir::Inst as IRInst;
use crate::ir::{InstructionData, Opcode, TrapCode};
use crate::isa::aarch64::settings as aarch64_settings;
use crate::machinst::lower::*;
use crate::machinst::*;
use crate::settings::Flags;
use crate::{CodegenError, CodegenResult};

use crate::isa::aarch64::abi::*;
use crate::isa::aarch64::inst::*;

use regalloc::Writable;

use alloc::boxed::Box;
use alloc::vec::Vec;
use core::convert::TryFrom;

use super::lower::*;

/// Actually codegen an instruction's results into registers.
pub(crate) fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
    ctx: &mut C,
    insn: IRInst,
    flags: &Flags,
    isa_flags: &aarch64_settings::Flags,
) -> CodegenResult<()> {
    let op = ctx.data(insn).opcode();
    let inputs = insn_inputs(ctx, insn);
    let outputs = insn_outputs(ctx, insn);
    let ty = if outputs.len() > 0 {
        Some(ctx.output_ty(insn, 0))
    } else {
        None
    };

    match op {
        Opcode::Iconst | Opcode::Bconst | Opcode::Null => {
            let value = ctx.get_constant(insn).unwrap();
            // Sign extend constant if necessary
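            // A sketch of the arithmetic below: an I8 constant 0x80 becomes
            // ((0x80u64 as i64) << 56) >> 56 == -128, i.e.
            // 0xffff_ffff_ffff_ff80 as a u64, so the materialized 64-bit
            // value carries the narrow constant's sign.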
            let value = match ty.unwrap() {
                I8 => (((value as i64) << 56) >> 56) as u64,
                I16 => (((value as i64) << 48) >> 48) as u64,
                I32 => (((value as i64) << 32) >> 32) as u64,
                I64 | R64 => value,
                ty if ty.is_bool() => value,
                ty => unreachable!("Unknown type for const: {}", ty),
            };
            let rd = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
            lower_constant_u64(ctx, rd, value);
        }
        Opcode::F32const => {
            let value = f32::from_bits(ctx.get_constant(insn).unwrap() as u32);
            let rd = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
            lower_constant_f32(ctx, rd, value);
        }
        Opcode::F64const => {
            let value = f64::from_bits(ctx.get_constant(insn).unwrap());
            let rd = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
            lower_constant_f64(ctx, rd, value);
        }
        Opcode::Iadd => {
            let rd = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
            let ty = ty.unwrap();
            if !ty.is_vector() {
                let mul_insn =
                    if let Some(mul_insn) = maybe_input_insn(ctx, inputs[1], Opcode::Imul) {
                        Some((mul_insn, 0))
                    } else if let Some(mul_insn) = maybe_input_insn(ctx, inputs[0], Opcode::Imul) {
                        Some((mul_insn, 1))
                    } else {
                        None
                    };
                // If possible combine mul + add into madd.
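                // For example, `v2 = imul v0, v1; v3 = iadd v2, v4` becomes a
                // single `madd rd, rn, rm, ra`, which computes rd = ra + rn * rm.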
                if let Some((insn, addend_idx)) = mul_insn {
                    let alu_op = choose_32_64(ty, ALUOp3::MAdd32, ALUOp3::MAdd64);
                    let rn_input = InsnInput { insn, input: 0 };
                    let rm_input = InsnInput { insn, input: 1 };

                    let rn = put_input_in_reg(ctx, rn_input, NarrowValueMode::None);
                    let rm = put_input_in_reg(ctx, rm_input, NarrowValueMode::None);
                    let ra = put_input_in_reg(ctx, inputs[addend_idx], NarrowValueMode::None);

                    ctx.emit(Inst::AluRRRR {
                        alu_op,
                        rd,
                        rn,
                        rm,
                        ra,
                    });
                } else {
                    let rn = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None);
                    let (rm, negated) = put_input_in_rse_imm12_maybe_negated(
                        ctx,
                        inputs[1],
                        ty_bits(ty),
                        NarrowValueMode::None,
                    );
                    let alu_op = if !negated {
                        choose_32_64(ty, ALUOp::Add32, ALUOp::Add64)
                    } else {
                        choose_32_64(ty, ALUOp::Sub32, ALUOp::Sub64)
                    };
                    ctx.emit(alu_inst_imm12(alu_op, rd, rn, rm));
                }
            } else {
                let rm = put_input_in_reg(ctx, inputs[1], NarrowValueMode::None);
                let rn = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None);
                ctx.emit(Inst::VecRRR {
                    rd,
                    rn,
                    rm,
                    alu_op: VecALUOp::Add,
                    size: VectorSize::from_ty(ty),
                });
            }
        }
        Opcode::Isub => {
            let rd = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
            let rn = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None);
            let ty = ty.unwrap();
            if !ty.is_vector() {
                let (rm, negated) = put_input_in_rse_imm12_maybe_negated(
                    ctx,
                    inputs[1],
                    ty_bits(ty),
                    NarrowValueMode::None,
                );
                let alu_op = if !negated {
                    choose_32_64(ty, ALUOp::Sub32, ALUOp::Sub64)
                } else {
                    choose_32_64(ty, ALUOp::Add32, ALUOp::Add64)
                };
                ctx.emit(alu_inst_imm12(alu_op, rd, rn, rm));
            } else {
                let rm = put_input_in_reg(ctx, inputs[1], NarrowValueMode::None);
                ctx.emit(Inst::VecRRR {
                    rd,
                    rn,
                    rm,
                    alu_op: VecALUOp::Sub,
                    size: VectorSize::from_ty(ty),
                });
            }
        }
        Opcode::UaddSat | Opcode::SaddSat | Opcode::UsubSat | Opcode::SsubSat => {
            let ty = ty.unwrap();
            assert!(ty.is_vector());
            let rd = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
            let rn = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None);
            let rm = put_input_in_reg(ctx, inputs[1], NarrowValueMode::None);

            let alu_op = match op {
                Opcode::UaddSat => VecALUOp::Uqadd,
                Opcode::SaddSat => VecALUOp::Sqadd,
                Opcode::UsubSat => VecALUOp::Uqsub,
                Opcode::SsubSat => VecALUOp::Sqsub,
                _ => unreachable!(),
            };

            ctx.emit(Inst::VecRRR {
                rd,
                rn,
                rm,
                alu_op,
                size: VectorSize::from_ty(ty),
            });
        }

        Opcode::Ineg => {
            let rd = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
            let ty = ty.unwrap();
            if !ty.is_vector() {
                let rn = zero_reg();
                let rm = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None);
                let alu_op = choose_32_64(ty, ALUOp::Sub32, ALUOp::Sub64);
                ctx.emit(Inst::AluRRR { alu_op, rd, rn, rm });
            } else {
                let rn = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None);
                ctx.emit(Inst::VecMisc {
                    op: VecMisc2::Neg,
                    rd,
                    rn,
                    size: VectorSize::from_ty(ty),
                });
            }
        }

        Opcode::Imul => {
            let rd = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
            let rn = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None);
            let rm = put_input_in_reg(ctx, inputs[1], NarrowValueMode::None);
            let ty = ty.unwrap();
            if !ty.is_vector() {
                let alu_op = choose_32_64(ty, ALUOp3::MAdd32, ALUOp3::MAdd64);
                ctx.emit(Inst::AluRRRR {
                    alu_op,
                    rd,
                    rn,
                    rm,
                    ra: zero_reg(),
                });
            } else {
                if ty == I64X2 {
                    let tmp1 = ctx.alloc_tmp(I64X2).only_reg().unwrap();
                    let tmp2 = ctx.alloc_tmp(I64X2).only_reg().unwrap();

                    // This I64X2 multiplication is performed with several 32-bit
                    // operations.

                    // 64-bit numbers x and y can be represented as:
                    //   x = a + 2^32(b)
                    //   y = c + 2^32(d)

                    // A 64-bit multiplication is:
                    //   x * y = ac + 2^32(ad + bc) + 2^64(bd)
                    // Note: the 2^64(bd) term can be ignored; the value is too
                    // large to fit in 64 bits.

                    // This sequence implements an I64X2 multiply, where the
                    // registers `rn` and `rm` are split up into 32-bit components:
                    //   rn = |d|c|b|a|
                    //   rm = |h|g|f|e|
                    //
                    //   rn * rm = |cg + 2^32(ch + dg)|ae + 2^32(af + be)|
                    //
                    // The sequence is:
                    //   rev64 rd.4s, rm.4s
                    //   mul rd.4s, rd.4s, rn.4s
                    //   xtn tmp1.2s, rn.2d
                    //   addp rd.4s, rd.4s, rd.4s
                    //   xtn tmp2.2s, rm.2d
                    //   shll rd.2d, rd.2s, #32
                    //   umlal rd.2d, tmp2.2s, tmp1.2s
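                    // A quick numeric check of the decomposition (illustration
                    // only, not emitted code): with x = 2^32 + 3 and
                    // y = 2^32 + 5, we get ac = 15 and ad + bc = 3 + 5 = 8, so
                    // x * y mod 2^64 = 15 + 2^32(8), matching
                    // (2^32 + 3)(2^32 + 5) = 2^64 + 8(2^32) + 15 with the 2^64
                    // term discarded.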

                    // Reverse the 32-bit elements in the 64-bit words.
                    //   rd = |g|h|e|f|
                    ctx.emit(Inst::VecMisc {
                        op: VecMisc2::Rev64,
                        rd,
                        rn: rm,
                        size: VectorSize::Size32x4,
                    });

                    // Calculate the high half components.
                    //   rd = |dg|ch|be|af|
                    //
                    // Note that this 32-bit multiply of the high half
                    // discards the bits that would overflow, same as
                    // if 64-bit operations were used. Also the Shll
                    // below would shift out the overflow bits anyway.
                    ctx.emit(Inst::VecRRR {
                        alu_op: VecALUOp::Mul,
                        rd,
                        rn: rd.to_reg(),
                        rm: rn,
                        size: VectorSize::Size32x4,
                    });

                    // Extract the low half components of rn.
                    //   tmp1 = |c|a|
                    ctx.emit(Inst::VecMiscNarrow {
                        op: VecMiscNarrowOp::Xtn,
                        rd: tmp1,
                        rn,
                        size: VectorSize::Size32x2,
                        high_half: false,
                    });

                    // Sum the respective high half components.
                    //   rd = |dg+ch|be+af||dg+ch|be+af|
                    ctx.emit(Inst::VecRRR {
                        alu_op: VecALUOp::Addp,
                        rd: rd,
                        rn: rd.to_reg(),
                        rm: rd.to_reg(),
                        size: VectorSize::Size32x4,
                    });

                    // Extract the low half components of rm.
                    //   tmp2 = |g|e|
                    ctx.emit(Inst::VecMiscNarrow {
                        op: VecMiscNarrowOp::Xtn,
                        rd: tmp2,
                        rn: rm,
                        size: VectorSize::Size32x2,
                        high_half: false,
                    });

                    // Shift the high half components into the high half.
                    //   rd = |dg+ch << 32|be+af << 32|
                    ctx.emit(Inst::VecMisc {
                        op: VecMisc2::Shll,
                        rd,
                        rn: rd.to_reg(),
                        size: VectorSize::Size32x2,
                    });

                    // Multiply the low components together, and accumulate with the high
                    // half.
                    //   rd = |rd[1] + cg|rd[0] + ae|
                    ctx.emit(Inst::VecRRR {
                        alu_op: VecALUOp::Umlal,
                        rd,
                        rn: tmp2.to_reg(),
                        rm: tmp1.to_reg(),
                        size: VectorSize::Size32x2,
                    });
                } else {
                    ctx.emit(Inst::VecRRR {
                        alu_op: VecALUOp::Mul,
                        rd,
                        rn,
                        rm,
                        size: VectorSize::from_ty(ty),
                    });
                }
            }
        }

        Opcode::Umulhi | Opcode::Smulhi => {
            let rd = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
            let is_signed = op == Opcode::Smulhi;
            let input_ty = ctx.input_ty(insn, 0);
            assert!(ctx.input_ty(insn, 1) == input_ty);
            assert!(ctx.output_ty(insn, 0) == input_ty);

            match input_ty {
                I64 => {
                    let rn = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None);
                    let rm = put_input_in_reg(ctx, inputs[1], NarrowValueMode::None);
                    let alu_op = if is_signed {
                        ALUOp::SMulH
                    } else {
                        ALUOp::UMulH
                    };
                    ctx.emit(Inst::AluRRR { alu_op, rd, rn, rm });
                }
                I32 | I16 | I8 => {
                    let narrow_mode = if is_signed {
                        NarrowValueMode::SignExtend64
                    } else {
                        NarrowValueMode::ZeroExtend64
                    };
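                    // A sketch of the narrow case: for a 16-bit smulhi of -2
                    // and 3, both inputs are sign-extended to 64 bits, the madd
                    // computes the exact product -6, and the arithmetic shift
                    // right by 16 leaves the product's high 16 bits (0xffff,
                    // i.e. -1) in the low bits of rd.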
                    let rn = put_input_in_reg(ctx, inputs[0], narrow_mode);
                    let rm = put_input_in_reg(ctx, inputs[1], narrow_mode);
                    let ra = zero_reg();
                    ctx.emit(Inst::AluRRRR {
                        alu_op: ALUOp3::MAdd64,
                        rd,
                        rn,
                        rm,
                        ra,
                    });
                    let shift_op = if is_signed {
                        ALUOp::Asr64
                    } else {
                        ALUOp::Lsr64
                    };
                    let shift_amt = match input_ty {
                        I32 => 32,
                        I16 => 16,
                        I8 => 8,
                        _ => unreachable!(),
                    };
                    ctx.emit(Inst::AluRRImmShift {
                        alu_op: shift_op,
                        rd,
                        rn: rd.to_reg(),
                        immshift: ImmShift::maybe_from_u64(shift_amt).unwrap(),
                    });
                }
                _ => {
                    panic!("Unsupported argument type for umulhi/smulhi: {}", input_ty);
                }
            }
        }

        Opcode::Udiv | Opcode::Sdiv | Opcode::Urem | Opcode::Srem => {
            let is_signed = match op {
                Opcode::Udiv | Opcode::Urem => false,
                Opcode::Sdiv | Opcode::Srem => true,
                _ => unreachable!(),
            };
            let is_rem = match op {
                Opcode::Udiv | Opcode::Sdiv => false,
                Opcode::Urem | Opcode::Srem => true,
                _ => unreachable!(),
            };
            let narrow_mode = if is_signed {
                NarrowValueMode::SignExtend64
            } else {
                NarrowValueMode::ZeroExtend64
            };
            // TODO: Add SDiv32 to implement 32-bit directly, rather
            // than extending the input.
            let div_op = if is_signed {
                ALUOp::SDiv64
            } else {
                ALUOp::UDiv64
            };

            let rd = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
            let rn = put_input_in_reg(ctx, inputs[0], narrow_mode);
            let rm = put_input_in_reg(ctx, inputs[1], narrow_mode);
            // The div instruction does not trap on divide by zero or signed
            // overflow, so checks are inserted below.
            //
            //   div rd, rn, rm
            ctx.emit(Inst::AluRRR {
                alu_op: div_op,
                rd,
                rn,
                rm,
            });

            if is_rem {
                // Remainder (rn % rm) is implemented as:
                //
                //   tmp = rn / rm
                //   rd = rn - (tmp*rm)
                //
                // Using 'rd' for tmp, this becomes:
                //
                //   div rd, rn, rm       ; rd = rn / rm
                //   cbnz rm, #8          ; branch over trap
                //   udf                  ; divide by zero
                //   msub rd, rd, rm, rn  ; rd = rn - rd * rm
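                // For example, with rn = 7 and rm = 3, the div leaves 2 in rd
                // and the msub computes 7 - 2 * 3 = 1, the remainder.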

                // Check for divide by 0.
                let trap_code = TrapCode::IntegerDivisionByZero;
                ctx.emit(Inst::TrapIf {
                    trap_code,
                    kind: CondBrKind::Zero(rm),
                });

                ctx.emit(Inst::AluRRRR {
                    alu_op: ALUOp3::MSub64,
                    rd: rd,
                    rn: rd.to_reg(),
                    rm: rm,
                    ra: rn,
                });
            } else {
                if div_op == ALUOp::SDiv64 {
                    //   cbnz rm, #8
                    //   udf ; divide by zero
                    //   cmn rm, 1
                    //   ccmp rn, 1, #nzcv, eq
                    //   b.vc #8
                    //   udf ; signed overflow

                    // Check for divide by 0.
                    let trap_code = TrapCode::IntegerDivisionByZero;
                    ctx.emit(Inst::TrapIf {
                        trap_code,
                        kind: CondBrKind::Zero(rm),
                    });

                    // Check for signed overflow. The only case is min_value / -1.
                    let ty = ty.unwrap();
                    // The following checks must be done in 32-bit or 64-bit, depending
                    // on the input type, even though the initial div instruction is
                    // currently always done in 64-bit.
                    let size = OperandSize::from_ty(ty);
                    // Check whether the RHS is -1.
                    ctx.emit(Inst::AluRRImm12 {
                        alu_op: choose_32_64(ty, ALUOp::AddS32, ALUOp::AddS64),
                        rd: writable_zero_reg(),
                        rn: rm,
                        imm12: Imm12::maybe_from_u64(1).unwrap(),
                    });
                    // Check whether the LHS is min_value, by subtracting 1 and
                    // trapping if there is overflow.
                    ctx.emit(Inst::CCmpImm {
                        size,
                        rn,
                        imm: UImm5::maybe_from_u8(1).unwrap(),
                        nzcv: NZCV::new(false, false, false, false),
                        cond: Cond::Eq,
                    });
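                    // How the flag dance works: the cmn above sets Z only when
                    // rm == -1. If Z is set, the ccmp performs cmp rn, #1,
                    // which sets V exactly when rn is the minimum signed value;
                    // otherwise it loads the flags with #nzcv = 0000, leaving V
                    // clear. The conditional trap below therefore fires only
                    // for min_value / -1.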
                    let trap_code = TrapCode::IntegerOverflow;
                    ctx.emit(Inst::TrapIf {
                        trap_code,
                        kind: CondBrKind::Cond(Cond::Vs),
                    });
                } else {
                    //   cbnz rm, #8
                    //   udf ; divide by zero

                    // Check for divide by 0.
                    let trap_code = TrapCode::IntegerDivisionByZero;
                    ctx.emit(Inst::TrapIf {
                        trap_code,
                        kind: CondBrKind::Zero(rm),
                    });
                }
            }
        }

        Opcode::Uextend | Opcode::Sextend => {
            let output_ty = ty.unwrap();
            let input_ty = ctx.input_ty(insn, 0);
            let from_bits = ty_bits(input_ty) as u8;
            let to_bits = ty_bits(output_ty) as u8;
            let to_bits = std::cmp::max(32, to_bits);
            assert!(from_bits <= to_bits);
            if from_bits < to_bits {
                let signed = op == Opcode::Sextend;
                let rd = get_output_reg(ctx, outputs[0]).only_reg().unwrap();

                if let Some(extract_insn) = maybe_input_insn(ctx, inputs[0], Opcode::Extractlane) {
                    let idx =
                        if let InstructionData::BinaryImm8 { imm, .. } = ctx.data(extract_insn) {
                            *imm
                        } else {
                            unreachable!();
                        };
                    let input = InsnInput {
                        insn: extract_insn,
                        input: 0,
                    };
                    let rn = put_input_in_reg(ctx, input, NarrowValueMode::None);
                    let size = VectorSize::from_ty(ctx.input_ty(extract_insn, 0));

                    if signed {
                        let scalar_size = OperandSize::from_ty(output_ty);

                        ctx.emit(Inst::MovFromVecSigned {
                            rd,
                            rn,
                            idx,
                            size,
                            scalar_size,
                        });
                    } else {
                        ctx.emit(Inst::MovFromVec { rd, rn, idx, size });
                    }
                } else {
                    // If we reach this point, we weren't able to incorporate the extend as
                    // a register-mode on another instruction, so we have a 'None'
                    // narrow-value/extend mode here, and we emit the explicit instruction.
                    let rn = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None);
                    ctx.emit(Inst::Extend {
                        rd,
                        rn,
                        signed,
                        from_bits,
                        to_bits,
                    });
                }
            }
        }

        Opcode::Bnot => {
            let rd = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
            let ty = ty.unwrap();
            if !ty.is_vector() {
                let rm = put_input_in_rs_immlogic(ctx, inputs[0], NarrowValueMode::None);
                let alu_op = choose_32_64(ty, ALUOp::OrrNot32, ALUOp::OrrNot64);
                // NOT rd, rm ==> ORR_NOT rd, zero, rm
                ctx.emit(alu_inst_immlogic(alu_op, rd, zero_reg(), rm));
            } else {
                let rm = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None);
                ctx.emit(Inst::VecMisc {
                    op: VecMisc2::Not,
                    rd,
                    rn: rm,
                    size: VectorSize::from_ty(ty),
                });
            }
        }

        Opcode::Band
        | Opcode::Bor
        | Opcode::Bxor
        | Opcode::BandNot
        | Opcode::BorNot
        | Opcode::BxorNot => {
            let rd = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
            let ty = ty.unwrap();
            if !ty.is_vector() {
                let rn = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None);
                let rm = put_input_in_rs_immlogic(ctx, inputs[1], NarrowValueMode::None);
                let alu_op = match op {
                    Opcode::Band => choose_32_64(ty, ALUOp::And32, ALUOp::And64),
                    Opcode::Bor => choose_32_64(ty, ALUOp::Orr32, ALUOp::Orr64),
                    Opcode::Bxor => choose_32_64(ty, ALUOp::Eor32, ALUOp::Eor64),
                    Opcode::BandNot => choose_32_64(ty, ALUOp::AndNot32, ALUOp::AndNot64),
                    Opcode::BorNot => choose_32_64(ty, ALUOp::OrrNot32, ALUOp::OrrNot64),
                    Opcode::BxorNot => choose_32_64(ty, ALUOp::EorNot32, ALUOp::EorNot64),
                    _ => unreachable!(),
                };
                ctx.emit(alu_inst_immlogic(alu_op, rd, rn, rm));
            } else {
                let alu_op = match op {
                    Opcode::Band => VecALUOp::And,
                    Opcode::BandNot => VecALUOp::Bic,
                    Opcode::Bor => VecALUOp::Orr,
                    Opcode::Bxor => VecALUOp::Eor,
                    _ => unreachable!(),
                };

                let rn = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None);
                let rm = put_input_in_reg(ctx, inputs[1], NarrowValueMode::None);
                let rd = get_output_reg(ctx, outputs[0]).only_reg().unwrap();

                ctx.emit(Inst::VecRRR {
                    alu_op,
                    rd,
                    rn,
                    rm,
                    size: VectorSize::from_ty(ty),
                });
            }
        }

        Opcode::Ishl | Opcode::Ushr | Opcode::Sshr => {
            let ty = ty.unwrap();
            let rd = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
            if !ty.is_vector() {
                let size = OperandSize::from_bits(ty_bits(ty));
                let narrow_mode = match (op, size) {
                    (Opcode::Ishl, _) => NarrowValueMode::None,
                    (Opcode::Ushr, OperandSize::Size64) => NarrowValueMode::ZeroExtend64,
                    (Opcode::Ushr, OperandSize::Size32) => NarrowValueMode::ZeroExtend32,
                    (Opcode::Sshr, OperandSize::Size64) => NarrowValueMode::SignExtend64,
                    (Opcode::Sshr, OperandSize::Size32) => NarrowValueMode::SignExtend32,
                    _ => unreachable!(),
                };
                let rn = put_input_in_reg(ctx, inputs[0], narrow_mode);
                let rm = put_input_in_reg_immshift(ctx, inputs[1], ty_bits(ty));
                let alu_op = match op {
                    Opcode::Ishl => choose_32_64(ty, ALUOp::Lsl32, ALUOp::Lsl64),
                    Opcode::Ushr => choose_32_64(ty, ALUOp::Lsr32, ALUOp::Lsr64),
                    Opcode::Sshr => choose_32_64(ty, ALUOp::Asr32, ALUOp::Asr64),
                    _ => unreachable!(),
                };
                ctx.emit(alu_inst_immshift(alu_op, rd, rn, rm));
            } else {
                let rn = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None);
                let size = VectorSize::from_ty(ty);
                let (alu_op, is_right_shift) = match op {
                    Opcode::Ishl => (VecALUOp::Sshl, false),
                    Opcode::Ushr => (VecALUOp::Ushl, true),
                    Opcode::Sshr => (VecALUOp::Sshl, true),
                    _ => unreachable!(),
                };

                let rm = if is_right_shift {
                    // Right shifts are implemented with a negative left shift.
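                    // (SSHL/USHL treat a negative per-lane shift count as a
                    // shift to the right, so e.g. an ushr by 3 is performed as
                    // an ushl by -3.)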
                    let tmp = ctx.alloc_tmp(I32).only_reg().unwrap();
                    let rm = put_input_in_reg(ctx, inputs[1], NarrowValueMode::None);
                    let rn = zero_reg();
                    ctx.emit(Inst::AluRRR {
                        alu_op: ALUOp::Sub32,
                        rd: tmp,
                        rn,
                        rm,
                    });
                    tmp.to_reg()
                } else {
                    put_input_in_reg(ctx, inputs[1], NarrowValueMode::None)
                };

                ctx.emit(Inst::VecDup { rd, rn: rm, size });

                ctx.emit(Inst::VecRRR {
                    alu_op,
                    rd,
                    rn,
                    rm: rd.to_reg(),
                    size,
                });
            }
        }

        Opcode::Rotr | Opcode::Rotl => {
            // aarch64 doesn't have a left-rotate instruction, but a left rotation of K places is
            // effectively a right rotation of N - K places, if N is the integer's bit size. We
            // implement left rotations with this trick.
            //
            // For a 32-bit or 64-bit rotate-right, we can use the ROR instruction directly.
            //
            // For a < 32-bit rotate-right, we synthesize this as:
            //
            //   rotr rd, rn, rm
            //
            //      =>
            //
            //   zero-extend rn, <32-or-64>
            //   and tmp_masked_rm, rm, <bitwidth - 1>
            //   sub tmp1, tmp_masked_rm, <bitwidth>
            //   sub tmp1, zero, tmp1 ; neg
            //   lsr tmp2, rn, tmp_masked_rm
            //   lsl rd, rn, tmp1
            //   orr rd, rd, tmp2
            //
            // For a constant amount, we can instead do:
            //
            //   zero-extend rn, <32-or-64>
            //   lsr tmp2, rn, #<shiftimm>
            //   lsl rd, rn, <bitwidth - shiftimm>
            //   orr rd, rd, tmp2
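            //
            // As a concrete instance of the constant case: an 8-bit rotl by 3
            // is lowered as an 8-bit rotr by 8 - 3 = 5, i.e. lsr tmp2, rn, #5;
            // lsl rd, rn, #3; orr rd, rd, tmp2, with rn zero-extended so the
            // two shifted copies OR together cleanly in the low 8 bits.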

            let is_rotl = op == Opcode::Rotl;

            let ty = ty.unwrap();
            let ty_bits_size = ty_bits(ty) as u8;

            let rd = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
            let rn = put_input_in_reg(
                ctx,
                inputs[0],
                if ty_bits_size <= 32 {
                    NarrowValueMode::ZeroExtend32
                } else {
                    NarrowValueMode::ZeroExtend64
                },
            );
            let rm = put_input_in_reg_immshift(ctx, inputs[1], ty_bits(ty));

            if ty_bits_size == 32 || ty_bits_size == 64 {
                let alu_op = choose_32_64(ty, ALUOp::RotR32, ALUOp::RotR64);
                match rm {
                    ResultRegImmShift::ImmShift(mut immshift) => {
                        if is_rotl {
                            immshift.imm = ty_bits_size.wrapping_sub(immshift.value());
                        }
                        immshift.imm &= ty_bits_size - 1;
                        ctx.emit(Inst::AluRRImmShift {
                            alu_op,
                            rd,
                            rn,
                            immshift,
                        });
                    }

                    ResultRegImmShift::Reg(rm) => {
                        let rm = if is_rotl {
                            // Really ty_bits_size - rn, but the upper bits of the result are
                            // ignored (because of the implicit masking done by the instruction),
                            // so this is equivalent to negating the input.
                            let alu_op = choose_32_64(ty, ALUOp::Sub32, ALUOp::Sub64);
                            let tmp = ctx.alloc_tmp(ty).only_reg().unwrap();
                            ctx.emit(Inst::AluRRR {
                                alu_op,
                                rd: tmp,
                                rn: zero_reg(),
                                rm,
                            });
                            tmp.to_reg()
                        } else {
                            rm
                        };
                        ctx.emit(Inst::AluRRR { alu_op, rd, rn, rm });
                    }
                }
            } else {
                debug_assert!(ty_bits_size < 32);

                match rm {
                    ResultRegImmShift::Reg(reg) => {
                        let reg = if is_rotl {
                            // Really ty_bits_size - rn, but the upper bits of the result are
                            // ignored (because of the implicit masking done by the instruction),
                            // so this is equivalent to negating the input.
                            let tmp = ctx.alloc_tmp(I32).only_reg().unwrap();
                            ctx.emit(Inst::AluRRR {
                                alu_op: ALUOp::Sub32,
                                rd: tmp,
                                rn: zero_reg(),
                                rm: reg,
                            });
                            tmp.to_reg()
                        } else {
                            reg
                        };

                        // Explicitly mask the rotation count.
                        let tmp_masked_rm = ctx.alloc_tmp(I32).only_reg().unwrap();
                        ctx.emit(Inst::AluRRImmLogic {
                            alu_op: ALUOp::And32,
                            rd: tmp_masked_rm,
                            rn: reg,
                            imml: ImmLogic::maybe_from_u64((ty_bits_size - 1) as u64, I32).unwrap(),
                        });
                        let tmp_masked_rm = tmp_masked_rm.to_reg();

                        let tmp1 = ctx.alloc_tmp(I32).only_reg().unwrap();
                        let tmp2 = ctx.alloc_tmp(I32).only_reg().unwrap();
                        ctx.emit(Inst::AluRRImm12 {
                            alu_op: ALUOp::Sub32,
                            rd: tmp1,
                            rn: tmp_masked_rm,
                            imm12: Imm12::maybe_from_u64(ty_bits_size as u64).unwrap(),
                        });
                        ctx.emit(Inst::AluRRR {
                            alu_op: ALUOp::Sub32,
                            rd: tmp1,
                            rn: zero_reg(),
                            rm: tmp1.to_reg(),
                        });
                        ctx.emit(Inst::AluRRR {
                            alu_op: ALUOp::Lsr32,
                            rd: tmp2,
                            rn,
                            rm: tmp_masked_rm,
                        });
                        ctx.emit(Inst::AluRRR {
                            alu_op: ALUOp::Lsl32,
                            rd,
                            rn,
                            rm: tmp1.to_reg(),
                        });
                        ctx.emit(Inst::AluRRR {
                            alu_op: ALUOp::Orr32,
                            rd,
                            rn: rd.to_reg(),
                            rm: tmp2.to_reg(),
                        });
                    }

                    ResultRegImmShift::ImmShift(mut immshift) => {
                        if is_rotl {
                            immshift.imm = ty_bits_size.wrapping_sub(immshift.value());
                        }
                        immshift.imm &= ty_bits_size - 1;

                        let tmp1 = ctx.alloc_tmp(I32).only_reg().unwrap();
                        ctx.emit(Inst::AluRRImmShift {
                            alu_op: ALUOp::Lsr32,
                            rd: tmp1,
                            rn,
                            immshift: immshift.clone(),
                        });

                        let amount = immshift.value() & (ty_bits_size - 1);
                        let opp_shift =
                            ImmShift::maybe_from_u64(ty_bits_size as u64 - amount as u64).unwrap();
                        ctx.emit(Inst::AluRRImmShift {
                            alu_op: ALUOp::Lsl32,
                            rd,
                            rn,
                            immshift: opp_shift,
                        });

                        ctx.emit(Inst::AluRRR {
                            alu_op: ALUOp::Orr32,
                            rd,
                            rn: rd.to_reg(),
                            rm: tmp1.to_reg(),
                        });
                    }
                }
            }
        }

        Opcode::Bitrev | Opcode::Clz | Opcode::Cls | Opcode::Ctz => {
            let rd = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
            let needs_zext = match op {
                Opcode::Bitrev | Opcode::Ctz => false,
                Opcode::Clz | Opcode::Cls => true,
                _ => unreachable!(),
            };
            let ty = ty.unwrap();
            let narrow_mode = if needs_zext && ty_bits(ty) == 64 {
                NarrowValueMode::ZeroExtend64
            } else if needs_zext {
                NarrowValueMode::ZeroExtend32
            } else {
                NarrowValueMode::None
            };
            let rn = put_input_in_reg(ctx, inputs[0], narrow_mode);
            let op_ty = match ty {
                I8 | I16 | I32 => I32,
                I64 => I64,
                _ => panic!("Unsupported type for Bitrev/Clz/Cls"),
            };
            let bitop = match op {
                Opcode::Clz | Opcode::Cls | Opcode::Bitrev => BitOp::from((op, op_ty)),
                Opcode::Ctz => BitOp::from((Opcode::Bitrev, op_ty)),
                _ => unreachable!(),
            };
            ctx.emit(Inst::BitRR { rd, rn, op: bitop });

            // Both bitrev and ctz use a bit-reverse (rbit) instruction; ctz to reduce the problem
            // to a clz, and bitrev as the main operation.
            if op == Opcode::Bitrev || op == Opcode::Ctz {
                // Reversing an n-bit value (n < 32) with a 32-bit bitrev instruction will place
                // the reversed result in the highest n bits, so we need to shift them down into
                // place.
                let right_shift = match ty {
                    I8 => Some(24),
                    I16 => Some(16),
                    I32 => None,
                    I64 => None,
                    _ => panic!("Unsupported type for Bitrev"),
                };
                if let Some(s) = right_shift {
                    ctx.emit(Inst::AluRRImmShift {
                        alu_op: ALUOp::Lsr32,
                        rd,
                        rn: rd.to_reg(),
                        immshift: ImmShift::maybe_from_u64(s).unwrap(),
                    });
                }
            }

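            // The identity being used for ctz is ctz(x) == clz(rbit(x)): e.g.
            // (a sketch for the 32-bit case) x = 0b100 bit-reverses to
            // 0x2000_0000, whose clz is 2, exactly the trailing-zero count
            // of x.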
            if op == Opcode::Ctz {
                ctx.emit(Inst::BitRR {
                    op: BitOp::from((Opcode::Clz, op_ty)),
                    rd,
                    rn: rd.to_reg(),
                });
            }
        }

        Opcode::Popcnt => {
            let rd = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
            let rn = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None);
            let ty = ty.unwrap();
            let size = ScalarSize::from_operand_size(OperandSize::from_ty(ty));
            let tmp = ctx.alloc_tmp(I8X16).only_reg().unwrap();

            // fmov tmp, rn
            // cnt tmp.8b, tmp.8b
            // addp tmp.8b, tmp.8b, tmp.8b / addv tmp, tmp.8b / (no instruction for 8-bit inputs)
            // umov rd, tmp.b[0]
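            //
            // For example, for a 32-bit input 0x0101_0101: after the fmov, cnt
            // produces the per-byte counts |1|1|1|1| in the low four lanes,
            // addv sums them to 4, and umov moves the result back to rd.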

            ctx.emit(Inst::MovToFpu {
                rd: tmp,
                rn: rn,
                size,
            });
            ctx.emit(Inst::VecMisc {
                op: VecMisc2::Cnt,
                rd: tmp,
                rn: tmp.to_reg(),
                size: VectorSize::Size8x8,
            });

            match ScalarSize::from_ty(ty) {
                ScalarSize::Size8 => {}
                ScalarSize::Size16 => {
                    // ADDP is usually cheaper than ADDV.
                    ctx.emit(Inst::VecRRR {
                        alu_op: VecALUOp::Addp,
                        rd: tmp,
                        rn: tmp.to_reg(),
                        rm: tmp.to_reg(),
                        size: VectorSize::Size8x8,
                    });
                }
                ScalarSize::Size32 | ScalarSize::Size64 => {
                    ctx.emit(Inst::VecLanes {
                        op: VecLanesOp::Addv,
                        rd: tmp,
                        rn: tmp.to_reg(),
                        size: VectorSize::Size8x8,
                    });
                }
                sz => panic!("Unexpected scalar FP operand size: {:?}", sz),
            }

            ctx.emit(Inst::MovFromVec {
                rd,
                rn: tmp.to_reg(),
                idx: 0,
                size: VectorSize::Size8x16,
            });
        }

        Opcode::Load
        | Opcode::Uload8
        | Opcode::Sload8
        | Opcode::Uload16
        | Opcode::Sload16
        | Opcode::Uload32
        | Opcode::Sload32
        | Opcode::LoadComplex
        | Opcode::Uload8Complex
        | Opcode::Sload8Complex
        | Opcode::Uload16Complex
        | Opcode::Sload16Complex
        | Opcode::Uload32Complex
        | Opcode::Sload32Complex
        | Opcode::Sload8x8
        | Opcode::Uload8x8
        | Opcode::Sload16x4
        | Opcode::Uload16x4
        | Opcode::Sload32x2
        | Opcode::Uload32x2
        | Opcode::Uload8x8Complex
        | Opcode::Sload8x8Complex
        | Opcode::Uload16x4Complex
        | Opcode::Sload16x4Complex
        | Opcode::Uload32x2Complex
        | Opcode::Sload32x2Complex => {
            let sign_extend = match op {
                Opcode::Sload8
                | Opcode::Sload8Complex
                | Opcode::Sload16
                | Opcode::Sload16Complex
                | Opcode::Sload32
                | Opcode::Sload32Complex => true,
                _ => false,
            };
            let flags = ctx
                .memflags(insn)
                .expect("Load instruction should have memflags");

            lower_load(
                ctx,
                insn,
                &inputs[..],
                outputs[0],
                |ctx, rd, elem_ty, mem| {
                    let is_float = ty_has_float_or_vec_representation(elem_ty);
                    ctx.emit(match (ty_bits(elem_ty), sign_extend, is_float) {
                        (1, _, _) => Inst::ULoad8 { rd, mem, flags },
                        (8, false, _) => Inst::ULoad8 { rd, mem, flags },
                        (8, true, _) => Inst::SLoad8 { rd, mem, flags },
                        (16, false, _) => Inst::ULoad16 { rd, mem, flags },
                        (16, true, _) => Inst::SLoad16 { rd, mem, flags },
                        (32, false, false) => Inst::ULoad32 { rd, mem, flags },
                        (32, true, false) => Inst::SLoad32 { rd, mem, flags },
                        (32, _, true) => Inst::FpuLoad32 { rd, mem, flags },
                        (64, _, false) => Inst::ULoad64 { rd, mem, flags },
                        // Note that we treat some of the vector loads as scalar
                        // floating-point loads, which is correct in a little-endian
                        // environment.
                        (64, _, true) => Inst::FpuLoad64 { rd, mem, flags },
                        (128, _, _) => Inst::FpuLoad128 { rd, mem, flags },
                        _ => panic!("Unsupported size in load"),
                    });

                    let vec_extend = match op {
                        Opcode::Sload8x8 => Some(VecExtendOp::Sxtl8),
                        Opcode::Sload8x8Complex => Some(VecExtendOp::Sxtl8),
                        Opcode::Uload8x8 => Some(VecExtendOp::Uxtl8),
                        Opcode::Uload8x8Complex => Some(VecExtendOp::Uxtl8),
                        Opcode::Sload16x4 => Some(VecExtendOp::Sxtl16),
                        Opcode::Sload16x4Complex => Some(VecExtendOp::Sxtl16),
                        Opcode::Uload16x4 => Some(VecExtendOp::Uxtl16),
                        Opcode::Uload16x4Complex => Some(VecExtendOp::Uxtl16),
                        Opcode::Sload32x2 => Some(VecExtendOp::Sxtl32),
                        Opcode::Sload32x2Complex => Some(VecExtendOp::Sxtl32),
                        Opcode::Uload32x2 => Some(VecExtendOp::Uxtl32),
                        Opcode::Uload32x2Complex => Some(VecExtendOp::Uxtl32),
                        _ => None,
                    };
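                    // The 8x8/16x4/32x2 forms load 64 bits and then widen each
                    // lane, e.g. sload8x8 becomes an ldr of a D register
                    // followed by sxtl v.8h, v.8b.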

                    if let Some(t) = vec_extend {
                        ctx.emit(Inst::VecExtend {
                            t,
                            rd,
                            rn: rd.to_reg(),
                            high_half: false,
                        });
                    }
                },
            );
        }

        Opcode::Store
        | Opcode::Istore8
        | Opcode::Istore16
        | Opcode::Istore32
        | Opcode::StoreComplex
        | Opcode::Istore8Complex
        | Opcode::Istore16Complex
        | Opcode::Istore32Complex => {
            let off = ctx.data(insn).load_store_offset().unwrap();
            let elem_ty = match op {
                Opcode::Istore8 | Opcode::Istore8Complex => I8,
                Opcode::Istore16 | Opcode::Istore16Complex => I16,
                Opcode::Istore32 | Opcode::Istore32Complex => I32,
                Opcode::Store | Opcode::StoreComplex => ctx.input_ty(insn, 0),
                _ => unreachable!(),
            };
            let is_float = ty_has_float_or_vec_representation(elem_ty);
            let flags = ctx
                .memflags(insn)
                .expect("Store instruction should have memflags");

            let mem = lower_address(ctx, elem_ty, &inputs[1..], off);
            let rd = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None);

            ctx.emit(match (ty_bits(elem_ty), is_float) {
                (1, _) | (8, _) => Inst::Store8 { rd, mem, flags },
                (16, _) => Inst::Store16 { rd, mem, flags },
                (32, false) => Inst::Store32 { rd, mem, flags },
                (32, true) => Inst::FpuStore32 { rd, mem, flags },
                (64, false) => Inst::Store64 { rd, mem, flags },
                (64, true) => Inst::FpuStore64 { rd, mem, flags },
                (128, _) => Inst::FpuStore128 { rd, mem, flags },
                _ => panic!("Unsupported size in store"),
            });
        }

        Opcode::StackAddr => {
            let (stack_slot, offset) = match *ctx.data(insn) {
                InstructionData::StackLoad {
                    opcode: Opcode::StackAddr,
                    stack_slot,
                    offset,
                } => (stack_slot, offset),
                _ => unreachable!(),
            };
            let rd = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
            let offset: i32 = offset.into();
            let inst = ctx
                .abi()
                .stackslot_addr(stack_slot, u32::try_from(offset).unwrap(), rd);
            ctx.emit(inst);
        }

        Opcode::AtomicRmw => {
            let r_dst = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
            let mut r_addr = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None);
            let mut r_arg2 = put_input_in_reg(ctx, inputs[1], NarrowValueMode::None);
            let ty_access = ty.unwrap();
            assert!(is_valid_atomic_transaction_ty(ty_access));
            // Make sure that both args are in virtual regs, since in effect
            // we have to do a parallel copy to get them safely to the AtomicRMW input
            // regs, and that's not guaranteed safe if either is in a real reg.
            r_addr = ctx.ensure_in_vreg(r_addr, I64);
            r_arg2 = ctx.ensure_in_vreg(r_arg2, I64);
            // Move the args to the preordained AtomicRMW input regs
            ctx.emit(Inst::gen_move(Writable::from_reg(xreg(25)), r_addr, I64));
            ctx.emit(Inst::gen_move(Writable::from_reg(xreg(26)), r_arg2, I64));
            // Now the AtomicRMW insn itself
            let op = inst_common::AtomicRmwOp::from(ctx.data(insn).atomic_rmw_op().unwrap());
            ctx.emit(Inst::AtomicRMW { ty: ty_access, op });
            // And finally, copy the preordained AtomicRMW output reg to its destination.
            ctx.emit(Inst::gen_move(r_dst, xreg(27), I64));
            // Also, x24 and x28 are trashed. `fn aarch64_get_regs` must mention that.
        }

        Opcode::AtomicCas => {
            let r_dst = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
            let mut r_addr = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None);
            let mut r_expected = put_input_in_reg(ctx, inputs[1], NarrowValueMode::None);
            let mut r_replacement = put_input_in_reg(ctx, inputs[2], NarrowValueMode::None);
            let ty_access = ty.unwrap();
            assert!(is_valid_atomic_transaction_ty(ty_access));

            if isa_flags.use_lse() {
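                // A sketch of the LSE semantics assumed here: the
                // compare-and-swap instruction compares the value at [rn]
                // with rs and, on a match, stores rt; rs always receives the
                // value read from memory, which is why rd is seeded with the
                // expected value first.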
                ctx.emit(Inst::gen_move(r_dst, r_expected, ty_access));
                ctx.emit(Inst::AtomicCAS {
                    rs: r_dst,
                    rt: r_replacement,
                    rn: r_addr,
                    ty: ty_access,
                });
            } else {
                // This is very similar to, but not identical to, the AtomicRmw case. Note
                // that the AtomicCASLoop sequence does its own masking, so we don't need to worry
                // about zero-extending narrow (I8/I16/I32) values here.
                // Make sure that all three args are in virtual regs. See corresponding comment
                // for `Opcode::AtomicRmw` above.
                r_addr = ctx.ensure_in_vreg(r_addr, I64);
                r_expected = ctx.ensure_in_vreg(r_expected, I64);
                r_replacement = ctx.ensure_in_vreg(r_replacement, I64);
                // Move the args to the preordained AtomicCASLoop input regs
                ctx.emit(Inst::gen_move(Writable::from_reg(xreg(25)), r_addr, I64));
                ctx.emit(Inst::gen_move(
                    Writable::from_reg(xreg(26)),
                    r_expected,
                    I64,
                ));
                ctx.emit(Inst::gen_move(
                    Writable::from_reg(xreg(28)),
                    r_replacement,
                    I64,
                ));
                // Now the AtomicCASLoop itself, implemented in the normal way, with an LL-SC loop
                ctx.emit(Inst::AtomicCASLoop { ty: ty_access });
                // And finally, copy the preordained AtomicCASLoop output reg to its destination.
                ctx.emit(Inst::gen_move(r_dst, xreg(27), I64));
                // Also, x24 and x28 are trashed. `fn aarch64_get_regs` must mention that.
            }
        }

        Opcode::AtomicLoad => {
            let r_data = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
            let r_addr = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None);
            let ty_access = ty.unwrap();
            assert!(is_valid_atomic_transaction_ty(ty_access));
            ctx.emit(Inst::AtomicLoad {
                ty: ty_access,
                r_data,
                r_addr,
            });
        }

        Opcode::AtomicStore => {
            let r_data = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None);
            let r_addr = put_input_in_reg(ctx, inputs[1], NarrowValueMode::None);
            let ty_access = ctx.input_ty(insn, 0);
            assert!(is_valid_atomic_transaction_ty(ty_access));
            ctx.emit(Inst::AtomicStore {
                ty: ty_access,
                r_data,
                r_addr,
            });
        }

        Opcode::Fence => {
            ctx.emit(Inst::Fence {});
        }

        Opcode::StackLoad | Opcode::StackStore => {
            panic!("Direct stack memory access not supported; should not be used by Wasm");
        }

        Opcode::HeapAddr => {
            panic!("heap_addr should have been removed by legalization!");
        }

        Opcode::TableAddr => {
            panic!("table_addr should have been removed by legalization!");
        }

        Opcode::ConstAddr => unimplemented!(),

        Opcode::Nop => {
            // Nothing.
        }

        Opcode::Select => {
            let flag_input = inputs[0];
            let cond = if let Some(icmp_insn) =
                maybe_input_insn_via_conv(ctx, flag_input, Opcode::Icmp, Opcode::Bint)
            {
                let condcode = ctx.data(icmp_insn).cond_code().unwrap();
                let cond = lower_condcode(condcode);
                let is_signed = condcode_is_signed(condcode);
                lower_icmp_or_ifcmp_to_flags(ctx, icmp_insn, is_signed);
                cond
            } else if let Some(fcmp_insn) =
                maybe_input_insn_via_conv(ctx, flag_input, Opcode::Fcmp, Opcode::Bint)
            {
                let condcode = ctx.data(fcmp_insn).fp_cond_code().unwrap();
                let cond = lower_fp_condcode(condcode);
                lower_fcmp_or_ffcmp_to_flags(ctx, fcmp_insn);
                cond
            } else {
                let (cmp_op, narrow_mode) = if ty_bits(ctx.input_ty(insn, 0)) > 32 {
                    (ALUOp::SubS64, NarrowValueMode::ZeroExtend64)
                } else {
                    (ALUOp::SubS32, NarrowValueMode::ZeroExtend32)
                };

                let rcond = put_input_in_reg(ctx, inputs[0], narrow_mode);
                // cmp rcond, #0
                ctx.emit(Inst::AluRRR {
                    alu_op: cmp_op,
                    rd: writable_zero_reg(),
                    rn: rcond,
                    rm: zero_reg(),
                });
                Cond::Ne
            };

            // csel.cond rd, rn, rm
            let rd = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
            let rn = put_input_in_reg(ctx, inputs[1], NarrowValueMode::None);
            let rm = put_input_in_reg(ctx, inputs[2], NarrowValueMode::None);
            let ty = ctx.output_ty(insn, 0);
            let bits = ty_bits(ty);
            let is_float = ty_has_float_or_vec_representation(ty);
            if is_float && bits == 32 {
                ctx.emit(Inst::FpuCSel32 { cond, rd, rn, rm });
            } else if is_float && bits == 64 {
                ctx.emit(Inst::FpuCSel64 { cond, rd, rn, rm });
            } else if is_float && bits == 128 {
                ctx.emit(Inst::VecCSel { cond, rd, rn, rm });
            } else {
                ctx.emit(Inst::CSel { cond, rd, rn, rm });
            }
        }

        Opcode::Selectif | Opcode::SelectifSpectreGuard => {
            let condcode = ctx.data(insn).cond_code().unwrap();
            let cond = lower_condcode(condcode);
            let is_signed = condcode_is_signed(condcode);
            // Verification ensures that the input is always a
            // single-def ifcmp.
            let ifcmp_insn = maybe_input_insn(ctx, inputs[0], Opcode::Ifcmp).unwrap();
            lower_icmp_or_ifcmp_to_flags(ctx, ifcmp_insn, is_signed);

            // csel.COND rd, rn, rm
            let rd = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
            let rn = put_input_in_reg(ctx, inputs[1], NarrowValueMode::None);
            let rm = put_input_in_reg(ctx, inputs[2], NarrowValueMode::None);
            let ty = ctx.output_ty(insn, 0);
            let bits = ty_bits(ty);
            let is_float = ty_has_float_or_vec_representation(ty);
            if is_float && bits == 32 {
                ctx.emit(Inst::FpuCSel32 { cond, rd, rn, rm });
            } else if is_float && bits == 64 {
                ctx.emit(Inst::FpuCSel64 { cond, rd, rn, rm });
            } else {
                ctx.emit(Inst::CSel { cond, rd, rn, rm });
            }
        }

        Opcode::Bitselect | Opcode::Vselect => {
            let ty = ty.unwrap();
            if !ty.is_vector() {
                debug_assert_ne!(Opcode::Vselect, op);
                let tmp = ctx.alloc_tmp(I64).only_reg().unwrap();
                let rd = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
                let rcond = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None);
                let rn = put_input_in_reg(ctx, inputs[1], NarrowValueMode::None);
                let rm = put_input_in_reg(ctx, inputs[2], NarrowValueMode::None);
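                // This computes rd = (rn & rcond) | (rm & !rcond): bits set in
                // rcond select from rn, and clear bits select from rm.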
                // AND rTmp, rn, rcond
                ctx.emit(Inst::AluRRR {
                    alu_op: ALUOp::And64,
                    rd: tmp,
                    rn,
                    rm: rcond,
                });
                // BIC rd, rm, rcond
                ctx.emit(Inst::AluRRR {
                    alu_op: ALUOp::AndNot64,
                    rd,
                    rn: rm,
                    rm: rcond,
                });
                // ORR rd, rd, rTmp
                ctx.emit(Inst::AluRRR {
                    alu_op: ALUOp::Orr64,
                    rd,
                    rn: rd.to_reg(),
                    rm: tmp.to_reg(),
                });
            } else {
                let rcond = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None);
                let rn = put_input_in_reg(ctx, inputs[1], NarrowValueMode::None);
                let rm = put_input_in_reg(ctx, inputs[2], NarrowValueMode::None);
                let rd = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
                ctx.emit(Inst::gen_move(rd, rcond, ty));

                ctx.emit(Inst::VecRRR {
                    alu_op: VecALUOp::Bsl,
                    rd,
                    rn,
                    rm,
                    size: VectorSize::from_ty(ty),
                });
            }
        }

        Opcode::Trueif => {
            let condcode = ctx.data(insn).cond_code().unwrap();
            let cond = lower_condcode(condcode);
            let is_signed = condcode_is_signed(condcode);
            // Verification ensures that the input is always a
            // single-def ifcmp.
            let ifcmp_insn = maybe_input_insn(ctx, inputs[0], Opcode::Ifcmp).unwrap();
            lower_icmp_or_ifcmp_to_flags(ctx, ifcmp_insn, is_signed);
            let rd = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
            materialize_bool_result(ctx, insn, rd, cond);
        }

        Opcode::Trueff => {
            let condcode = ctx.data(insn).fp_cond_code().unwrap();
            let cond = lower_fp_condcode(condcode);
            let ffcmp_insn = maybe_input_insn(ctx, inputs[0], Opcode::Ffcmp).unwrap();
            lower_fcmp_or_ffcmp_to_flags(ctx, ffcmp_insn);
            let rd = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
            materialize_bool_result(ctx, insn, rd, cond);
        }

        Opcode::IsNull | Opcode::IsInvalid => {
            // Null references are represented by the constant value 0; invalid references are
            // represented by the constant value -1. See `define_reftypes()` in
            // `meta/src/isa/x86/encodings.rs` to confirm.
            let rd = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
            let rn = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None);
            let ty = ctx.input_ty(insn, 0);
            let (alu_op, const_value) = match op {
                Opcode::IsNull => {
                    // cmp rn, #0
                    (choose_32_64(ty, ALUOp::SubS32, ALUOp::SubS64), 0)
                }
                Opcode::IsInvalid => {
                    // cmn rn, #1
                    (choose_32_64(ty, ALUOp::AddS32, ALUOp::AddS64), 1)
                }
                _ => unreachable!(),
            };
            let const_value = ResultRSEImm12::Imm12(Imm12::maybe_from_u64(const_value).unwrap());
            ctx.emit(alu_inst_imm12(alu_op, writable_zero_reg(), rn, const_value));
            materialize_bool_result(ctx, insn, rd, Cond::Eq);
        }

        Opcode::Copy => {
            let rd = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
            let rn = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None);
            let ty = ctx.input_ty(insn, 0);
            ctx.emit(Inst::gen_move(rd, rn, ty));
        }

        Opcode::Breduce | Opcode::Ireduce => {
            // Smaller integers/booleans are stored with high-order bits
            // undefined, so we can simply do a copy.
            let rn = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None);
            let rd = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
            let ty = ctx.input_ty(insn, 0);
            ctx.emit(Inst::gen_move(rd, rn, ty));
        }

        Opcode::Bextend | Opcode::Bmask => {
            // Bextend and Bmask both simply sign-extend. This works for:
            // - Bextend, because booleans are stored as 0 / -1, so we
            //   sign-extend the -1 to a -1 in the wider width.
            // - Bmask, because the resulting integer mask value must be
            //   all-ones (-1) if the argument is true.

            let from_ty = ctx.input_ty(insn, 0);
            let to_ty = ctx.output_ty(insn, 0);
            let from_bits = ty_bits(from_ty);
            let to_bits = ty_bits(to_ty);

            assert!(
                from_bits <= 64 && to_bits <= 64,
                "Vector Bextend not supported yet"
            );
            assert!(from_bits <= to_bits);

            if from_bits == to_bits {
                // Nothing.
            } else {
                let rn = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None);
                let rd = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
                let to_bits = if to_bits == 64 {
                    64
                } else {
                    assert!(to_bits <= 32);
                    32
                };
                let from_bits = from_bits as u8;
                ctx.emit(Inst::Extend {
                    rd,
                    rn,
                    signed: true,
                    from_bits,
                    to_bits,
                });
            }
        }

        Opcode::Bint => {
            // Booleans are stored as all-zeroes (0) or all-ones (-1). We AND
            // with 1 to keep only the LSB, giving a 0 / 1-valued integer result.
            let rn = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None);
            let rd = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
            let output_bits = ty_bits(ctx.output_ty(insn, 0));

            let (imm_ty, alu_op) = if output_bits > 32 {
                (I64, ALUOp::And64)
            } else {
                (I32, ALUOp::And32)
            };
            ctx.emit(Inst::AluRRImmLogic {
                alu_op,
                rd,
                rn,
                imml: ImmLogic::maybe_from_u64(1, imm_ty).unwrap(),
            });
        }

        Opcode::Bitcast => {
            let rd = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
            let ity = ctx.input_ty(insn, 0);
            let oty = ctx.output_ty(insn, 0);
            let ity_bits = ty_bits(ity);
            let ity_vec_reg = ty_has_float_or_vec_representation(ity);
            let oty_bits = ty_bits(oty);
            let oty_vec_reg = ty_has_float_or_vec_representation(oty);

            debug_assert_eq!(ity_bits, oty_bits);

            match (ity_vec_reg, oty_vec_reg) {
                (true, true) => {
                    let narrow_mode = if ity_bits <= 32 {
                        NarrowValueMode::ZeroExtend32
                    } else {
                        NarrowValueMode::ZeroExtend64
                    };
                    let rm = put_input_in_reg(ctx, inputs[0], narrow_mode);
                    ctx.emit(Inst::gen_move(rd, rm, oty));
                }
                (false, false) => {
                    let rm = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None);
                    ctx.emit(Inst::gen_move(rd, rm, oty));
                }
                (false, true) => {
                    let rn = put_input_in_reg(ctx, inputs[0], NarrowValueMode::ZeroExtend64);
                    ctx.emit(Inst::MovToFpu {
                        rd,
                        rn,
                        size: ScalarSize::Size64,
                    });
                }
                (true, false) => {
                    let rn = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None);
                    let size = VectorSize::from_lane_size(ScalarSize::from_bits(oty_bits), true);

                    ctx.emit(Inst::MovFromVec {
                        rd,
                        rn,
                        idx: 0,
                        size,
                    });
                }
            }
        }

        Opcode::FallthroughReturn | Opcode::Return => {
            for (i, input) in inputs.iter().enumerate() {
                // N.B.: according to the AArch64 ABI, the top bits of a register
                // (above the bits for the value's type) are undefined, so we
                // need not extend the return values.
                let reg = put_input_in_reg(ctx, *input, NarrowValueMode::None);
                let retval_reg = ctx.retval(i).only_reg().unwrap();
                let ty = ctx.input_ty(insn, i);
                ctx.emit(Inst::gen_move(retval_reg, reg, ty));
            }
            // N.B.: the Ret itself is generated by the ABI.
        }

        Opcode::Ifcmp | Opcode::Ffcmp => {
            // An Ifcmp/Ffcmp must always be seen as a use of a brif/brff or trueif/trueff
            // instruction. This will always be the case as long as the IR uses an Ifcmp/Ffcmp from
            // the same block, or a dominating block. In other words, it cannot pass through a BB
            // param (phi). The flags pass of the verifier will ensure this.
            panic!("Should never reach ifcmp as isel root!");
        }

        Opcode::Icmp => {
            let condcode = ctx.data(insn).cond_code().unwrap();
            let cond = lower_condcode(condcode);
            let is_signed = condcode_is_signed(condcode);
            let rd = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
            let ty = ctx.input_ty(insn, 0);
            let bits = ty_bits(ty);
            let narrow_mode = match (bits <= 32, is_signed) {
                (true, true) => NarrowValueMode::SignExtend32,
                (true, false) => NarrowValueMode::ZeroExtend32,
                (false, true) => NarrowValueMode::SignExtend64,
                (false, false) => NarrowValueMode::ZeroExtend64,
            };
            let rn = put_input_in_reg(ctx, inputs[0], narrow_mode);

            if !ty.is_vector() {
                let alu_op = choose_32_64(ty, ALUOp::SubS32, ALUOp::SubS64);
                let rm = put_input_in_rse_imm12(ctx, inputs[1], narrow_mode);
                ctx.emit(alu_inst_imm12(alu_op, writable_zero_reg(), rn, rm));
                materialize_bool_result(ctx, insn, rd, cond);
            } else {
                let rm = put_input_in_reg(ctx, inputs[1], narrow_mode);
                lower_vector_compare(ctx, rd, rn, rm, ty, cond)?;
            }
        }

        Opcode::Fcmp => {
            let condcode = ctx.data(insn).fp_cond_code().unwrap();
            let cond = lower_fp_condcode(condcode);
            let ty = ctx.input_ty(insn, 0);
            let rn = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None);
            let rm = put_input_in_reg(ctx, inputs[1], NarrowValueMode::None);
            let rd = get_output_reg(ctx, outputs[0]).only_reg().unwrap();

            if !ty.is_vector() {
                match ty_bits(ty) {
                    32 => {
                        ctx.emit(Inst::FpuCmp32 { rn, rm });
                    }
                    64 => {
                        ctx.emit(Inst::FpuCmp64 { rn, rm });
                    }
                    _ => panic!("Bad float size"),
                }
                materialize_bool_result(ctx, insn, rd, cond);
            } else {
                lower_vector_compare(ctx, rd, rn, rm, ty, cond)?;
            }
        }

        Opcode::JumpTableEntry | Opcode::JumpTableBase => {
            panic!("Should not appear: we handle BrTable directly");
        }

        Opcode::Debugtrap => {
            ctx.emit(Inst::Brk);
        }

        Opcode::Trap | Opcode::ResumableTrap => {
            let trap_code = ctx.data(insn).trap_code().unwrap();
            ctx.emit_safepoint(Inst::Udf { trap_code });
        }

        Opcode::Trapif | Opcode::Trapff => {
            let trap_code = ctx.data(insn).trap_code().unwrap();

            let cond = if maybe_input_insn(ctx, inputs[0], Opcode::IaddIfcout).is_some() {
                let condcode = ctx.data(insn).cond_code().unwrap();
                let cond = lower_condcode(condcode);
                // The flags must not have been clobbered by any other
                // instruction between the iadd_ifcout and this instruction, as
                // verified by the CLIF validator; so we can simply use the
                // flags here.
                cond
            } else if op == Opcode::Trapif {
                let condcode = ctx.data(insn).cond_code().unwrap();
                let cond = lower_condcode(condcode);
                let is_signed = condcode_is_signed(condcode);

                // Verification ensures that the input is always a single-def ifcmp.
                let ifcmp_insn = maybe_input_insn(ctx, inputs[0], Opcode::Ifcmp).unwrap();
                lower_icmp_or_ifcmp_to_flags(ctx, ifcmp_insn, is_signed);
                cond
            } else {
                let condcode = ctx.data(insn).fp_cond_code().unwrap();
                let cond = lower_fp_condcode(condcode);

                // Verification ensures that the input is always a
                // single-def ffcmp.
                let ffcmp_insn = maybe_input_insn(ctx, inputs[0], Opcode::Ffcmp).unwrap();
                lower_fcmp_or_ffcmp_to_flags(ctx, ffcmp_insn);
                cond
            };

            ctx.emit_safepoint(Inst::TrapIf {
                trap_code,
                kind: CondBrKind::Cond(cond),
            });
        }

        Opcode::Safepoint => {
            panic!("safepoint instructions not used by new backend's safepoints!");
        }

        Opcode::Trapz | Opcode::Trapnz | Opcode::ResumableTrapnz => {
            panic!("trapz / trapnz / resumable_trapnz should have been removed by legalization!");
        }

        Opcode::FuncAddr => {
            let rd = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
            let (extname, _) = ctx.call_target(insn).unwrap();
            let extname = extname.clone();
            ctx.emit(Inst::LoadExtName {
                rd,
                name: Box::new(extname),
                offset: 0,
            });
        }

        Opcode::GlobalValue => {
            panic!("global_value should have been removed by legalization!");
        }

        Opcode::SymbolValue => {
            let rd = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
            let (extname, _, offset) = ctx.symbol_value(insn).unwrap();
            let extname = extname.clone();
            ctx.emit(Inst::LoadExtName {
                rd,
                name: Box::new(extname),
                offset,
            });
        }

        Opcode::Call | Opcode::CallIndirect => {
            let caller_conv = ctx.abi().call_conv();
            let (mut abi, inputs) = match op {
                Opcode::Call => {
                    let (extname, dist) = ctx.call_target(insn).unwrap();
                    let extname = extname.clone();
                    let sig = ctx.call_sig(insn).unwrap();
                    assert!(inputs.len() == sig.params.len());
                    assert!(outputs.len() == sig.returns.len());
                    (
                        AArch64ABICaller::from_func(sig, &extname, dist, caller_conv, flags)?,
                        &inputs[..],
                    )
                }
                Opcode::CallIndirect => {
                    let ptr = put_input_in_reg(ctx, inputs[0], NarrowValueMode::ZeroExtend64);
                    let sig = ctx.call_sig(insn).unwrap();
                    assert!(inputs.len() - 1 == sig.params.len());
                    assert!(outputs.len() == sig.returns.len());
                    (
                        AArch64ABICaller::from_ptr(sig, ptr, op, caller_conv, flags)?,
                        &inputs[1..],
                    )
                }
                _ => unreachable!(),
            };

            abi.emit_stack_pre_adjust(ctx);
            assert!(inputs.len() == abi.num_args());
            for i in abi.get_copy_to_arg_order() {
                let input = inputs[i];
                let arg_reg = put_input_in_reg(ctx, input, NarrowValueMode::None);
                abi.emit_copy_regs_to_arg(ctx, i, ValueRegs::one(arg_reg));
            }
            abi.emit_call(ctx);
            for (i, output) in outputs.iter().enumerate() {
                let retval_reg = get_output_reg(ctx, *output).only_reg().unwrap();
                abi.emit_copy_retval_to_regs(ctx, i, ValueRegs::one(retval_reg));
            }
            abi.emit_stack_post_adjust(ctx);
        }

        Opcode::GetPinnedReg => {
            let rd = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
            ctx.emit(Inst::gen_move(rd, xreg(PINNED_REG), I64));
        }

        Opcode::SetPinnedReg => {
            let rm = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None);
            ctx.emit(Inst::gen_move(writable_xreg(PINNED_REG), rm, I64));
        }

        Opcode::Spill
        | Opcode::Fill
        | Opcode::FillNop
        | Opcode::Regmove
        | Opcode::CopySpecial
        | Opcode::CopyToSsa
        | Opcode::CopyNop
        | Opcode::AdjustSpDown
        | Opcode::AdjustSpUpImm
        | Opcode::AdjustSpDownImm
        | Opcode::IfcmpSp
        | Opcode::Regspill
        | Opcode::Regfill => {
            panic!("Unused opcode should not be encountered.");
        }

        Opcode::Jump
        | Opcode::Fallthrough
        | Opcode::Brz
        | Opcode::Brnz
        | Opcode::BrIcmp
        | Opcode::Brif
        | Opcode::Brff
        | Opcode::IndirectJumpTableBr
        | Opcode::BrTable => {
            panic!("Branch opcode reached non-branch lowering logic!");
        }

        Opcode::Vconst => {
            let value = const_param_to_u128(ctx, insn).expect("Invalid immediate bytes");
            let rd = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
            lower_constant_f128(ctx, rd, value);
        }

        Opcode::RawBitcast => {
            let rm = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None);
            let rd = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
            let ty = ctx.input_ty(insn, 0);
            ctx.emit(Inst::gen_move(rd, rm, ty));
        }

        Opcode::Extractlane => {
            if let InstructionData::BinaryImm8 { imm, .. } = ctx.data(insn) {
                let idx = *imm;
                let rd = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
                let rn = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None);
                let size = VectorSize::from_ty(ctx.input_ty(insn, 0));
                let ty = ty.unwrap();

                if ty_has_int_representation(ty) {
                    ctx.emit(Inst::MovFromVec { rd, rn, idx, size });
                // Plain moves are faster on some processors.
                } else if idx == 0 {
                    ctx.emit(Inst::gen_move(rd, rn, ty));
                } else {
                    ctx.emit(Inst::FpuMoveFromVec { rd, rn, idx, size });
                }
            } else {
                unreachable!();
            }
        }

        Opcode::Insertlane => {
            let idx = if let InstructionData::TernaryImm8 { imm, .. } = ctx.data(insn) {
                *imm
            } else {
                unreachable!();
            };
            let input_ty = ctx.input_ty(insn, 1);
            let rd = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
            let rm = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None);
            let rn = put_input_in_reg(ctx, inputs[1], NarrowValueMode::None);
            let ty = ty.unwrap();
            let size = VectorSize::from_ty(ty);

            ctx.emit(Inst::gen_move(rd, rm, ty));

            if ty_has_int_representation(input_ty) {
                ctx.emit(Inst::MovToVec { rd, rn, idx, size });
            } else {
                ctx.emit(Inst::VecMovElement {
                    rd,
                    rn,
                    dest_idx: idx,
                    src_idx: 0,
                    size,
                });
            }
        }

        Opcode::Splat => {
            let rd = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
            let size = VectorSize::from_ty(ty.unwrap());

            if let Some((_, insn)) = maybe_input_insn_multi(
                ctx,
                inputs[0],
                &[
                    Opcode::Bconst,
                    Opcode::F32const,
                    Opcode::F64const,
                    Opcode::Iconst,
                ],
            ) {
                lower_splat_const(ctx, rd, ctx.get_constant(insn).unwrap(), size);
            } else if let Some(insn) =
                maybe_input_insn_via_conv(ctx, inputs[0], Opcode::Iconst, Opcode::Ireduce)
            {
                lower_splat_const(ctx, rd, ctx.get_constant(insn).unwrap(), size);
            } else if let Some(insn) =
                maybe_input_insn_via_conv(ctx, inputs[0], Opcode::Bconst, Opcode::Breduce)
            {
                lower_splat_const(ctx, rd, ctx.get_constant(insn).unwrap(), size);
            } else if let Some((_, insn)) = maybe_input_insn_multi(
                ctx,
                inputs[0],
                &[
                    Opcode::Uload8,
                    Opcode::Sload8,
                    Opcode::Uload16,
                    Opcode::Sload16,
                    Opcode::Uload32,
                    Opcode::Sload32,
                    Opcode::Load,
                ],
            ) {
1860 ctx.sink_inst(insn);
1861 let load_inputs = insn_inputs(ctx, insn);
1862 let load_outputs = insn_outputs(ctx, insn);
1863 lower_load(
1864 ctx,
1865 insn,
1866 &load_inputs[..],
1867 load_outputs[0],
1868 |ctx, _rd, _elem_ty, mem| {
1869 let tmp = ctx.alloc_tmp(I64).only_reg().unwrap();
1870 let (addr, addr_inst) = Inst::gen_load_addr(tmp, mem);
1871 if let Some(addr_inst) = addr_inst {
1872 ctx.emit(addr_inst);
1873 }
1874 ctx.emit(Inst::VecLoadReplicate { rd, rn: addr, size });
1875 },
1876 );
1877 } else {
1878 let input_ty = ctx.input_ty(insn, 0);
1879 let rn = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None);
1880 let inst = if ty_has_int_representation(input_ty) {
1881 Inst::VecDup { rd, rn, size }
1882 } else {
1883 Inst::VecDupFromFpu { rd, rn, size }
1884 };
1885
1886 ctx.emit(inst);
1887 }
1888 }
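// Sketch of the main cases for an i32x4 splat (the exact constant
// materialization is left to lower_splat_const):
//
// dup vd.4s, wn (integer register input)
// dup vd.4s, vn.s[0] (FP register input)
// ld1r { vd.4s }, [xn] (sunk load: load-and-replicate)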
1889
1890 Opcode::ScalarToVector => {
1891 let rn = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None);
1892 let rd = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
1893 let input_ty = ctx.input_ty(insn, 0);
1894 if (input_ty == I32 && ty.unwrap() == I32X4)
1895 || (input_ty == I64 && ty.unwrap() == I64X2)
1896 {
1897 ctx.emit(Inst::MovToFpu {
1898 rd,
1899 rn,
1900 size: ScalarSize::from_ty(input_ty),
1901 });
1902 } else {
1903 return Err(CodegenError::Unsupported(format!(
1904 "ScalarToVector: unsupported types {:?} -> {:?}",
1905 input_ty, ty
1906 )));
1907 }
1908 }
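// Sketch: for i32 -> i32x4 this is a single `fmov s0, w0`-style move, which
// writes lane 0 and zeroes the remaining 96 bits of the vector register,
// matching the scalar_to_vector semantics.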
1909
1910 Opcode::VallTrue if ctx.input_ty(insn, 0) == I64X2 => {
1911 let rd = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
1912 let rm = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None);
1913 let tmp = ctx.alloc_tmp(I64X2).only_reg().unwrap();
1914
1915 // cmeq vtmp.2d, vm.2d, #0
1916 // addp dtmp, vtmp.2d
1917 // fcmp dtmp, dtmp
1918 // cset xd, eq
1919 //
1920 // Note that after the ADDP the value of the temporary register will
1921 // be either 0 when all input elements are true, i.e. non-zero, or a
1922 // NaN otherwise (either -1 or -2 when represented as an integer);
1923 // NaNs are the only floating-point numbers that compare unequal to
1924 // themselves.
1925
1926 ctx.emit(Inst::VecMisc {
1927 op: VecMisc2::Cmeq0,
1928 rd: tmp,
1929 rn: rm,
1930 size: VectorSize::Size64x2,
1931 });
1932 ctx.emit(Inst::VecRRPair {
1933 op: VecPairOp::Addp,
1934 rd: tmp,
1935 rn: tmp.to_reg(),
1936 });
1937 ctx.emit(Inst::FpuCmp64 {
1938 rn: tmp.to_reg(),
1939 rm: tmp.to_reg(),
1940 });
1941 materialize_bool_result(ctx, insn, rd, Cond::Eq);
1942 }
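// Worked example: for input lanes [1, 0] the cmeq-with-zero produces
// [0, 0xFFFF_FFFF_FFFF_FFFF]; addp sums the two lanes to
// 0xFFFF_FFFF_FFFF_FFFF, a (negative, quiet) NaN bit pattern, so the fcmp
// is unordered and `cset eq` yields 0. For [1, 2] the sum is 0, the
// comparison 0.0 == 0.0 holds, and the result is 1.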
1943
1944 Opcode::VanyTrue | Opcode::VallTrue => {
1945 let rd = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
1946 let rm = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None);
1947 let src_ty = ctx.input_ty(insn, 0);
1948 let tmp = ctx.alloc_tmp(src_ty).only_reg().unwrap();
1949
1950 // This operation is implemented by using umaxp or uminv to
1951 // create a scalar value, which is then compared against zero.
1952 //
1953 // umaxp vn.16b, vm.16b, vm.16b / uminv bn, vm.16b
1954 // mov xm, vn.d[0]
1955 // cmp xm, #0
1956 // cset xm, ne
1957
1958 let size = VectorSize::from_ty(ctx.input_ty(insn, 0));
1959
1960 if op == Opcode::VanyTrue {
1961 ctx.emit(Inst::VecRRR {
1962 alu_op: VecALUOp::Umaxp,
1963 rd: tmp,
1964 rn: rm,
1965 rm,
1966 size,
1967 });
1968 } else {
1969 ctx.emit(Inst::VecLanes {
1970 op: VecLanesOp::Uminv,
1971 rd: tmp,
1972 rn: rm,
1973 size,
1974 });
1975 };
1976
1977 ctx.emit(Inst::MovFromVec {
1978 rd,
1979 rn: tmp.to_reg(),
1980 idx: 0,
1981 size: VectorSize::Size64x2,
1982 });
1983
1984 ctx.emit(Inst::AluRRImm12 {
1985 alu_op: ALUOp::SubS64,
1986 rd: writable_zero_reg(),
1987 rn: rd.to_reg(),
1988 imm12: Imm12::zero(),
1989 });
1990
1991 materialize_bool_result(ctx, insn, rd, Cond::Ne);
1992 }
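// Worked example: for vall_true on i32x4, uminv computes the unsigned
// minimum across all lanes, so the scalar is nonzero iff every lane is
// nonzero. For vany_true, umaxp pairwise-maxes the input with itself, so
// the low 64 bits of tmp are nonzero iff any lane is nonzero.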
1993
1994 Opcode::VhighBits => {
1995 let dst_r = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
1996 let src_v = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None);
1997 let ty = ctx.input_ty(insn, 0);
1998 // The first three sequences use one integer temporary and two vector temporaries;
1999 // the I64X2 case uses only the integer temporary. The shift is done early so as to
2000 // give the register allocator the possibility of using the same reg for `tmp_v1`
2001 // and `src_v` in the case that this is the last use of `src_v`. See https://github.com/WebAssembly/simd/pull/201 for the background and
2002 // derivation of these sequences. Alternative sequences are discussed in
2003 // https://github.com/bytecodealliance/wasmtime/issues/2296, although they are not
2004 // used here.
2005 let tmp_r0 = ctx.alloc_tmp(I64).only_reg().unwrap();
2006 let tmp_v0 = ctx.alloc_tmp(I8X16).only_reg().unwrap();
2007 let tmp_v1 = ctx.alloc_tmp(I8X16).only_reg().unwrap();
2008 match ty {
2009 I8X16 => {
2010 // sshr tmp_v1.16b, src_v.16b, #7
2011 // mov tmp_r0, #0x0201
2012 // movk tmp_r0, #0x0804, lsl 16
2013 // movk tmp_r0, #0x2010, lsl 32
2014 // movk tmp_r0, #0x8040, lsl 48
2015 // dup tmp_v0.2d, tmp_r0
2016 // and tmp_v1.16b, tmp_v1.16b, tmp_v0.16b
2017 // ext tmp_v0.16b, tmp_v1.16b, tmp_v1.16b, #8
2018 // zip1 tmp_v0.16b, tmp_v1.16b, tmp_v0.16b
2019 // addv tmp_v0h, tmp_v0.8h
2020 // mov dst_r, tmp_v0.h[0]
2021 ctx.emit(Inst::VecShiftImm {
2022 op: VecShiftImmOp::Sshr,
2023 rd: tmp_v1,
2024 rn: src_v,
2025 size: VectorSize::Size8x16,
2026 imm: 7,
2027 });
2028 lower_splat_const(ctx, tmp_v0, 0x8040201008040201u64, VectorSize::Size64x2);
2029 ctx.emit(Inst::VecRRR {
2030 alu_op: VecALUOp::And,
2031 rd: tmp_v1,
2032 rn: tmp_v1.to_reg(),
2033 rm: tmp_v0.to_reg(),
2034 size: VectorSize::Size8x16,
2035 });
2036 ctx.emit(Inst::VecExtract {
2037 rd: tmp_v0,
2038 rn: tmp_v1.to_reg(),
2039 rm: tmp_v1.to_reg(),
2040 imm4: 8,
2041 });
2042 ctx.emit(Inst::VecRRR {
2043 alu_op: VecALUOp::Zip1,
2044 rd: tmp_v0,
2045 rn: tmp_v1.to_reg(),
2046 rm: tmp_v0.to_reg(),
2047 size: VectorSize::Size8x16,
2048 });
2049 ctx.emit(Inst::VecLanes {
2050 op: VecLanesOp::Addv,
2051 rd: tmp_v0,
2052 rn: tmp_v0.to_reg(),
2053 size: VectorSize::Size16x8,
2054 });
2055 ctx.emit(Inst::MovFromVec {
2056 rd: dst_r,
2057 rn: tmp_v0.to_reg(),
2058 idx: 0,
2059 size: VectorSize::Size16x8,
2060 });
2061 }
2062 I16X8 => {
2063 // sshr tmp_v1.8h, src_v.8h, #15
2064 // mov tmp_r0, #0x1
2065 // movk tmp_r0, #0x2, lsl 16
2066 // movk tmp_r0, #0x4, lsl 32
2067 // movk tmp_r0, #0x8, lsl 48
2068 // dup tmp_v0.2d, tmp_r0
2069 // lsl tmp_r0, tmp_r0, #4
2070 // mov tmp_v0.d[1], tmp_r0
2071 // and tmp_v0.16b, tmp_v1.16b, tmp_v0.16b
2072 // addv tmp_v0h, tmp_v0.8h
2073 // mov dst_r, tmp_v0.h[0]
2074 ctx.emit(Inst::VecShiftImm {
2075 op: VecShiftImmOp::Sshr,
2076 rd: tmp_v1,
2077 rn: src_v,
2078 size: VectorSize::Size16x8,
2079 imm: 15,
2080 });
2081 lower_constant_u64(ctx, tmp_r0, 0x0008000400020001u64);
2082 ctx.emit(Inst::VecDup {
2083 rd: tmp_v0,
2084 rn: tmp_r0.to_reg(),
2085 size: VectorSize::Size64x2,
2086 });
2087 ctx.emit(Inst::AluRRImmShift {
2088 alu_op: ALUOp::Lsl64,
2089 rd: tmp_r0,
2090 rn: tmp_r0.to_reg(),
2091 immshift: ImmShift { imm: 4 },
2092 });
2093 ctx.emit(Inst::MovToVec {
2094 rd: tmp_v0,
2095 rn: tmp_r0.to_reg(),
2096 idx: 1,
2097 size: VectorSize::Size64x2,
2098 });
2099 ctx.emit(Inst::VecRRR {
2100 alu_op: VecALUOp::And,
2101 rd: tmp_v0,
2102 rn: tmp_v1.to_reg(),
2103 rm: tmp_v0.to_reg(),
2104 size: VectorSize::Size8x16,
2105 });
2106 ctx.emit(Inst::VecLanes {
2107 op: VecLanesOp::Addv,
2108 rd: tmp_v0,
2109 rn: tmp_v0.to_reg(),
2110 size: VectorSize::Size16x8,
2111 });
2112 ctx.emit(Inst::MovFromVec {
2113 rd: dst_r,
2114 rn: tmp_v0.to_reg(),
2115 idx: 0,
2116 size: VectorSize::Size16x8,
2117 });
2118 }
2119 I32X4 => {
2120 // sshr tmp_v1.4s, src_v.4s, #31
2121 // mov tmp_r0, #0x1
2122 // movk tmp_r0, #0x2, lsl 32
2123 // dup tmp_v0.2d, tmp_r0
2124 // lsl tmp_r0, tmp_r0, #2
2125 // mov tmp_v0.d[1], tmp_r0
2126 // and tmp_v0.16b, tmp_v1.16b, tmp_v0.16b
2127 // addv tmp_v0s, tmp_v0.4s
2128 // mov dst_r, tmp_v0.s[0]
2129 ctx.emit(Inst::VecShiftImm {
2130 op: VecShiftImmOp::Sshr,
2131 rd: tmp_v1,
2132 rn: src_v,
2133 size: VectorSize::Size32x4,
2134 imm: 31,
2135 });
2136 lower_constant_u64(ctx, tmp_r0, 0x0000000200000001u64);
2137 ctx.emit(Inst::VecDup {
2138 rd: tmp_v0,
2139 rn: tmp_r0.to_reg(),
2140 size: VectorSize::Size64x2,
2141 });
2142 ctx.emit(Inst::AluRRImmShift {
2143 alu_op: ALUOp::Lsl64,
2144 rd: tmp_r0,
2145 rn: tmp_r0.to_reg(),
2146 immshift: ImmShift { imm: 2 },
2147 });
2148 ctx.emit(Inst::MovToVec {
2149 rd: tmp_v0,
2150 rn: tmp_r0.to_reg(),
2151 idx: 1,
2152 size: VectorSize::Size64x2,
2153 });
2154 ctx.emit(Inst::VecRRR {
2155 alu_op: VecALUOp::And,
2156 rd: tmp_v0,
2157 rn: tmp_v1.to_reg(),
2158 rm: tmp_v0.to_reg(),
2159 size: VectorSize::Size8x16,
2160 });
2161 ctx.emit(Inst::VecLanes {
2162 op: VecLanesOp::Addv,
2163 rd: tmp_v0,
2164 rn: tmp_v0.to_reg(),
2165 size: VectorSize::Size32x4,
2166 });
2167 ctx.emit(Inst::MovFromVec {
2168 rd: dst_r,
2169 rn: tmp_v0.to_reg(),
2170 idx: 0,
2171 size: VectorSize::Size32x4,
2172 });
2173 }
2174 I64X2 => {
2175 // mov dst_r, src_v.d[0]
2176 // mov tmp_r0, src_v.d[1]
2177 // lsr dst_r, dst_r, #63
2178 // lsr tmp_r0, tmp_r0, #63
2179 // add dst_r, dst_r, tmp_r0, lsl #1
2180 ctx.emit(Inst::MovFromVec {
2181 rd: dst_r,
2182 rn: src_v,
2183 idx: 0,
2184 size: VectorSize::Size64x2,
2185 });
2186 ctx.emit(Inst::MovFromVec {
2187 rd: tmp_r0,
2188 rn: src_v,
2189 idx: 1,
2190 size: VectorSize::Size64x2,
2191 });
2192 ctx.emit(Inst::AluRRImmShift {
2193 alu_op: ALUOp::Lsr64,
2194 rd: dst_r,
2195 rn: dst_r.to_reg(),
2196 immshift: ImmShift::maybe_from_u64(63).unwrap(),
2197 });
2198 ctx.emit(Inst::AluRRImmShift {
2199 alu_op: ALUOp::Lsr64,
2200 rd: tmp_r0,
2201 rn: tmp_r0.to_reg(),
2202 immshift: ImmShift::maybe_from_u64(63).unwrap(),
2203 });
2204 ctx.emit(Inst::AluRRRShift {
2205 alu_op: ALUOp::Add32,
2206 rd: dst_r,
2207 rn: dst_r.to_reg(),
2208 rm: tmp_r0.to_reg(),
2209 shiftop: ShiftOpAndAmt::new(
2210 ShiftOp::LSL,
2211 ShiftOpShiftImm::maybe_from_shift(1).unwrap(),
2212 ),
2213 });
2214 }
2215 _ => panic!("arm64 isel: VhighBits unhandled, ty = {:?}", ty),
2216 }
2217 }
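// Worked example (i8x16): if only lanes 0 and 7 have their sign bit set,
// the sshr produces 0xFF in exactly those lanes; and-ing with the constant
// 0x8040201008040201 leaves 0x01 in lane 0 and 0x80 in lane 7. The
// ext/zip1 pairing interleaves the low and high halves so that the 16-bit
// addv can sum all lane contributions without overflow, giving
// dst_r = 0x81.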
2218
2219 Opcode::Shuffle => {
2220 let mask = const_param_to_u128(ctx, insn).expect("Invalid immediate mask bytes");
2221 let rd = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
2222 let rn = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None);
2223 let rn2 = put_input_in_reg(ctx, inputs[1], NarrowValueMode::None);
2224 // 2 register table vector lookups require consecutive table registers;
2225 // we satisfy this constraint by hardcoding the usage of v29 and v30.
2226 let temp = writable_vreg(29);
2227 let temp2 = writable_vreg(30);
2228 let input_ty = ctx.input_ty(insn, 0);
2229 assert_eq!(input_ty, ctx.input_ty(insn, 1));
2230 // Make sure that both inputs are in virtual registers, since it is
2231 // not guaranteed that we can get them safely to the temporaries if
2232 // either is in a real register.
2233 let rn = ctx.ensure_in_vreg(rn, input_ty);
2234 let rn2 = ctx.ensure_in_vreg(rn2, input_ty);
2235
2236 lower_constant_f128(ctx, rd, mask);
2237 ctx.emit(Inst::gen_move(temp, rn, input_ty));
2238 ctx.emit(Inst::gen_move(temp2, rn2, input_ty));
2239 ctx.emit(Inst::VecTbl2 {
2240 rd,
2241 rn: temp.to_reg(),
2242 rn2: temp2.to_reg(),
2243 rm: rd.to_reg(),
2244 is_extension: false,
2245 });
2246 }
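// Note on the tbl2 lookup: each byte of the mask (placed in rd) selects a
// byte from the concatenated 32-byte {v29, v30} table, and any index of 32
// or more yields 0. For example, a mask of the bytes 0x10..0x1f selects the
// second input verbatim.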
2247
2248 Opcode::Swizzle => {
2249 let rd = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
2250 let rm = put_input_in_reg(ctx, inputs[1], NarrowValueMode::None);
2251 let rn = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None);
2252
2253 ctx.emit(Inst::VecTbl {
2254 rd,
2255 rn,
2256 rm,
2257 is_extension: false,
2258 });
2259 }
2260
2261 Opcode::Vsplit | Opcode::Vconcat => {
2262 // TODO
2263 panic!("Vector ops not implemented.");
2264 }
2265
2266 Opcode::Isplit | Opcode::Iconcat => panic!("128-bit integer ops not supported."),
2267
2268 Opcode::Imax | Opcode::Umax | Opcode::Umin | Opcode::Imin => {
2269 let alu_op = match op {
2270 Opcode::Umin => VecALUOp::Umin,
2271 Opcode::Imin => VecALUOp::Smin,
2272 Opcode::Umax => VecALUOp::Umax,
2273 Opcode::Imax => VecALUOp::Smax,
2274 _ => unreachable!(),
2275 };
2276 let rd = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
2277 let rn = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None);
2278 let rm = put_input_in_reg(ctx, inputs[1], NarrowValueMode::None);
2279 let ty = ty.unwrap();
2280 ctx.emit(Inst::VecRRR {
2281 alu_op,
2282 rd,
2283 rn,
2284 rm,
2285 size: VectorSize::from_ty(ty),
2286 });
2287 }
2288
2289 Opcode::WideningPairwiseDotProductS => {
2290 let r_y = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
2291 let r_a = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None);
2292 let r_b = put_input_in_reg(ctx, inputs[1], NarrowValueMode::None);
2293 let ty = ty.unwrap();
2294 if ty == I32X4 {
2295 let tmp = ctx.alloc_tmp(I8X16).only_reg().unwrap();
2296 // The args have type I16X8.
2297 // "y = i32x4.dot_i16x8_s(a, b)"
2298 // => smull tmp, a, b
2299 // smull2 y, a, b
2300 // addp y, tmp, y
2301 ctx.emit(Inst::VecRRR {
2302 alu_op: VecALUOp::Smull,
2303 rd: tmp,
2304 rn: r_a,
2305 rm: r_b,
2306 size: VectorSize::Size16x8,
2307 });
2308 ctx.emit(Inst::VecRRR {
2309 alu_op: VecALUOp::Smull2,
2310 rd: r_y,
2311 rn: r_a,
2312 rm: r_b,
2313 size: VectorSize::Size16x8,
2314 });
2315 ctx.emit(Inst::VecRRR {
2316 alu_op: VecALUOp::Addp,
2317 rd: r_y,
2318 rn: tmp.to_reg(),
2319 rm: r_y.to_reg(),
2320 size: VectorSize::Size32x4,
2321 });
2322 } else {
2323 return Err(CodegenError::Unsupported(format!(
2324 "Opcode::WideningPairwiseDotProductS: unsupported laneage: {:?}",
2325 ty
2326 )));
2327 }
2328 }
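// Worked example: with a = [1, 2, 3, 4, ...] and b = [10, 20, 30, 40, ...]
// (i16x8), smull computes the low products [10, 40, 90, 160] widened to
// i32 and smull2 the high products; addp then sums adjacent pairs, so
// y[0] = 1*10 + 2*20 = 50 and y[1] = 3*30 + 4*40 = 250, as required by
// i32x4.dot_i16x8_s.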
2329
2330 Opcode::Fadd | Opcode::Fsub | Opcode::Fmul | Opcode::Fdiv | Opcode::Fmin | Opcode::Fmax => {
2331 let ty = ty.unwrap();
2332 let bits = ty_bits(ty);
2333 let rn = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None);
2334 let rm = put_input_in_reg(ctx, inputs[1], NarrowValueMode::None);
2335 let rd = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
2336 if !ty.is_vector() {
2337 let fpu_op = match (op, bits) {
2338 (Opcode::Fadd, 32) => FPUOp2::Add32,
2339 (Opcode::Fadd, 64) => FPUOp2::Add64,
2340 (Opcode::Fsub, 32) => FPUOp2::Sub32,
2341 (Opcode::Fsub, 64) => FPUOp2::Sub64,
2342 (Opcode::Fmul, 32) => FPUOp2::Mul32,
2343 (Opcode::Fmul, 64) => FPUOp2::Mul64,
2344 (Opcode::Fdiv, 32) => FPUOp2::Div32,
2345 (Opcode::Fdiv, 64) => FPUOp2::Div64,
2346 (Opcode::Fmin, 32) => FPUOp2::Min32,
2347 (Opcode::Fmin, 64) => FPUOp2::Min64,
2348 (Opcode::Fmax, 32) => FPUOp2::Max32,
2349 (Opcode::Fmax, 64) => FPUOp2::Max64,
2350 _ => panic!("Unknown op/bits combination"),
2351 };
2352 ctx.emit(Inst::FpuRRR { fpu_op, rd, rn, rm });
2353 } else {
2354 let alu_op = match op {
2355 Opcode::Fadd => VecALUOp::Fadd,
2356 Opcode::Fsub => VecALUOp::Fsub,
2357 Opcode::Fdiv => VecALUOp::Fdiv,
2358 Opcode::Fmax => VecALUOp::Fmax,
2359 Opcode::Fmin => VecALUOp::Fmin,
2360 Opcode::Fmul => VecALUOp::Fmul,
2361 _ => unreachable!(),
2362 };
2363
2364 ctx.emit(Inst::VecRRR {
2365 rd,
2366 rn,
2367 rm,
2368 alu_op,
2369 size: VectorSize::from_ty(ty),
2370 });
2371 }
2372 }
2373
2374 Opcode::FminPseudo | Opcode::FmaxPseudo => {
2375 let ty = ctx.input_ty(insn, 0);
2376 if ty == F32X4 || ty == F64X2 {
2377 // pmin(a,b) => bitsel(b, a, cmpgt(a, b))
2378 // pmax(a,b) => bitsel(b, a, cmpgt(b, a))
2379 let r_dst = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
2380 let r_a = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None);
2381 let r_b = put_input_in_reg(ctx, inputs[1], NarrowValueMode::None);
2382 // Since we're going to write the output register `r_dst` anyway, we might as
2383 // well first use it to hold the comparison result. This has the slightly unusual
2384 // effect that we modify the output register in the first instruction (`fcmgt`)
2385 // but read both the inputs again in the second instruction (`bsl`), which means
2386 // that the output register can't be either of the input registers. Regalloc
2387 // should handle this correctly, nevertheless.
2388 ctx.emit(Inst::VecRRR {
2389 alu_op: VecALUOp::Fcmgt,
2390 rd: r_dst,
2391 rn: if op == Opcode::FminPseudo { r_a } else { r_b },
2392 rm: if op == Opcode::FminPseudo { r_b } else { r_a },
2393 size: if ty == F32X4 {
2394 VectorSize::Size32x4
2395 } else {
2396 VectorSize::Size64x2
2397 },
2398 });
2399 ctx.emit(Inst::VecRRR {
2400 alu_op: VecALUOp::Bsl,
2401 rd: r_dst,
2402 rn: r_b,
2403 rm: r_a,
2404 size: VectorSize::Size8x16,
2405 });
2406 } else {
2407 panic!("Opcode::FminPseudo | Opcode::FmaxPseudo: unhandled type");
2408 }
2409 }
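// Worked example: pmin(NaN, 1.0) evaluates cmpgt(NaN, 1.0), which is false
// (an all-zeros mask), so the bsl keeps the first operand and the result is
// NaN; this matches the wasm pseudo-min definition b < a ? b : a, which
// likewise yields the first operand when the comparison is false.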
2410
2411 Opcode::Sqrt | Opcode::Fneg | Opcode::Fabs | Opcode::Fpromote | Opcode::Fdemote => {
2412 let ty = ty.unwrap();
2413 let bits = ty_bits(ty);
2414 let rn = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None);
2415 let rd = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
2416 if !ty.is_vector() {
2417 let fpu_op = match (op, bits) {
2418 (Opcode::Sqrt, 32) => FPUOp1::Sqrt32,
2419 (Opcode::Sqrt, 64) => FPUOp1::Sqrt64,
2420 (Opcode::Fneg, 32) => FPUOp1::Neg32,
2421 (Opcode::Fneg, 64) => FPUOp1::Neg64,
2422 (Opcode::Fabs, 32) => FPUOp1::Abs32,
2423 (Opcode::Fabs, 64) => FPUOp1::Abs64,
2424 (Opcode::Fpromote, 32) => panic!("Cannot promote to 32 bits"),
2425 (Opcode::Fpromote, 64) => FPUOp1::Cvt32To64,
2426 (Opcode::Fdemote, 32) => FPUOp1::Cvt64To32,
2427 (Opcode::Fdemote, 64) => panic!("Cannot demote to 64 bits"),
2428 _ => panic!("Unknown op/bits combination"),
2429 };
2430 ctx.emit(Inst::FpuRR { fpu_op, rd, rn });
2431 } else {
2432 let op = match op {
2433 Opcode::Fabs => VecMisc2::Fabs,
2434 Opcode::Fneg => VecMisc2::Fneg,
2435 Opcode::Sqrt => VecMisc2::Fsqrt,
2436 _ => unimplemented!(),
2437 };
2438
2439 ctx.emit(Inst::VecMisc {
2440 op,
2441 rd,
2442 rn,
2443 size: VectorSize::from_ty(ty),
2444 });
2445 }
2446 }
2447
2448 Opcode::Ceil | Opcode::Floor | Opcode::Trunc | Opcode::Nearest => {
2449 let ty = ctx.output_ty(insn, 0);
2450 if !ty.is_vector() {
2451 let bits = ty_bits(ty);
2452 let op = match (op, bits) {
2453 (Opcode::Ceil, 32) => FpuRoundMode::Plus32,
2454 (Opcode::Ceil, 64) => FpuRoundMode::Plus64,
2455 (Opcode::Floor, 32) => FpuRoundMode::Minus32,
2456 (Opcode::Floor, 64) => FpuRoundMode::Minus64,
2457 (Opcode::Trunc, 32) => FpuRoundMode::Zero32,
2458 (Opcode::Trunc, 64) => FpuRoundMode::Zero64,
2459 (Opcode::Nearest, 32) => FpuRoundMode::Nearest32,
2460 (Opcode::Nearest, 64) => FpuRoundMode::Nearest64,
2461 _ => panic!("Unknown op/bits combination (scalar)"),
2462 };
2463 let rn = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None);
2464 let rd = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
2465 ctx.emit(Inst::FpuRound { op, rd, rn });
2466 } else {
2467 let (op, size) = match (op, ty) {
2468 (Opcode::Ceil, F32X4) => (VecMisc2::Frintp, VectorSize::Size32x4),
2469 (Opcode::Ceil, F64X2) => (VecMisc2::Frintp, VectorSize::Size64x2),
2470 (Opcode::Floor, F32X4) => (VecMisc2::Frintm, VectorSize::Size32x4),
2471 (Opcode::Floor, F64X2) => (VecMisc2::Frintm, VectorSize::Size64x2),
2472 (Opcode::Trunc, F32X4) => (VecMisc2::Frintz, VectorSize::Size32x4),
2473 (Opcode::Trunc, F64X2) => (VecMisc2::Frintz, VectorSize::Size64x2),
2474 (Opcode::Nearest, F32X4) => (VecMisc2::Frintn, VectorSize::Size32x4),
2475 (Opcode::Nearest, F64X2) => (VecMisc2::Frintn, VectorSize::Size64x2),
2476 _ => panic!("Unknown op/ty combination (vector){:?}", ty),
2477 };
2478 let rn = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None);
2479 let rd = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
2480 ctx.emit(Inst::VecMisc { op, rd, rn, size });
2481 }
2482 }
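// Sketch: `ceil` on an f64 scalar emits `frintp d0, d1`, while on an f32x4
// vector it emits `frintp vd.4s, vn.4s`; the other rounding modes map to
// frintm (floor), frintz (trunc), and frintn (nearest-even) analogously.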
2483
2484 Opcode::Fma => {
2485 let bits = ty_bits(ctx.output_ty(insn, 0));
2486 let fpu_op = match bits {
2487 32 => FPUOp3::MAdd32,
2488 64 => FPUOp3::MAdd64,
2489 _ => panic!("Unknown op size"),
2490 };
2491 let rn = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None);
2492 let rm = put_input_in_reg(ctx, inputs[1], NarrowValueMode::None);
2493 let ra = put_input_in_reg(ctx, inputs[2], NarrowValueMode::None);
2494 let rd = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
2495 ctx.emit(Inst::FpuRRRR {
2496 fpu_op,
2497 rn,
2498 rm,
2499 ra,
2500 rd,
2501 });
2502 }
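// Sketch: fma(x, y, z) becomes a single `fmadd d0, d1, d2, d3`-style
// instruction, computing rn * rm + ra with one rounding step.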
2503
2504 Opcode::Fcopysign => {
2505 // Copy the sign bit from inputs[1] to inputs[0]. This is a scalar
2506 // Fcopysign: it uses scalar NEON operations for 64-bit values and
2507 // vector operations (2S arrangement) for 32-bit values; in the latter
2508 // case the operations still zero all bits except the lowest 32.
2509 // The sequence used is:
2510 //
2511 // mov vd, vn
2512 // ushr vtmp, vm, #63 / #31
2513 // sli vd, vtmp, #63 / #31
2514
2515 let ty = ctx.output_ty(insn, 0);
2516 let bits = ty_bits(ty) as u8;
2517 assert!(bits == 32 || bits == 64);
2518 let rn = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None);
2519 let rm = put_input_in_reg(ctx, inputs[1], NarrowValueMode::None);
2520 let rd = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
2521 let tmp = ctx.alloc_tmp(F64).only_reg().unwrap();
2522
2523 // Copy LHS to rd.
2524 ctx.emit(Inst::gen_move(rd, rn, ty));
2525
2526 // Copy the sign bit to the lowest bit in tmp.
2527 let imm = FPURightShiftImm::maybe_from_u8(bits - 1, bits).unwrap();
2528 ctx.emit(Inst::FpuRRI {
2529 fpu_op: choose_32_64(ty, FPUOpRI::UShr32(imm), FPUOpRI::UShr64(imm)),
2530 rd: tmp,
2531 rn: rm,
2532 });
2533
2534 // Insert the bit from tmp into the sign bit of rd.
2535 let imm = FPULeftShiftImm::maybe_from_u8(bits - 1, bits).unwrap();
2536 ctx.emit(Inst::FpuRRI {
2537 fpu_op: choose_32_64(ty, FPUOpRI::Sli32(imm), FPUOpRI::Sli64(imm)),
2538 rd,
2539 rn: tmp.to_reg(),
2540 });
2541 }
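// Worked example (f32): fcopysign(1.5, -2.0) moves 1.5 into rd, then
// `ushr vtmp.2s, vm.2s, #31` leaves the sign bit of -2.0 in bit 0 of tmp,
// and `sli vd.2s, vtmp.2s, #31` shifts it back up and inserts it without
// disturbing the lower bits, yielding -1.5.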
2542
2543 Opcode::FcvtToUint | Opcode::FcvtToSint => {
2544 let in_bits = ty_bits(ctx.input_ty(insn, 0));
2545 let out_bits = ty_bits(ctx.output_ty(insn, 0));
2546 let signed = op == Opcode::FcvtToSint;
2547 let op = match (signed, in_bits, out_bits) {
2548 (false, 32, 8) | (false, 32, 16) | (false, 32, 32) => FpuToIntOp::F32ToU32,
2549 (true, 32, 8) | (true, 32, 16) | (true, 32, 32) => FpuToIntOp::F32ToI32,
2550 (false, 32, 64) => FpuToIntOp::F32ToU64,
2551 (true, 32, 64) => FpuToIntOp::F32ToI64,
2552 (false, 64, 8) | (false, 64, 16) | (false, 64, 32) => FpuToIntOp::F64ToU32,
2553 (true, 64, 8) | (true, 64, 16) | (true, 64, 32) => FpuToIntOp::F64ToI32,
2554 (false, 64, 64) => FpuToIntOp::F64ToU64,
2555 (true, 64, 64) => FpuToIntOp::F64ToI64,
2556 _ => panic!("Unknown input/output-bits combination"),
2557 };
2558
2559 let rn = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None);
2560 let rd = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
2561
2562 // First, check for NaN: it's important to perform the NaN check before the
2563 // bounds checks, per wasm semantics.
2564
2565 // Check that the input is not a NaN.
2566 if in_bits == 32 {
2567 ctx.emit(Inst::FpuCmp32 { rn, rm: rn });
2568 } else {
2569 ctx.emit(Inst::FpuCmp64 { rn, rm: rn });
2570 }
2571 let trap_code = TrapCode::BadConversionToInteger;
2572 ctx.emit(Inst::TrapIf {
2573 trap_code,
2574 kind: CondBrKind::Cond(lower_fp_condcode(FloatCC::Unordered)),
2575 });
2576
2577 let tmp = ctx.alloc_tmp(I8X16).only_reg().unwrap();
2578
2579 // Check that the input is in range, with "truncate towards zero" semantics. This means
2580 // we allow values that are slightly out of range:
2581 // - for signed conversions, we allow values strictly greater than INT_MIN-1 (when this
2582 // can be represented), and strictly less than INT_MAX+1 (when this can be
2583 // represented).
2584 // - for unsigned conversions, we allow values strictly greater than -1, and strictly
2585 // less than UINT_MAX+1 (when this can be represented).
2586
2587 if in_bits == 32 {
2588 // From float32.
2589 let (low_bound, low_cond, high_bound) = match (signed, out_bits) {
2590 (true, 8) => (
2591 i8::min_value() as f32 - 1.,
2592 FloatCC::GreaterThan,
2593 i8::max_value() as f32 + 1.,
2594 ),
2595 (true, 16) => (
2596 i16::min_value() as f32 - 1.,
2597 FloatCC::GreaterThan,
2598 i16::max_value() as f32 + 1.,
2599 ),
2600 (true, 32) => (
2601 i32::min_value() as f32, // I32_MIN - 1 isn't precisely representable as an f32.
2602 FloatCC::GreaterThanOrEqual,
2603 i32::max_value() as f32 + 1.,
2604 ),
2605 (true, 64) => (
2606 i64::min_value() as f32, // I64_MIN - 1 isn't precisely representable as an f32.
2607 FloatCC::GreaterThanOrEqual,
2608 i64::max_value() as f32 + 1.,
2609 ),
2610 (false, 8) => (-1., FloatCC::GreaterThan, u8::max_value() as f32 + 1.),
2611 (false, 16) => (-1., FloatCC::GreaterThan, u16::max_value() as f32 + 1.),
2612 (false, 32) => (-1., FloatCC::GreaterThan, u32::max_value() as f32 + 1.),
2613 (false, 64) => (-1., FloatCC::GreaterThan, u64::max_value() as f32 + 1.),
2614 _ => panic!("Unknown input/output-bits combination"),
2615 };
2616
2617 // >= low_bound
2618 lower_constant_f32(ctx, tmp, low_bound);
2619 ctx.emit(Inst::FpuCmp32 {
2620 rn,
2621 rm: tmp.to_reg(),
2622 });
2623 let trap_code = TrapCode::IntegerOverflow;
2624 ctx.emit(Inst::TrapIf {
2625 trap_code,
2626 kind: CondBrKind::Cond(lower_fp_condcode(low_cond).invert()),
2627 });
2628
2629 // <= high_bound
2630 lower_constant_f32(ctx, tmp, high_bound);
2631 ctx.emit(Inst::FpuCmp32 {
2632 rn,
2633 rm: tmp.to_reg(),
2634 });
2635 let trap_code = TrapCode::IntegerOverflow;
2636 ctx.emit(Inst::TrapIf {
2637 trap_code,
2638 kind: CondBrKind::Cond(lower_fp_condcode(FloatCC::LessThan).invert()),
2639 });
2640 } else {
2641 // From float64.
2642 let (low_bound, low_cond, high_bound) = match (signed, out_bits) {
2643 (true, 8) => (
2644 i8::min_value() as f64 - 1.,
2645 FloatCC::GreaterThan,
2646 i8::max_value() as f64 + 1.,
2647 ),
2648 (true, 16) => (
2649 i16::min_value() as f64 - 1.,
2650 FloatCC::GreaterThan,
2651 i16::max_value() as f64 + 1.,
2652 ),
2653 (true, 32) => (
2654 i32::min_value() as f64 - 1.,
2655 FloatCC::GreaterThan,
2656 i32::max_value() as f64 + 1.,
2657 ),
2658 (true, 64) => (
2659 i64::min_value() as f64, // I64_MIN - 1 is not precisely representable as an f64.
2660 FloatCC::GreaterThanOrEqual,
2661 i64::max_value() as f64 + 1.,
2662 ),
2663 (false, 8) => (-1., FloatCC::GreaterThan, u8::max_value() as f64 + 1.),
2664 (false, 16) => (-1., FloatCC::GreaterThan, u16::max_value() as f64 + 1.),
2665 (false, 32) => (-1., FloatCC::GreaterThan, u32::max_value() as f64 + 1.),
2666 (false, 64) => (-1., FloatCC::GreaterThan, u64::max_value() as f64 + 1.),
2667 _ => panic!("Unknown input/output-bits combination"),
2668 };
2669
2670 // >= low_bound
2671 lower_constant_f64(ctx, tmp, low_bound);
2672 ctx.emit(Inst::FpuCmp64 {
2673 rn,
2674 rm: tmp.to_reg(),
2675 });
2676 let trap_code = TrapCode::IntegerOverflow;
2677 ctx.emit(Inst::TrapIf {
2678 trap_code,
2679 kind: CondBrKind::Cond(lower_fp_condcode(low_cond).invert()),
2680 });
2681
2682 // <= high_bound
2683 lower_constant_f64(ctx, tmp, high_bound);
2684 ctx.emit(Inst::FpuCmp64 {
2685 rn,
2686 rm: tmp.to_reg(),
2687 });
2688 let trap_code = TrapCode::IntegerOverflow;
2689 ctx.emit(Inst::TrapIf {
2690 trap_code,
2691 kind: CondBrKind::Cond(lower_fp_condcode(FloatCC::LessThan).invert()),
2692 });
2693 };
2694
2695 // Do the conversion.
2696 ctx.emit(Inst::FpuToInt { op, rd, rn });
2697 }
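// Worked example: for an f64 -> u32 conversion the accepted open interval
// is (-1.0, 4294967296.0): an input of 4294967295.5 passes both bounds
// checks and truncates to 4294967295, and -0.5 truncates to 0; a NaN input
// traps with BadConversionToInteger before the bounds checks run.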
2698
2699 Opcode::FcvtFromUint | Opcode::FcvtFromSint => {
2700 let ty = ty.unwrap();
2701 let signed = op == Opcode::FcvtFromSint;
2702 let rd = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
2703
2704 if ty.is_vector() {
2705 let op = if signed {
2706 VecMisc2::Scvtf
2707 } else {
2708 VecMisc2::Ucvtf
2709 };
2710 let rn = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None);
2711
2712 ctx.emit(Inst::VecMisc {
2713 op,
2714 rd,
2715 rn,
2716 size: VectorSize::from_ty(ty),
2717 });
2718 } else {
2719 let in_bits = ty_bits(ctx.input_ty(insn, 0));
2720 let out_bits = ty_bits(ty);
2721 let op = match (signed, in_bits, out_bits) {
2722 (false, 8, 32) | (false, 16, 32) | (false, 32, 32) => IntToFpuOp::U32ToF32,
2723 (true, 8, 32) | (true, 16, 32) | (true, 32, 32) => IntToFpuOp::I32ToF32,
2724 (false, 8, 64) | (false, 16, 64) | (false, 32, 64) => IntToFpuOp::U32ToF64,
2725 (true, 8, 64) | (true, 16, 64) | (true, 32, 64) => IntToFpuOp::I32ToF64,
2726 (false, 64, 32) => IntToFpuOp::U64ToF32,
2727 (true, 64, 32) => IntToFpuOp::I64ToF32,
2728 (false, 64, 64) => IntToFpuOp::U64ToF64,
2729 (true, 64, 64) => IntToFpuOp::I64ToF64,
2730 _ => panic!("Unknown input/output-bits combination"),
2731 };
2732 let narrow_mode = match (signed, in_bits) {
2733 (false, 8) | (false, 16) | (false, 32) => NarrowValueMode::ZeroExtend32,
2734 (true, 8) | (true, 16) | (true, 32) => NarrowValueMode::SignExtend32,
2735 (false, 64) => NarrowValueMode::ZeroExtend64,
2736 (true, 64) => NarrowValueMode::SignExtend64,
2737 _ => panic!("Unknown input size"),
2738 };
2739 let rn = put_input_in_reg(ctx, inputs[0], narrow_mode);
2740 ctx.emit(Inst::IntToFpu { op, rd, rn });
2741 }
2742 }
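// Sketch of the scalar path: a u16 -> f32 conversion zero-extends the input
// to 32 bits first (NarrowValueMode::ZeroExtend32) and then emits a single
// `ucvtf s0, w0`-style instruction, so no separate masking is needed.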
2743
2744 Opcode::FcvtToUintSat | Opcode::FcvtToSintSat => {
2745 let ty = ty.unwrap();
2746 let out_signed = op == Opcode::FcvtToSintSat;
2747 let rn = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None);
2748 let rd = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
2749
2750 if ty.is_vector() {
2751 let op = if out_signed {
2752 VecMisc2::Fcvtzs
2753 } else {
2754 VecMisc2::Fcvtzu
2755 };
2756
2757 ctx.emit(Inst::VecMisc {
2758 op,
2759 rd,
2760 rn,
2761 size: VectorSize::from_ty(ty),
2762 });
2763 } else {
2764 let in_ty = ctx.input_ty(insn, 0);
2765 let in_bits = ty_bits(in_ty);
2766 let out_bits = ty_bits(ty);
2767 // FIMM Vtmp1, u32::MAX or u64::MAX or i32::MAX or i64::MAX
2768 // FMIN Vtmp2, Vin, Vtmp1
2769 // FIMM Vtmp1, 0 or 0 or i32::MIN or i64::MIN
2770 // FMAX Vtmp2, Vtmp2, Vtmp1
2771 // (if signed) FIMM Vtmp1, 0
2772 // FCMP Vin, Vin
2773 // FCSEL Vtmp2, Vtmp1, Vtmp2, NE // on NaN, select 0
2774 // convert Rout, Vtmp2
2775
2776 assert!(in_bits == 32 || in_bits == 64);
2777 assert!(out_bits == 32 || out_bits == 64);
2778
2779 let min: f64 = match (out_bits, out_signed) {
2780 (32, true) => std::i32::MIN as f64,
2781 (32, false) => 0.0,
2782 (64, true) => std::i64::MIN as f64,
2783 (64, false) => 0.0,
2784 _ => unreachable!(),
2785 };
2786
2787 let max = match (out_bits, out_signed) {
2788 (32, true) => std::i32::MAX as f64,
2789 (32, false) => std::u32::MAX as f64,
2790 (64, true) => std::i64::MAX as f64,
2791 (64, false) => std::u64::MAX as f64,
2792 _ => unreachable!(),
2793 };
2794
2795 let rtmp1 = ctx.alloc_tmp(in_ty).only_reg().unwrap();
2796 let rtmp2 = ctx.alloc_tmp(in_ty).only_reg().unwrap();
2797
2798 if in_bits == 32 {
2799 lower_constant_f32(ctx, rtmp1, max as f32);
2800 } else {
2801 lower_constant_f64(ctx, rtmp1, max);
2802 }
2803 ctx.emit(Inst::FpuRRR {
2804 fpu_op: choose_32_64(in_ty, FPUOp2::Min32, FPUOp2::Min64),
2805 rd: rtmp2,
2806 rn,
2807 rm: rtmp1.to_reg(),
2808 });
2809 if in_bits == 32 {
2810 lower_constant_f32(ctx, rtmp1, min as f32);
2811 } else {
2812 lower_constant_f64(ctx, rtmp1, min);
2813 }
2814 ctx.emit(Inst::FpuRRR {
2815 fpu_op: choose_32_64(in_ty, FPUOp2::Max32, FPUOp2::Max64),
2816 rd: rtmp2,
2817 rn: rtmp2.to_reg(),
2818 rm: rtmp1.to_reg(),
2819 });
2820 if out_signed {
2821 if in_bits == 32 {
2822 lower_constant_f32(ctx, rtmp1, 0.0);
2823 } else {
2824 lower_constant_f64(ctx, rtmp1, 0.0);
2825 }
2826 }
2827 if in_bits == 32 {
2828 ctx.emit(Inst::FpuCmp32 { rn, rm: rn });
2829 ctx.emit(Inst::FpuCSel32 {
2830 rd: rtmp2,
2831 rn: rtmp1.to_reg(),
2832 rm: rtmp2.to_reg(),
2833 cond: Cond::Ne,
2834 });
2835 } else {
2836 ctx.emit(Inst::FpuCmp64 { rn, rm: rn });
2837 ctx.emit(Inst::FpuCSel64 {
2838 rd: rtmp2,
2839 rn: rtmp1.to_reg(),
2840 rm: rtmp2.to_reg(),
2841 cond: Cond::Ne,
2842 });
2843 }
2844
2845 let cvt = match (in_bits, out_bits, out_signed) {
2846 (32, 32, false) => FpuToIntOp::F32ToU32,
2847 (32, 32, true) => FpuToIntOp::F32ToI32,
2848 (32, 64, false) => FpuToIntOp::F32ToU64,
2849 (32, 64, true) => FpuToIntOp::F32ToI64,
2850 (64, 32, false) => FpuToIntOp::F64ToU32,
2851 (64, 32, true) => FpuToIntOp::F64ToI32,
2852 (64, 64, false) => FpuToIntOp::F64ToU64,
2853 (64, 64, true) => FpuToIntOp::F64ToI64,
2854 _ => unreachable!(),
2855 };
2856 ctx.emit(Inst::FpuToInt {
2857 op: cvt,
2858 rd,
2859 rn: rtmp2.to_reg(),
2860 });
2861 }
2862 }
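// Worked example (f32 -> i32 saturating): an input of 3e9 is first clamped
// by fmin against 2147483647.0 (which rounds to 2147483648.0 as an f32),
// then by fmax against -2147483648.0; the final fcvtzs saturates that edge
// case to 2147483647. A NaN input compares unordered with itself, so the
// fcsel substitutes 0.0 and the result is 0.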
2863
2864 Opcode::IaddIfcout => {
2865 // This is a two-output instruction that is needed for the
2866 // legalizer's explicit heap-check sequence, among other possible
2867 // uses. Its second output is a flags output only ever meant to
2868 // check for overflow using the
2869 // `backend.unsigned_add_overflow_condition()` condition.
2870 //
2871 // Note that the CLIF validation will ensure that no flag-setting
2872 // operation comes between this IaddIfcout and its use (e.g., a
2873 // Trapif). Thus, we can rely on implicit communication through the
2874 // processor flags rather than explicitly generating flags into a
2875 // register. We simply use the variant of the add instruction that
2876 // sets flags (`adds`) here.
2877
2878 // Note that the second output (the flags) need not be generated,
2879 // because flags are never materialized into a register; the only
2880 // instructions that can use a value of type `iflags` or `fflags`
2881 // will look directly for the flags-producing instruction (which can
2882 // always be found, by construction) and merge it.
2883
2884 // Now handle the iadd as above, except use an AddS opcode that sets
2885 // flags.
2886 let rd = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
2887 let rn = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None);
2888 let rm = put_input_in_rse_imm12(ctx, inputs[1], NarrowValueMode::None);
2889 let ty = ty.unwrap();
2890 let alu_op = choose_32_64(ty, ALUOp::AddS32, ALUOp::AddS64);
2891 ctx.emit(alu_inst_imm12(alu_op, rd, rn, rm));
2892 }
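// A minimal sketch of the pattern this enables (labels illustrative):
//
// adds x0, x1, #imm (iadd_ifcout: AddS sets NZCV)
// b.hs trap (trapif tests the carry / unsigned-overflow flag)
//
// This relies on nothing clobbering NZCV between the two instructions,
// which the CLIF validator guarantees.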
2893
2894 Opcode::IaddImm
2895 | Opcode::ImulImm
2896 | Opcode::UdivImm
2897 | Opcode::SdivImm
2898 | Opcode::UremImm
2899 | Opcode::SremImm
2900 | Opcode::IrsubImm
2901 | Opcode::IaddCin
2902 | Opcode::IaddIfcin
2903 | Opcode::IaddCout
2904 | Opcode::IaddCarry
2905 | Opcode::IaddIfcarry
2906 | Opcode::IsubBin
2907 | Opcode::IsubIfbin
2908 | Opcode::IsubBout
2909 | Opcode::IsubIfbout
2910 | Opcode::IsubBorrow
2911 | Opcode::IsubIfborrow
2912 | Opcode::BandImm
2913 | Opcode::BorImm
2914 | Opcode::BxorImm
2915 | Opcode::RotlImm
2916 | Opcode::RotrImm
2917 | Opcode::IshlImm
2918 | Opcode::UshrImm
2919 | Opcode::SshrImm
2920 | Opcode::IcmpImm
2921 | Opcode::IfcmpImm => {
2922 panic!("ALU+imm and ALU+carry ops should not appear here!");
2923 }
2924
2925 #[cfg(feature = "x86")]
2926 Opcode::X86Udivmodx
2927 | Opcode::X86Sdivmodx
2928 | Opcode::X86Umulx
2929 | Opcode::X86Smulx
2930 | Opcode::X86Cvtt2si
2931 | Opcode::X86Fmin
2932 | Opcode::X86Fmax
2933 | Opcode::X86Push
2934 | Opcode::X86Pop
2935 | Opcode::X86Bsr
2936 | Opcode::X86Bsf
2937 | Opcode::X86Pblendw
2938 | Opcode::X86Pshufd
2939 | Opcode::X86Pshufb
2940 | Opcode::X86Pextr
2941 | Opcode::X86Pinsr
2942 | Opcode::X86Insertps
2943 | Opcode::X86Movsd
2944 | Opcode::X86Movlhps
2945 | Opcode::X86Palignr
2946 | Opcode::X86Psll
2947 | Opcode::X86Psrl
2948 | Opcode::X86Psra
2949 | Opcode::X86Ptest
2950 | Opcode::X86Pmaxs
2951 | Opcode::X86Pmaxu
2952 | Opcode::X86Pmins
2953 | Opcode::X86Pminu
2954 | Opcode::X86Pmullq
2955 | Opcode::X86Pmuludq
2956 | Opcode::X86Punpckh
2957 | Opcode::X86Punpckl
2958 | Opcode::X86Vcvtudq2ps
2959 | Opcode::X86ElfTlsGetAddr
2960 | Opcode::X86MachoTlsGetAddr => {
2961 panic!("x86-specific opcode in supposedly arch-neutral IR!");
2962 }
2963
2964 Opcode::DummySargT => unreachable!(),
2965
2966 Opcode::Iabs => {
2967 let rd = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
2968 let rn = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None);
2969 let ty = ty.unwrap();
2970 ctx.emit(Inst::VecMisc {
2971 op: VecMisc2::Abs,
2972 rd,
2973 rn,
2974 size: VectorSize::from_ty(ty),
2975 });
2976 }
2977 Opcode::AvgRound => {
2978 let rd = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
2979 let rn = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None);
2980 let rm = put_input_in_reg(ctx, inputs[1], NarrowValueMode::None);
2981 let ty = ty.unwrap();
2982 ctx.emit(Inst::VecRRR {
2983 alu_op: VecALUOp::Urhadd,
2984 rd,
2985 rn,
2986 rm,
2987 size: VectorSize::from_ty(ty),
2988 });
2989 }
2990
2991 Opcode::Snarrow | Opcode::Unarrow => {
2992 let op = if op == Opcode::Snarrow {
2993 VecMiscNarrowOp::Sqxtn
2994 } else {
2995 VecMiscNarrowOp::Sqxtun
2996 };
2997 let rd = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
2998 let rn = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None);
2999 let rn2 = put_input_in_reg(ctx, inputs[1], NarrowValueMode::None);
3000 let ty = ty.unwrap();
3001
3002 ctx.emit(Inst::VecMiscNarrow {
3003 op,
3004 rd,
3005 rn,
3006 size: VectorSize::from_ty(ty),
3007 high_half: false,
3008 });
3009 ctx.emit(Inst::VecMiscNarrow {
3010 op,
3011 rd,
3012 rn: rn2,
3013 size: VectorSize::from_ty(ty),
3014 high_half: true,
3015 });
3016 }
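// Sketch for i16x8 -> i8x16 narrowing, as two emitted instructions:
//
// sqxtn vd.8b, vn.8h (low half, signed saturating narrow)
// sqxtn2 vd.16b, vn2.8h (high half; sqxtun/sqxtun2 for unarrow)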
3017
3018 Opcode::SwidenLow | Opcode::SwidenHigh | Opcode::UwidenLow | Opcode::UwidenHigh => {
3019 let lane_type = ty.unwrap().lane_type();
3020 let rd = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
3021 let rn = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None);
3022 let (t, high_half) = match (lane_type, op) {
3023 (I16, Opcode::SwidenLow) => (VecExtendOp::Sxtl8, false),
3024 (I16, Opcode::SwidenHigh) => (VecExtendOp::Sxtl8, true),
3025 (I16, Opcode::UwidenLow) => (VecExtendOp::Uxtl8, false),
3026 (I16, Opcode::UwidenHigh) => (VecExtendOp::Uxtl8, true),
3027 (I32, Opcode::SwidenLow) => (VecExtendOp::Sxtl16, false),
3028 (I32, Opcode::SwidenHigh) => (VecExtendOp::Sxtl16, true),
3029 (I32, Opcode::UwidenLow) => (VecExtendOp::Uxtl16, false),
3030 (I32, Opcode::UwidenHigh) => (VecExtendOp::Uxtl16, true),
3031 _ => {
3032 return Err(CodegenError::Unsupported(format!(
3033 "Unsupported SIMD vector lane type: {:?}",
3034 lane_type
3035 )));
3036 }
3037 };
3038
3039 ctx.emit(Inst::VecExtend {
3040 t,
3041 rd,
3042 rn,
3043 high_half,
3044 });
3045 }
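// Sketch: swiden_low of an i8x16 emits `sxtl vd.8h, vn.8b`, and
// swiden_high emits `sxtl2 vd.8h, vn.16b`; the unsigned variants use
// uxtl/uxtl2.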
3046
3047 Opcode::TlsValue => unimplemented!("tls_value"),
3048 Opcode::FcvtLowFromSint => unimplemented!("FcvtLowFromSint"),
3049 }
3050
3051 Ok(())
3052 }
3053
3054 pub(crate) fn lower_branch<C: LowerCtx<I = Inst>>(
3055 ctx: &mut C,
3056 branches: &[IRInst],
3057 targets: &[MachLabel],
3058 ) -> CodegenResult<()> {
3059 // A block should end with at most two branches. The first may be a
3060 // conditional branch; a conditional branch can be followed only by an
3061 // unconditional branch or fallthrough. Otherwise, if only one branch,
3062 // it may be an unconditional branch, a fallthrough, a return, or a
3063 // trap. These conditions are verified by `is_ebb_basic()` during the
3064 // verifier pass.
3065 assert!(branches.len() <= 2);
3066
3067 if branches.len() == 2 {
3068 // Must be a conditional branch followed by an unconditional branch.
3069 let op0 = ctx.data(branches[0]).opcode();
3070 let op1 = ctx.data(branches[1]).opcode();
3071
3072 assert!(op1 == Opcode::Jump || op1 == Opcode::Fallthrough);
3073 let taken = BranchTarget::Label(targets[0]);
3074 // The not_taken target is the target of the second branch, even if it is a Fallthrough
3075 // instruction: because we reorder blocks while we lower, the fallthrough in the new
3076 // order is not (necessarily) the same as the fallthrough in CLIF. So we use the
3077 // explicitly-provided target.
3078 let not_taken = BranchTarget::Label(targets[1]);
3079
3080 match op0 {
3081 Opcode::Brz | Opcode::Brnz => {
3082 let flag_input = InsnInput {
3083 insn: branches[0],
3084 input: 0,
3085 };
3086 if let Some(icmp_insn) =
3087 maybe_input_insn_via_conv(ctx, flag_input, Opcode::Icmp, Opcode::Bint)
3088 {
3089 let condcode = ctx.data(icmp_insn).cond_code().unwrap();
3090 let cond = lower_condcode(condcode);
3091 let is_signed = condcode_is_signed(condcode);
3092 let negated = op0 == Opcode::Brz;
3093 let cond = if negated { cond.invert() } else { cond };
3094
3095 lower_icmp_or_ifcmp_to_flags(ctx, icmp_insn, is_signed);
3096 ctx.emit(Inst::CondBr {
3097 taken,
3098 not_taken,
3099 kind: CondBrKind::Cond(cond),
3100 });
3101 } else if let Some(fcmp_insn) =
3102 maybe_input_insn_via_conv(ctx, flag_input, Opcode::Fcmp, Opcode::Bint)
3103 {
3104 let condcode = ctx.data(fcmp_insn).fp_cond_code().unwrap();
3105 let cond = lower_fp_condcode(condcode);
3106 let negated = op0 == Opcode::Brz;
3107 let cond = if negated { cond.invert() } else { cond };
3108
3109 lower_fcmp_or_ffcmp_to_flags(ctx, fcmp_insn);
3110 ctx.emit(Inst::CondBr {
3111 taken,
3112 not_taken,
3113 kind: CondBrKind::Cond(cond),
3114 });
3115 } else {
3116 let rt = put_input_in_reg(
3117 ctx,
3118 InsnInput {
3119 insn: branches[0],
3120 input: 0,
3121 },
3122 NarrowValueMode::ZeroExtend64,
3123 );
3124 let kind = match op0 {
3125 Opcode::Brz => CondBrKind::Zero(rt),
3126 Opcode::Brnz => CondBrKind::NotZero(rt),
3127 _ => unreachable!(),
3128 };
3129 ctx.emit(Inst::CondBr {
3130 taken,
3131 not_taken,
3132 kind,
3133 });
3134 }
3135 }
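// Sketch of the three shapes this arm produces for `brz v` (labels
// illustrative; the CondBr carries both targets):
//
// cmp ..., ...; b.<inverted cc> taken; b not_taken (v from icmp/fcmp via bint)
// cbz x0, taken; b not_taken (plain value, brz)
// cbnz x0, taken; b not_taken (plain value, brnz)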
3136 Opcode::BrIcmp => {
3137 let condcode = ctx.data(branches[0]).cond_code().unwrap();
3138 let cond = lower_condcode(condcode);
3139 let kind = CondBrKind::Cond(cond);
3140
3141 let is_signed = condcode_is_signed(condcode);
3142 let ty = ctx.input_ty(branches[0], 0);
3143 let bits = ty_bits(ty);
3144 let narrow_mode = match (bits <= 32, is_signed) {
3145 (true, true) => NarrowValueMode::SignExtend32,
3146 (true, false) => NarrowValueMode::ZeroExtend32,
3147 (false, true) => NarrowValueMode::SignExtend64,
3148 (false, false) => NarrowValueMode::ZeroExtend64,
3149 };
3150 let rn = put_input_in_reg(
3151 ctx,
3152 InsnInput {
3153 insn: branches[0],
3154 input: 0,
3155 },
3156 narrow_mode,
3157 );
3158 let rm = put_input_in_rse_imm12(
3159 ctx,
3160 InsnInput {
3161 insn: branches[0],
3162 input: 1,
3163 },
3164 narrow_mode,
3165 );
3166
3167 let alu_op = choose_32_64(ty, ALUOp::SubS32, ALUOp::SubS64);
3168 let rd = writable_zero_reg();
3169 ctx.emit(alu_inst_imm12(alu_op, rd, rn, rm));
3170 ctx.emit(Inst::CondBr {
3171 taken,
3172 not_taken,
3173 kind,
3174 });
3175 }
3176
3177 Opcode::Brif => {
3178 let condcode = ctx.data(branches[0]).cond_code().unwrap();
3179 let cond = lower_condcode(condcode);
3180 let kind = CondBrKind::Cond(cond);
3181
3182 let is_signed = condcode_is_signed(condcode);
3183 let flag_input = InsnInput {
3184 insn: branches[0],
3185 input: 0,
3186 };
3187 if let Some(ifcmp_insn) = maybe_input_insn(ctx, flag_input, Opcode::Ifcmp) {
3188 lower_icmp_or_ifcmp_to_flags(ctx, ifcmp_insn, is_signed);
3189 ctx.emit(Inst::CondBr {
3190 taken,
3191 not_taken,
3192 kind,
3193 });
3194 } else {
3195 // If the ifcmp result is actually placed in a
3196 // register, we need to move it back into the flags.
3197 let rn = put_input_in_reg(ctx, flag_input, NarrowValueMode::None);
3198 ctx.emit(Inst::MovToNZCV { rn });
3199 ctx.emit(Inst::CondBr {
3200 taken,
3201 not_taken,
3202 kind,
3203 });
3204 }
3205 }
3206
3207 Opcode::Brff => {
3208 let condcode = ctx.data(branches[0]).fp_cond_code().unwrap();
3209 let cond = lower_fp_condcode(condcode);
3210 let kind = CondBrKind::Cond(cond);
3211 let flag_input = InsnInput {
3212 insn: branches[0],
3213 input: 0,
3214 };
3215 if let Some(ffcmp_insn) = maybe_input_insn(ctx, flag_input, Opcode::Ffcmp) {
3216 lower_fcmp_or_ffcmp_to_flags(ctx, ffcmp_insn);
3217 ctx.emit(Inst::CondBr {
3218 taken,
3219 not_taken,
3220 kind,
3221 });
3222 } else {
3223 // If the ffcmp result is actually placed in a
3224 // register, we need to move it back into the flags.
3225 let rn = put_input_in_reg(ctx, flag_input, NarrowValueMode::None);
3226 ctx.emit(Inst::MovToNZCV { rn });
3227 ctx.emit(Inst::CondBr {
3228 taken,
3229 not_taken,
3230 kind,
3231 });
3232 }
3233 }
3234
3235 _ => unimplemented!(),
3236 }
3237 } else {
3238 // Must be an unconditional branch or an indirect branch.
3239 let op = ctx.data(branches[0]).opcode();
3240 match op {
3241 Opcode::Jump | Opcode::Fallthrough => {
3242 assert!(branches.len() == 1);
3243 // In the Fallthrough case, the machine-independent driver
3244 // fills in `targets[0]` with our fallthrough block, so this
3245 // is valid for both Jump and Fallthrough.
3246 ctx.emit(Inst::Jump {
3247 dest: BranchTarget::Label(targets[0]),
3248 });
3249 }
3250
3251 Opcode::BrTable => {
3252 // Expand `br_table index, default, JT` to:
3253 //
3254 // emit_island // this forces an island at this point
3255 // // if the jumptable would push us past
3256 // // the deadline
3257 // subs idx, #jt_size
3258 // b.hs default
3259 // adr vTmp1, PC+16
3260 // ldr vTmp2, [vTmp1, idx, lsl #2]
3261 // add vTmp2, vTmp2, vTmp1
3262 // br vTmp2
3263 // [jumptable offsets relative to JT base]
3264 let jt_size = targets.len() - 1;
3265 assert!(jt_size <= std::u32::MAX as usize);
3266
3267 ctx.emit(Inst::EmitIsland {
3268 needed_space: 4 * (6 + jt_size) as CodeOffset,
3269 });
3270
3271 let ridx = put_input_in_reg(
3272 ctx,
3273 InsnInput {
3274 insn: branches[0],
3275 input: 0,
3276 },
3277 NarrowValueMode::ZeroExtend32,
3278 );
3279
3280 let rtmp1 = ctx.alloc_tmp(I32).only_reg().unwrap();
3281 let rtmp2 = ctx.alloc_tmp(I32).only_reg().unwrap();
3282
3283 // Bounds-check, leaving condition codes for JTSequence's
3284 // branch to default target below.
3285 if let Some(imm12) = Imm12::maybe_from_u64(jt_size as u64) {
3286 ctx.emit(Inst::AluRRImm12 {
3287 alu_op: ALUOp::SubS32,
3288 rd: writable_zero_reg(),
3289 rn: ridx,
3290 imm12,
3291 });
3292 } else {
3293 lower_constant_u64(ctx, rtmp1, jt_size as u64);
3294 ctx.emit(Inst::AluRRR {
3295 alu_op: ALUOp::SubS32,
3296 rd: writable_zero_reg(),
3297 rn: ridx,
3298 rm: rtmp1.to_reg(),
3299 });
3300 }
3301
3302 // Emit the compound instruction that does:
3303 //
3304 // b.hs default
3305 // adr rA, jt
3306 // ldrsw rB, [rA, rIndex, UXTW 2]
3307 // add rA, rA, rB
3308 // br rA
3309 // [jt entries]
3310 //
3311 // This must be *one* instruction in the vcode because
3312 // we cannot allow regalloc to insert any spills/fills
3313 // in the middle of the sequence; otherwise, the ADR's
3314 // PC-rel offset to the jumptable would be incorrect.
3315 // (The alternative is to introduce a relocation pass
3316 // for inlined jumptables, which is much worse, IMHO.)
3317
3318 let jt_targets: Vec<BranchTarget> = targets
3319 .iter()
3320 .skip(1)
3321 .map(|bix| BranchTarget::Label(*bix))
3322 .collect();
3323 let default_target = BranchTarget::Label(targets[0]);
3324 let targets_for_term: Vec<MachLabel> = targets.to_vec();
3325 ctx.emit(Inst::JTSequence {
3326 ridx,
3327 rtmp1,
3328 rtmp2,
3329 info: Box::new(JTSequenceInfo {
3330 targets: jt_targets,
3331 default_target,
3332 targets_for_term,
3333 }),
3334 });
3335 }
3336
3337 _ => panic!("Unknown branch type!"),
3338 }
3339 }
3340
3341 Ok(())
3342 }
3343