//! Lower a single Cranelift instruction into vcode.

use crate::binemit::CodeOffset;
use crate::ir::condcodes::FloatCC;
use crate::ir::types::*;
use crate::ir::Inst as IRInst;
use crate::ir::{InstructionData, Opcode, TrapCode};
use crate::isa::aarch64::settings as aarch64_settings;
use crate::machinst::lower::*;
use crate::machinst::*;
use crate::settings::{Flags, TlsModel};
use crate::{CodegenError, CodegenResult};

use crate::isa::aarch64::abi::*;
use crate::isa::aarch64::inst::*;

use regalloc::Writable;

use alloc::boxed::Box;
use alloc::vec::Vec;
use core::convert::TryFrom;

use super::lower::*;

/// Actually codegen an instruction's results into registers.
pub(crate) fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
    ctx: &mut C,
    insn: IRInst,
    flags: &Flags,
    isa_flags: &aarch64_settings::Flags,
) -> CodegenResult<()> {
    let op = ctx.data(insn).opcode();
    let inputs = insn_inputs(ctx, insn);
    let outputs = insn_outputs(ctx, insn);
    let ty = if outputs.len() > 0 {
        Some(ctx.output_ty(insn, 0))
    } else {
        None
    };

    match op {
        Opcode::Iconst | Opcode::Bconst | Opcode::Null => {
            let value = ctx.get_constant(insn).unwrap();
            // Sign-extend the constant if necessary.
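            // For example, an I8 constant 0x80 becomes 0xffff_ffff_ffff_ff80.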
            let value = match ty.unwrap() {
                I8 => (((value as i64) << 56) >> 56) as u64,
                I16 => (((value as i64) << 48) >> 48) as u64,
                I32 => (((value as i64) << 32) >> 32) as u64,
                I64 | R64 => value,
                ty if ty.is_bool() => value,
                ty => unreachable!("Unknown type for const: {}", ty),
            };
            let rd = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
            lower_constant_u64(ctx, rd, value);
        }
        Opcode::F32const => {
            let value = f32::from_bits(ctx.get_constant(insn).unwrap() as u32);
            let rd = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
            lower_constant_f32(ctx, rd, value);
        }
        Opcode::F64const => {
            let value = f64::from_bits(ctx.get_constant(insn).unwrap());
            let rd = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
            lower_constant_f64(ctx, rd, value);
        }
        Opcode::Iadd => {
            match ty.unwrap() {
                ty if ty.is_vector() => {
                    let rd = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
                    let rm = put_input_in_reg(ctx, inputs[1], NarrowValueMode::None);
                    let rn = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None);
                    ctx.emit(Inst::VecRRR {
                        rd,
                        rn,
                        rm,
                        alu_op: VecALUOp::Add,
                        size: VectorSize::from_ty(ty),
                    });
                }
                I128 => {
                    let lhs = put_input_in_regs(ctx, inputs[0]);
                    let rhs = put_input_in_regs(ctx, inputs[1]);
                    let dst = get_output_reg(ctx, outputs[0]);
                    assert_eq!(lhs.len(), 2);
                    assert_eq!(rhs.len(), 2);
                    assert_eq!(dst.len(), 2);

                    // adds x0, x0, x2
                    // adc x1, x1, x3
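                    // ('adds' sets the carry flag from the low-half addition; 'adc' folds it into the high half.)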

                    ctx.emit(Inst::AluRRR {
                        alu_op: ALUOp::AddS64,
                        rd: dst.regs()[0],
                        rn: lhs.regs()[0],
                        rm: rhs.regs()[0],
                    });
                    ctx.emit(Inst::AluRRR {
                        alu_op: ALUOp::Adc64,
                        rd: dst.regs()[1],
                        rn: lhs.regs()[1],
                        rm: rhs.regs()[1],
                    });
                }
                ty => {
                    let rd = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
                    let mul_insn = if let Some(mul_insn) =
                        maybe_input_insn(ctx, inputs[1], Opcode::Imul)
                    {
                        Some((mul_insn, 0))
                    } else if let Some(mul_insn) = maybe_input_insn(ctx, inputs[0], Opcode::Imul) {
                        Some((mul_insn, 1))
                    } else {
                        None
                    };
                    // If possible combine mul + add into madd.
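                    // e.g. iadd(x, imul(y, z)) then becomes a single 'madd rd, y, z, x'.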
                    if let Some((insn, addend_idx)) = mul_insn {
                        let alu_op = choose_32_64(ty, ALUOp3::MAdd32, ALUOp3::MAdd64);
                        let rn_input = InsnInput { insn, input: 0 };
                        let rm_input = InsnInput { insn, input: 1 };

                        let rn = put_input_in_reg(ctx, rn_input, NarrowValueMode::None);
                        let rm = put_input_in_reg(ctx, rm_input, NarrowValueMode::None);
                        let ra = put_input_in_reg(ctx, inputs[addend_idx], NarrowValueMode::None);

                        ctx.emit(Inst::AluRRRR {
                            alu_op,
                            rd,
                            rn,
                            rm,
                            ra,
                        });
                    } else {
                        let rn = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None);
                        let (rm, negated) = put_input_in_rse_imm12_maybe_negated(
                            ctx,
                            inputs[1],
                            ty_bits(ty),
                            NarrowValueMode::None,
                        );
                        let alu_op = if !negated {
                            choose_32_64(ty, ALUOp::Add32, ALUOp::Add64)
                        } else {
                            choose_32_64(ty, ALUOp::Sub32, ALUOp::Sub64)
                        };
                        ctx.emit(alu_inst_imm12(alu_op, rd, rn, rm));
                    }
                }
            }
        }
        Opcode::Isub => {
            let ty = ty.unwrap();
            if ty == I128 {
                let lhs = put_input_in_regs(ctx, inputs[0]);
                let rhs = put_input_in_regs(ctx, inputs[1]);
                let dst = get_output_reg(ctx, outputs[0]);
                assert_eq!(lhs.len(), 2);
                assert_eq!(rhs.len(), 2);
                assert_eq!(dst.len(), 2);

                // subs x0, x0, x2
                // sbc x1, x1, x3
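                // ('subs' updates the carry flag, which 'sbc' consumes as the inverted borrow for the high half.)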

                ctx.emit(Inst::AluRRR {
                    alu_op: ALUOp::SubS64,
                    rd: dst.regs()[0],
                    rn: lhs.regs()[0],
                    rm: rhs.regs()[0],
                });
                ctx.emit(Inst::AluRRR {
                    alu_op: ALUOp::Sbc64,
                    rd: dst.regs()[1],
                    rn: lhs.regs()[1],
                    rm: rhs.regs()[1],
                });
            } else {
                let rd = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
                let rn = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None);
                if !ty.is_vector() {
                    let (rm, negated) = put_input_in_rse_imm12_maybe_negated(
                        ctx,
                        inputs[1],
                        ty_bits(ty),
                        NarrowValueMode::None,
                    );
                    let alu_op = if !negated {
                        choose_32_64(ty, ALUOp::Sub32, ALUOp::Sub64)
                    } else {
                        choose_32_64(ty, ALUOp::Add32, ALUOp::Add64)
                    };
                    ctx.emit(alu_inst_imm12(alu_op, rd, rn, rm));
                } else {
                    let rm = put_input_in_reg(ctx, inputs[1], NarrowValueMode::None);
                    ctx.emit(Inst::VecRRR {
                        rd,
                        rn,
                        rm,
                        alu_op: VecALUOp::Sub,
                        size: VectorSize::from_ty(ty),
                    });
                }
            }
        }
        Opcode::UaddSat | Opcode::SaddSat | Opcode::UsubSat | Opcode::SsubSat => {
            let ty = ty.unwrap();
            assert!(ty.is_vector());
            let rd = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
            let rn = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None);
            let rm = put_input_in_reg(ctx, inputs[1], NarrowValueMode::None);

            let alu_op = match op {
                Opcode::UaddSat => VecALUOp::Uqadd,
                Opcode::SaddSat => VecALUOp::Sqadd,
                Opcode::UsubSat => VecALUOp::Uqsub,
                Opcode::SsubSat => VecALUOp::Sqsub,
                _ => unreachable!(),
            };

            ctx.emit(Inst::VecRRR {
                rd,
                rn,
                rm,
                alu_op,
                size: VectorSize::from_ty(ty),
            });
        }

        Opcode::Ineg => {
            let rd = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
            let ty = ty.unwrap();
            if !ty.is_vector() {
                let rn = zero_reg();
                let rm = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None);
                let alu_op = choose_32_64(ty, ALUOp::Sub32, ALUOp::Sub64);
                ctx.emit(Inst::AluRRR { alu_op, rd, rn, rm });
            } else {
                let rn = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None);
                ctx.emit(Inst::VecMisc {
                    op: VecMisc2::Neg,
                    rd,
                    rn,
                    size: VectorSize::from_ty(ty),
                });
            }
        }

        Opcode::Imul => {
            let ty = ty.unwrap();
            if ty == I128 {
                let lhs = put_input_in_regs(ctx, inputs[0]);
                let rhs = put_input_in_regs(ctx, inputs[1]);
                let dst = get_output_reg(ctx, outputs[0]);
                assert_eq!(lhs.len(), 2);
                assert_eq!(rhs.len(), 2);
                assert_eq!(dst.len(), 2);

                // 128-bit mul formula:
                // dst_lo = lhs_lo * rhs_lo
                // dst_hi = umulhi(lhs_lo, rhs_lo) + (lhs_lo * rhs_hi) + (lhs_hi * rhs_lo)
                //
                // We can convert the above formula into the following:
                // umulh dst_hi, lhs_lo, rhs_lo
                // madd dst_hi, lhs_lo, rhs_hi, dst_hi
                // madd dst_hi, lhs_hi, rhs_lo, dst_hi
                // mul dst_lo, lhs_lo, rhs_lo
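                // (The lhs_hi * rhs_hi cross term is dropped: it would only contribute to bits 128 and above.)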

                ctx.emit(Inst::AluRRR {
                    alu_op: ALUOp::UMulH,
                    rd: dst.regs()[1],
                    rn: lhs.regs()[0],
                    rm: rhs.regs()[0],
                });
                ctx.emit(Inst::AluRRRR {
                    alu_op: ALUOp3::MAdd64,
                    rd: dst.regs()[1],
                    rn: lhs.regs()[0],
                    rm: rhs.regs()[1],
                    ra: dst.regs()[1].to_reg(),
                });
                ctx.emit(Inst::AluRRRR {
                    alu_op: ALUOp3::MAdd64,
                    rd: dst.regs()[1],
                    rn: lhs.regs()[1],
                    rm: rhs.regs()[0],
                    ra: dst.regs()[1].to_reg(),
                });
                ctx.emit(Inst::AluRRRR {
                    alu_op: ALUOp3::MAdd64,
                    rd: dst.regs()[0],
                    rn: lhs.regs()[0],
                    rm: rhs.regs()[0],
                    ra: zero_reg(),
                });
            } else if ty.is_vector() {
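                // First, try to pattern-match a widening multiply (smull/umull and friends).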
                for ext_op in &[
                    Opcode::SwidenLow,
                    Opcode::SwidenHigh,
                    Opcode::UwidenLow,
                    Opcode::UwidenHigh,
                ] {
                    if let Some((alu_op, rn, rm, high_half)) =
                        match_vec_long_mul(ctx, insn, *ext_op)
                    {
                        let rd = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
                        ctx.emit(Inst::VecRRRLong {
                            alu_op,
                            rd,
                            rn,
                            rm,
                            high_half,
                        });
                        return Ok(());
                    }
                }
                if ty == I64X2 {
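                    // NEON has no 64x2 element-wise multiply, so this expands to a dedicated sequence.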
                    lower_i64x2_mul(ctx, insn);
                } else {
                    let rn = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None);
                    let rm = put_input_in_reg(ctx, inputs[1], NarrowValueMode::None);
                    let rd = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
                    ctx.emit(Inst::VecRRR {
                        alu_op: VecALUOp::Mul,
                        rd,
                        rn,
                        rm,
                        size: VectorSize::from_ty(ty),
                    });
                }
            } else {
                let alu_op = choose_32_64(ty, ALUOp3::MAdd32, ALUOp3::MAdd64);
                let rn = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None);
                let rm = put_input_in_reg(ctx, inputs[1], NarrowValueMode::None);
                let rd = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
                ctx.emit(Inst::AluRRRR {
                    alu_op,
                    rd,
                    rn,
                    rm,
                    ra: zero_reg(),
                });
            }
        }

        Opcode::Umulhi | Opcode::Smulhi => {
            let rd = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
            let is_signed = op == Opcode::Smulhi;
            let input_ty = ctx.input_ty(insn, 0);
            assert!(ctx.input_ty(insn, 1) == input_ty);
            assert!(ctx.output_ty(insn, 0) == input_ty);

            match input_ty {
                I64 => {
                    let rn = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None);
                    let rm = put_input_in_reg(ctx, inputs[1], NarrowValueMode::None);
                    let alu_op = if is_signed {
                        ALUOp::SMulH
                    } else {
                        ALUOp::UMulH
                    };
                    ctx.emit(Inst::AluRRR { alu_op, rd, rn, rm });
                }
                I32 | I16 | I8 => {
                    let narrow_mode = if is_signed {
                        NarrowValueMode::SignExtend64
                    } else {
                        NarrowValueMode::ZeroExtend64
                    };
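                    // Widen both operands to 64 bits, multiply, then shift the product right by the operand width to extract its high half.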
                    let rn = put_input_in_reg(ctx, inputs[0], narrow_mode);
                    let rm = put_input_in_reg(ctx, inputs[1], narrow_mode);
                    let ra = zero_reg();
                    ctx.emit(Inst::AluRRRR {
                        alu_op: ALUOp3::MAdd64,
                        rd,
                        rn,
                        rm,
                        ra,
                    });
                    let shift_op = if is_signed {
                        ALUOp::Asr64
                    } else {
                        ALUOp::Lsr64
                    };
                    let shift_amt = match input_ty {
                        I32 => 32,
                        I16 => 16,
                        I8 => 8,
                        _ => unreachable!(),
                    };
                    ctx.emit(Inst::AluRRImmShift {
                        alu_op: shift_op,
                        rd,
                        rn: rd.to_reg(),
                        immshift: ImmShift::maybe_from_u64(shift_amt).unwrap(),
                    });
                }
                _ => {
                    panic!("Unsupported argument type for umulhi/smulhi: {}", input_ty);
                }
            }
        }

        Opcode::Udiv | Opcode::Sdiv | Opcode::Urem | Opcode::Srem => {
            let is_signed = match op {
                Opcode::Udiv | Opcode::Urem => false,
                Opcode::Sdiv | Opcode::Srem => true,
                _ => unreachable!(),
            };
            let is_rem = match op {
                Opcode::Udiv | Opcode::Sdiv => false,
                Opcode::Urem | Opcode::Srem => true,
                _ => unreachable!(),
            };
            let narrow_mode = if is_signed {
                NarrowValueMode::SignExtend64
            } else {
                NarrowValueMode::ZeroExtend64
            };
            // TODO: Add SDiv32 to implement 32-bit directly, rather
            // than extending the input.
            let div_op = if is_signed {
                ALUOp::SDiv64
            } else {
                ALUOp::UDiv64
            };

            let rd = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
            let rn = put_input_in_reg(ctx, inputs[0], narrow_mode);
            let rm = put_input_in_reg(ctx, inputs[1], narrow_mode);
            // The div instruction does not trap on divide by zero or signed overflow
            // so checks are inserted below.
            //
            // div rd, rn, rm
            ctx.emit(Inst::AluRRR {
                alu_op: div_op,
                rd,
                rn,
                rm,
            });

            if is_rem {
                // Remainder (rn % rm) is implemented as:
                //
                // tmp = rn / rm
                // rd = rn - (tmp*rm)
                //
                // Using 'rd' as the tmp, this becomes:
                //
                // div rd, rn, rm ; rd = rn / rm
                // cbnz rm, #8 ; branch over trap
                // udf ; divide by zero
                // msub rd, rd, rm, rn ; rd = rn - rd * rm

                // Check for divide by 0.
                let trap_code = TrapCode::IntegerDivisionByZero;
                ctx.emit(Inst::TrapIf {
                    trap_code,
                    kind: CondBrKind::Zero(rm),
                });

                ctx.emit(Inst::AluRRRR {
                    alu_op: ALUOp3::MSub64,
                    rd,
                    rn: rd.to_reg(),
                    rm,
                    ra: rn,
                });
            } else {
                if div_op == ALUOp::SDiv64 {
                    // cbnz rm, #8
                    // udf ; divide by zero
                    // cmn rm, 1
                    // ccmp rn, 1, #nzcv, eq
                    // b.vc #8
                    // udf ; signed overflow

                    // Check for divide by 0.
                    let trap_code = TrapCode::IntegerDivisionByZero;
                    ctx.emit(Inst::TrapIf {
                        trap_code,
                        kind: CondBrKind::Zero(rm),
                    });

                    // Check for signed overflow. The only case is min_value / -1.
                    let ty = ty.unwrap();
                    // The following checks must be done in 32-bit or 64-bit, depending
                    // on the input type, even though the initial div instruction is
                    // currently always done in 64-bit.
                    let size = OperandSize::from_ty(ty);
                    // Check RHS is -1.
                    ctx.emit(Inst::AluRRImm12 {
                        alu_op: choose_32_64(ty, ALUOp::AddS32, ALUOp::AddS64),
                        rd: writable_zero_reg(),
                        rn: rm,
                        imm12: Imm12::maybe_from_u64(1).unwrap(),
                    });
                    // Check LHS is min_value, by subtracting 1 and branching if
                    // there is overflow.
                    ctx.emit(Inst::CCmpImm {
                        size,
                        rn,
                        imm: UImm5::maybe_from_u8(1).unwrap(),
                        nzcv: NZCV::new(false, false, false, false),
                        cond: Cond::Eq,
                    });
                    let trap_code = TrapCode::IntegerOverflow;
                    ctx.emit(Inst::TrapIf {
                        trap_code,
                        kind: CondBrKind::Cond(Cond::Vs),
                    });
                } else {
                    // cbnz rm, #8
                    // udf ; divide by zero

                    // Check for divide by 0.
                    let trap_code = TrapCode::IntegerDivisionByZero;
                    ctx.emit(Inst::TrapIf {
                        trap_code,
                        kind: CondBrKind::Zero(rm),
                    });
                }
            }
        }

        Opcode::Uextend | Opcode::Sextend => {
            if op == Opcode::Uextend {
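                // A uextend of a narrow atomic load can be folded into the load itself:
                // the narrow load-acquire already zero-extends its result.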
                let inputs = ctx.get_input_as_source_or_const(inputs[0].insn, inputs[0].input);
                if let Some((atomic_load, 0)) = inputs.inst {
                    if ctx.data(atomic_load).opcode() == Opcode::AtomicLoad {
                        let output_ty = ty.unwrap();
                        assert!(output_ty == I32 || output_ty == I64);
                        let rt = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
                        emit_atomic_load(ctx, rt, atomic_load);
                        ctx.sink_inst(atomic_load);
                        return Ok(());
                    }
                }
            }
            let output_ty = ty.unwrap();
            let input_ty = ctx.input_ty(insn, 0);
            let from_bits = ty_bits(input_ty) as u8;
            let to_bits = ty_bits(output_ty) as u8;
            let to_bits = std::cmp::max(32, to_bits);
            assert!(from_bits <= to_bits);

            let signed = op == Opcode::Sextend;
            let dst = get_output_reg(ctx, outputs[0]);
            let src =
                if let Some(extract_insn) = maybe_input_insn(ctx, inputs[0], Opcode::Extractlane) {
                    put_input_in_regs(
                        ctx,
                        InsnInput {
                            insn: extract_insn,
                            input: 0,
                        },
                    )
                } else {
                    put_input_in_regs(ctx, inputs[0])
                };

            let needs_extend = from_bits < to_bits && to_bits <= 64;
            // For i128, we want to extend the lower half, except if it is already 64 bits.
            let needs_lower_extend = to_bits > 64 && from_bits < 64;
            let pass_through_lower = to_bits > 64 && !needs_lower_extend;

            if needs_extend || needs_lower_extend {
                let rn = src.regs()[0];
                let rd = dst.regs()[0];

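                // When the input comes straight out of a vector lane, umov/smov extract and extend in a single instruction.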
                if let Some(extract_insn) = maybe_input_insn(ctx, inputs[0], Opcode::Extractlane) {
                    let idx =
                        if let InstructionData::BinaryImm8 { imm, .. } = ctx.data(extract_insn) {
                            *imm
                        } else {
                            unreachable!();
                        };

                    let size = VectorSize::from_ty(ctx.input_ty(extract_insn, 0));

                    if signed {
                        let scalar_size = OperandSize::from_ty(output_ty);

                        ctx.emit(Inst::MovFromVecSigned {
                            rd,
                            rn,
                            idx,
                            size,
                            scalar_size,
                        });
                    } else {
                        ctx.emit(Inst::MovFromVec { rd, rn, idx, size });
                    }
                } else {
                    // If we reach this point, we weren't able to incorporate the extend as
                    // a register-mode on another instruction, so we have a 'None'
                    // narrow-value/extend mode here, and we emit the explicit instruction.
                    let rn = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None);
                    ctx.emit(Inst::Extend {
                        rd,
                        rn,
                        signed,
                        from_bits,
                        to_bits: std::cmp::min(64, to_bits),
                    });
                }
            } else if pass_through_lower {
                ctx.emit(Inst::gen_move(dst.regs()[0], src.regs()[0], I64));
            }

            if output_ty == I128 {
                if signed {
                    ctx.emit(Inst::AluRRImmShift {
                        alu_op: ALUOp::Asr64,
                        rd: dst.regs()[1],
                        rn: dst.regs()[0].to_reg(),
                        immshift: ImmShift::maybe_from_u64(63).unwrap(),
                    });
                } else {
                    lower_constant_u64(ctx, dst.regs()[1], 0);
                }
            }
        }

        Opcode::Bnot => {
            let out_regs = get_output_reg(ctx, outputs[0]);
            let ty = ty.unwrap();
            if ty == I128 {
                // TODO: We can merge this block with the one below once we support immlogic here
                let in_regs = put_input_in_regs(ctx, inputs[0]);
                ctx.emit(Inst::AluRRR {
                    alu_op: ALUOp::OrrNot64,
                    rd: out_regs.regs()[0],
                    rn: zero_reg(),
                    rm: in_regs.regs()[0],
                });
                ctx.emit(Inst::AluRRR {
                    alu_op: ALUOp::OrrNot64,
                    rd: out_regs.regs()[1],
                    rn: zero_reg(),
                    rm: in_regs.regs()[1],
                });
            } else if !ty.is_vector() {
                let rd = out_regs.only_reg().unwrap();
                let rm = put_input_in_rs_immlogic(ctx, inputs[0], NarrowValueMode::None);
                let alu_op = choose_32_64(ty, ALUOp::OrrNot32, ALUOp::OrrNot64);
                // NOT rd, rm ==> ORR_NOT rd, zero, rm
                ctx.emit(alu_inst_immlogic(alu_op, rd, zero_reg(), rm));
            } else {
                let rd = out_regs.only_reg().unwrap();
                let rm = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None);
                ctx.emit(Inst::VecMisc {
                    op: VecMisc2::Not,
                    rd,
                    rn: rm,
                    size: VectorSize::from_ty(ty),
                });
            }
        }

        Opcode::Band
        | Opcode::Bor
        | Opcode::Bxor
        | Opcode::BandNot
        | Opcode::BorNot
        | Opcode::BxorNot => {
            let out_regs = get_output_reg(ctx, outputs[0]);
            let ty = ty.unwrap();
            if ty == I128 {
                // TODO: Support immlogic here
                let lhs = put_input_in_regs(ctx, inputs[0]);
                let rhs = put_input_in_regs(ctx, inputs[1]);
                let alu_op = match op {
                    Opcode::Band => ALUOp::And64,
                    Opcode::Bor => ALUOp::Orr64,
                    Opcode::Bxor => ALUOp::Eor64,
                    Opcode::BandNot => ALUOp::AndNot64,
                    Opcode::BorNot => ALUOp::OrrNot64,
                    Opcode::BxorNot => ALUOp::EorNot64,
                    _ => unreachable!(),
                };

                ctx.emit(Inst::AluRRR {
                    alu_op,
                    rd: out_regs.regs()[0],
                    rn: lhs.regs()[0],
                    rm: rhs.regs()[0],
                });
                ctx.emit(Inst::AluRRR {
                    alu_op,
                    rd: out_regs.regs()[1],
                    rn: lhs.regs()[1],
                    rm: rhs.regs()[1],
                });
            } else if !ty.is_vector() {
                let rd = out_regs.only_reg().unwrap();
                let rn = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None);
                let rm = put_input_in_rs_immlogic(ctx, inputs[1], NarrowValueMode::None);
                let alu_op = match op {
                    Opcode::Band => choose_32_64(ty, ALUOp::And32, ALUOp::And64),
                    Opcode::Bor => choose_32_64(ty, ALUOp::Orr32, ALUOp::Orr64),
                    Opcode::Bxor => choose_32_64(ty, ALUOp::Eor32, ALUOp::Eor64),
                    Opcode::BandNot => choose_32_64(ty, ALUOp::AndNot32, ALUOp::AndNot64),
                    Opcode::BorNot => choose_32_64(ty, ALUOp::OrrNot32, ALUOp::OrrNot64),
                    Opcode::BxorNot => choose_32_64(ty, ALUOp::EorNot32, ALUOp::EorNot64),
                    _ => unreachable!(),
                };
                ctx.emit(alu_inst_immlogic(alu_op, rd, rn, rm));
            } else {
                let alu_op = match op {
                    Opcode::Band => VecALUOp::And,
                    Opcode::BandNot => VecALUOp::Bic,
                    Opcode::Bor => VecALUOp::Orr,
                    Opcode::Bxor => VecALUOp::Eor,
                    _ => unreachable!(),
                };

                let rn = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None);
                let rm = put_input_in_reg(ctx, inputs[1], NarrowValueMode::None);
                let rd = out_regs.only_reg().unwrap();

                ctx.emit(Inst::VecRRR {
                    alu_op,
                    rd,
                    rn,
                    rm,
                    size: VectorSize::from_ty(ty),
                });
            }
        }

        Opcode::Ishl | Opcode::Ushr | Opcode::Sshr => {
            let out_regs = get_output_reg(ctx, outputs[0]);
            let ty = ty.unwrap();
            if ty == I128 {
                let src = put_input_in_regs(ctx, inputs[0]);
                let amt = lower_shift_amt(ctx, inputs[1], ty, out_regs.regs()[0]).unwrap_reg();

                match op {
                    Opcode::Ishl => emit_shl_i128(ctx, src, out_regs, amt),
                    Opcode::Ushr => {
                        emit_shr_i128(ctx, src, out_regs, amt, /* is_signed = */ false)
                    }
                    Opcode::Sshr => {
                        emit_shr_i128(ctx, src, out_regs, amt, /* is_signed = */ true)
                    }
                    _ => unreachable!(),
                };
            } else if !ty.is_vector() {
                let rd = out_regs.only_reg().unwrap();
                let size = OperandSize::from_bits(ty_bits(ty));
                let narrow_mode = match (op, size) {
                    (Opcode::Ishl, _) => NarrowValueMode::None,
                    (Opcode::Ushr, OperandSize::Size64) => NarrowValueMode::ZeroExtend64,
                    (Opcode::Ushr, OperandSize::Size32) => NarrowValueMode::ZeroExtend32,
                    (Opcode::Sshr, OperandSize::Size64) => NarrowValueMode::SignExtend64,
                    (Opcode::Sshr, OperandSize::Size32) => NarrowValueMode::SignExtend32,
                    _ => unreachable!(),
                };
                let rn = put_input_in_reg(ctx, inputs[0], narrow_mode);
                let rm = lower_shift_amt(ctx, inputs[1], ty, out_regs.regs()[0]);
                let alu_op = match op {
                    Opcode::Ishl => choose_32_64(ty, ALUOp::Lsl32, ALUOp::Lsl64),
                    Opcode::Ushr => choose_32_64(ty, ALUOp::Lsr32, ALUOp::Lsr64),
                    Opcode::Sshr => choose_32_64(ty, ALUOp::Asr32, ALUOp::Asr64),
                    _ => unreachable!(),
                };
                ctx.emit(alu_inst_immshift(alu_op, rd, rn, rm));
            } else {
                let rd = out_regs.only_reg().unwrap();
                let rn = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None);
                let size = VectorSize::from_ty(ty);
                let (alu_op, is_right_shift) = match op {
                    Opcode::Ishl => (VecALUOp::Sshl, false),
                    Opcode::Ushr => (VecALUOp::Ushl, true),
                    Opcode::Sshr => (VecALUOp::Sshl, true),
                    _ => unreachable!(),
                };

                let rm = if is_right_shift {
                    // Right shifts are implemented with a negative left shift.
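                    // (ushl/sshl shift left for positive per-lane counts and right for negative ones.)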
                    let tmp = ctx.alloc_tmp(I32).only_reg().unwrap();
                    let rm = put_input_in_reg(ctx, inputs[1], NarrowValueMode::None);
                    let rn = zero_reg();
                    ctx.emit(Inst::AluRRR {
                        alu_op: ALUOp::Sub32,
                        rd: tmp,
                        rn,
                        rm,
                    });
                    tmp.to_reg()
                } else {
                    put_input_in_reg(ctx, inputs[1], NarrowValueMode::None)
                };

                ctx.emit(Inst::VecDup { rd, rn: rm, size });

                ctx.emit(Inst::VecRRR {
                    alu_op,
                    rd,
                    rn,
                    rm: rd.to_reg(),
                    size,
                });
            }
        }

        Opcode::Rotr | Opcode::Rotl => {
            // aarch64 doesn't have a left-rotate instruction, but a left rotation of K places
            // is effectively a right rotation of N - K places, where N is the integer's bit
            // size. We implement left rotations with this trick.
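            // For example, a 32-bit rotl by 3 is the same as a rotr by 29.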
            //
            // For a 32-bit or 64-bit rotate-right, we can use the ROR instruction directly.
            //
            // For a < 32-bit rotate-right, we synthesize this as:
            //
            // rotr rd, rn, rm
            //
            // =>
            //
            // zero-extend rn, <32-or-64>
            // and tmp_masked_rm, rm, <bitwidth - 1>
            // sub tmp1, tmp_masked_rm, <bitwidth>
            // sub tmp1, zero, tmp1 ; neg
            // lsr tmp2, rn, tmp_masked_rm
            // lsl rd, rn, tmp1
            // orr rd, rd, tmp2
            //
            // For a constant amount, we can instead do:
            //
            // zero-extend rn, <32-or-64>
            // lsr tmp2, rn, #<shiftimm>
            // lsl rd, rn, <bitwidth - shiftimm>
            // orr rd, rd, tmp2

            let is_rotl = op == Opcode::Rotl;

            let ty = ty.unwrap();
            let ty_bits_size = ty_bits(ty) as u8;

            // TODO: We can do much better codegen if we have a constant amt
            if ty == I128 {
                let dst = get_output_reg(ctx, outputs[0]);
                let src = put_input_in_regs(ctx, inputs[0]);
                let amt_src = put_input_in_regs(ctx, inputs[1]).regs()[0];

                let tmp = ctx.alloc_tmp(I128);
                let inv_amt = ctx.alloc_tmp(I64).only_reg().unwrap();

                lower_constant_u64(ctx, inv_amt, 128);
                ctx.emit(Inst::AluRRR {
                    alu_op: ALUOp::Sub64,
                    rd: inv_amt,
                    rn: inv_amt.to_reg(),
                    rm: amt_src,
                });

                if is_rotl {
                    // rotl
                    // (shl.i128 tmp, amt)
                    // (ushr.i128 dst, 128-amt)

                    emit_shl_i128(ctx, src, tmp, amt_src);
                    emit_shr_i128(
                        ctx,
                        src,
                        dst,
                        inv_amt.to_reg(),
                        /* is_signed = */ false,
                    );
                } else {
                    // rotr
                    // (ushr.i128 tmp, amt)
                    // (shl.i128 dst, 128-amt)

                    emit_shr_i128(ctx, src, tmp, amt_src, /* is_signed = */ false);
                    emit_shl_i128(ctx, src, dst, inv_amt.to_reg());
                }

                ctx.emit(Inst::AluRRR {
                    alu_op: ALUOp::Orr64,
                    rd: dst.regs()[0],
                    rn: dst.regs()[0].to_reg(),
                    rm: tmp.regs()[0].to_reg(),
                });
                ctx.emit(Inst::AluRRR {
                    alu_op: ALUOp::Orr64,
                    rd: dst.regs()[1],
                    rn: dst.regs()[1].to_reg(),
                    rm: tmp.regs()[1].to_reg(),
                });

                return Ok(());
            }

            let rd = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
            let rn = put_input_in_reg(
                ctx,
                inputs[0],
                if ty_bits_size <= 32 {
                    NarrowValueMode::ZeroExtend32
                } else {
                    NarrowValueMode::ZeroExtend64
                },
            );
            let rm = put_input_in_reg_immshift(ctx, inputs[1], ty_bits(ty));

            if ty_bits_size == 32 || ty_bits_size == 64 {
                let alu_op = choose_32_64(ty, ALUOp::RotR32, ALUOp::RotR64);
                match rm {
                    ResultRegImmShift::ImmShift(mut immshift) => {
                        if is_rotl {
                            immshift.imm = ty_bits_size.wrapping_sub(immshift.value());
                        }
                        immshift.imm &= ty_bits_size - 1;
                        ctx.emit(Inst::AluRRImmShift {
                            alu_op,
                            rd,
                            rn,
                            immshift,
                        });
                    }

                    ResultRegImmShift::Reg(rm) => {
                        let rm = if is_rotl {
                            // Really ty_bits_size - rn, but the upper bits of the result are
                            // ignored (because of the implicit masking done by the instruction),
                            // so this is equivalent to negating the input.
                            let alu_op = choose_32_64(ty, ALUOp::Sub32, ALUOp::Sub64);
                            let tmp = ctx.alloc_tmp(ty).only_reg().unwrap();
                            ctx.emit(Inst::AluRRR {
                                alu_op,
                                rd: tmp,
                                rn: zero_reg(),
                                rm,
                            });
                            tmp.to_reg()
                        } else {
                            rm
                        };
                        ctx.emit(Inst::AluRRR { alu_op, rd, rn, rm });
                    }
                }
            } else {
                debug_assert!(ty_bits_size < 32);

                match rm {
                    ResultRegImmShift::Reg(reg) => {
                        let reg = if is_rotl {
                            // Really ty_bits_size - rn, but the upper bits of the result are
                            // ignored (because of the implicit masking done by the instruction),
                            // so this is equivalent to negating the input.
                            let tmp = ctx.alloc_tmp(I32).only_reg().unwrap();
                            ctx.emit(Inst::AluRRR {
                                alu_op: ALUOp::Sub32,
                                rd: tmp,
                                rn: zero_reg(),
                                rm: reg,
                            });
                            tmp.to_reg()
                        } else {
                            reg
                        };

                        // Explicitly mask the rotation count.
                        let tmp_masked_rm = ctx.alloc_tmp(I32).only_reg().unwrap();
                        ctx.emit(Inst::AluRRImmLogic {
                            alu_op: ALUOp::And32,
                            rd: tmp_masked_rm,
                            rn: reg,
                            imml: ImmLogic::maybe_from_u64((ty_bits_size - 1) as u64, I32).unwrap(),
                        });
                        let tmp_masked_rm = tmp_masked_rm.to_reg();

                        let tmp1 = ctx.alloc_tmp(I32).only_reg().unwrap();
                        let tmp2 = ctx.alloc_tmp(I32).only_reg().unwrap();
                        ctx.emit(Inst::AluRRImm12 {
                            alu_op: ALUOp::Sub32,
                            rd: tmp1,
                            rn: tmp_masked_rm,
                            imm12: Imm12::maybe_from_u64(ty_bits_size as u64).unwrap(),
                        });
                        ctx.emit(Inst::AluRRR {
                            alu_op: ALUOp::Sub32,
                            rd: tmp1,
                            rn: zero_reg(),
                            rm: tmp1.to_reg(),
                        });
                        ctx.emit(Inst::AluRRR {
                            alu_op: ALUOp::Lsr32,
                            rd: tmp2,
                            rn,
                            rm: tmp_masked_rm,
                        });
                        ctx.emit(Inst::AluRRR {
                            alu_op: ALUOp::Lsl32,
                            rd,
                            rn,
                            rm: tmp1.to_reg(),
                        });
                        ctx.emit(Inst::AluRRR {
                            alu_op: ALUOp::Orr32,
                            rd,
                            rn: rd.to_reg(),
                            rm: tmp2.to_reg(),
                        });
                    }

                    ResultRegImmShift::ImmShift(mut immshift) => {
                        if is_rotl {
                            immshift.imm = ty_bits_size.wrapping_sub(immshift.value());
                        }
                        immshift.imm &= ty_bits_size - 1;

                        let tmp1 = ctx.alloc_tmp(I32).only_reg().unwrap();
                        ctx.emit(Inst::AluRRImmShift {
                            alu_op: ALUOp::Lsr32,
                            rd: tmp1,
                            rn,
                            immshift: immshift.clone(),
                        });

                        let amount = immshift.value() & (ty_bits_size - 1);
                        let opp_shift =
                            ImmShift::maybe_from_u64(ty_bits_size as u64 - amount as u64).unwrap();
                        ctx.emit(Inst::AluRRImmShift {
                            alu_op: ALUOp::Lsl32,
                            rd,
                            rn,
                            immshift: opp_shift,
                        });

                        ctx.emit(Inst::AluRRR {
                            alu_op: ALUOp::Orr32,
                            rd,
                            rn: rd.to_reg(),
                            rm: tmp1.to_reg(),
                        });
                    }
                }
            }
        }

        Opcode::Bitrev | Opcode::Clz | Opcode::Cls | Opcode::Ctz => {
            let ty = ty.unwrap();
            let op_ty = match ty {
                I8 | I16 | I32 => I32,
                I64 | I128 => I64,
                _ => panic!("Unsupported type for Bitrev/Clz/Cls"),
            };
            let bitop = match op {
                Opcode::Clz | Opcode::Cls | Opcode::Bitrev => BitOp::from((op, op_ty)),
                Opcode::Ctz => BitOp::from((Opcode::Bitrev, op_ty)),
                _ => unreachable!(),
            };

            if ty == I128 {
                let out_regs = get_output_reg(ctx, outputs[0]);
                let in_regs = put_input_in_regs(ctx, inputs[0]);

                let in_lo = in_regs.regs()[0];
                let in_hi = in_regs.regs()[1];
                let out_lo = out_regs.regs()[0];
                let out_hi = out_regs.regs()[1];

                if op == Opcode::Bitrev || op == Opcode::Ctz {
                    ctx.emit(Inst::BitRR {
                        rd: out_hi,
                        rn: in_lo,
                        op: bitop,
                    });
                    ctx.emit(Inst::BitRR {
                        rd: out_lo,
                        rn: in_hi,
                        op: bitop,
                    });
                }

                if op == Opcode::Ctz {
                    // We have reduced the problem to a clz by reversing the inputs previously.
                    emit_clz_i128(ctx, out_regs.map(|r| r.to_reg()), out_regs);
                } else if op == Opcode::Clz {
                    emit_clz_i128(ctx, in_regs, out_regs);
                } else if op == Opcode::Cls {
                    // cls out_hi, in_hi
                    // cls out_lo, in_lo
                    // eon sign_eq, in_hi, in_lo
                    // lsr sign_eq, sign_eq, #63
                    // madd out_lo, out_lo, sign_eq, sign_eq
                    // cmp out_hi, #63
                    // csel out_lo, out_lo, xzr, eq
                    // add out_lo, out_lo, out_hi
                    // mov out_hi, 0
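                    // (sign_eq is 1 when in_hi and in_lo share a sign bit, else 0; the madd computes (out_lo + 1) * sign_eq.)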

                    let sign_eq = ctx.alloc_tmp(I64).only_reg().unwrap();
                    let xzr = writable_zero_reg();

                    ctx.emit(Inst::BitRR {
                        rd: out_lo,
                        rn: in_lo,
                        op: bitop,
                    });
                    ctx.emit(Inst::BitRR {
                        rd: out_hi,
                        rn: in_hi,
                        op: bitop,
                    });
                    ctx.emit(Inst::AluRRR {
                        alu_op: ALUOp::EorNot64,
                        rd: sign_eq,
                        rn: in_hi,
                        rm: in_lo,
                    });
                    ctx.emit(Inst::AluRRImmShift {
                        alu_op: ALUOp::Lsr64,
                        rd: sign_eq,
                        rn: sign_eq.to_reg(),
                        immshift: ImmShift::maybe_from_u64(63).unwrap(),
                    });
                    ctx.emit(Inst::AluRRRR {
                        alu_op: ALUOp3::MAdd64,
                        rd: out_lo,
                        rn: out_lo.to_reg(),
                        rm: sign_eq.to_reg(),
                        ra: sign_eq.to_reg(),
                    });
                    ctx.emit(Inst::AluRRImm12 {
                        alu_op: ALUOp::SubS64,
                        rd: xzr,
                        rn: out_hi.to_reg(),
                        imm12: Imm12::maybe_from_u64(63).unwrap(),
                    });
                    ctx.emit(Inst::CSel {
                        cond: Cond::Eq,
                        rd: out_lo,
                        rn: out_lo.to_reg(),
                        rm: xzr.to_reg(),
                    });
                    ctx.emit(Inst::AluRRR {
                        alu_op: ALUOp::Add64,
                        rd: out_lo,
                        rn: out_lo.to_reg(),
                        rm: out_hi.to_reg(),
                    });
                    lower_constant_u64(ctx, out_hi, 0);
                }
            } else {
                let rd = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
                let needs_zext = match op {
                    Opcode::Bitrev | Opcode::Ctz => false,
                    Opcode::Clz | Opcode::Cls => true,
                    _ => unreachable!(),
                };
                let narrow_mode = if needs_zext && ty_bits(ty) == 64 {
                    NarrowValueMode::ZeroExtend64
                } else if needs_zext {
                    NarrowValueMode::ZeroExtend32
                } else {
                    NarrowValueMode::None
                };
                let rn = put_input_in_reg(ctx, inputs[0], narrow_mode);

                ctx.emit(Inst::BitRR { rd, rn, op: bitop });

                // Both bitrev and ctz use a bit-reverse (rbit) instruction; ctz to reduce the problem
                // to a clz, and bitrev as the main operation.
                if op == Opcode::Bitrev || op == Opcode::Ctz {
                    // Reversing an n-bit value (n < 32) with a 32-bit bitrev instruction will place
                    // the reversed result in the highest n bits, so we need to shift them down into
                    // place.
                    let right_shift = match ty {
                        I8 => Some(24),
                        I16 => Some(16),
                        I32 => None,
                        I64 => None,
                        _ => panic!("Unsupported type for Bitrev"),
                    };
                    if let Some(s) = right_shift {
                        ctx.emit(Inst::AluRRImmShift {
                            alu_op: ALUOp::Lsr32,
                            rd,
                            rn: rd.to_reg(),
                            immshift: ImmShift::maybe_from_u64(s).unwrap(),
                        });
                    }
                }

                if op == Opcode::Ctz {
                    ctx.emit(Inst::BitRR {
                        op: BitOp::from((Opcode::Clz, op_ty)),
                        rd,
                        rn: rd.to_reg(),
                    });
                }
            }
        }

        Opcode::Popcnt => {
            let ty = ty.unwrap();

            if ty.is_vector() {
                let lane_type = ty.lane_type();
                let rd = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
                let rn = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None);

                if lane_type != I8 {
                    return Err(CodegenError::Unsupported(format!(
                        "Unsupported SIMD vector lane type: {:?}",
                        lane_type
                    )));
                }

                ctx.emit(Inst::VecMisc {
                    op: VecMisc2::Cnt,
                    rd,
                    rn,
                    size: VectorSize::from_ty(ty),
                });
            } else {
                let out_regs = get_output_reg(ctx, outputs[0]);
                let in_regs = put_input_in_regs(ctx, inputs[0]);
                let size = if ty == I128 {
                    ScalarSize::Size64
                } else {
                    ScalarSize::from_operand_size(OperandSize::from_ty(ty))
                };

                let vec_size = if ty == I128 {
                    VectorSize::Size8x16
                } else {
                    VectorSize::Size8x8
                };

                let tmp = ctx.alloc_tmp(I8X16).only_reg().unwrap();

                // fmov tmp, in_lo
                // if ty == i128:
                //     mov tmp.d[1], in_hi
                //
                // cnt tmp.16b, tmp.16b / cnt tmp.8b, tmp.8b
                // addv tmp, tmp.16b / addv tmp, tmp.8b / addp tmp.8b, tmp.8b, tmp.8b / (no instruction for 8-bit inputs)
                //
                // umov out_lo, tmp.b[0]
                // if ty == i128:
                //     mov out_hi, 0
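                // (cnt produces per-byte bit counts; the addv/addp step then sums them into a single lane.)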

                ctx.emit(Inst::MovToFpu {
                    rd: tmp,
                    rn: in_regs.regs()[0],
                    size,
                });

                if ty == I128 {
                    ctx.emit(Inst::MovToVec {
                        rd: tmp,
                        rn: in_regs.regs()[1],
                        idx: 1,
                        size: VectorSize::Size64x2,
                    });
                }

                ctx.emit(Inst::VecMisc {
                    op: VecMisc2::Cnt,
                    rd: tmp,
                    rn: tmp.to_reg(),
                    size: vec_size,
                });

                match ScalarSize::from_ty(ty) {
                    ScalarSize::Size8 => {}
                    ScalarSize::Size16 => {
                        // ADDP is usually cheaper than ADDV.
                        ctx.emit(Inst::VecRRR {
                            alu_op: VecALUOp::Addp,
                            rd: tmp,
                            rn: tmp.to_reg(),
                            rm: tmp.to_reg(),
                            size: VectorSize::Size8x8,
                        });
                    }
                    ScalarSize::Size32 | ScalarSize::Size64 | ScalarSize::Size128 => {
                        ctx.emit(Inst::VecLanes {
                            op: VecLanesOp::Addv,
                            rd: tmp,
                            rn: tmp.to_reg(),
                            size: vec_size,
                        });
                    }
                }

                ctx.emit(Inst::MovFromVec {
                    rd: out_regs.regs()[0],
                    rn: tmp.to_reg(),
                    idx: 0,
                    size: VectorSize::Size8x16,
                });

                if ty == I128 {
                    lower_constant_u64(ctx, out_regs.regs()[1], 0);
                }
            }
        }

        Opcode::Load
        | Opcode::Uload8
        | Opcode::Sload8
        | Opcode::Uload16
        | Opcode::Sload16
        | Opcode::Uload32
        | Opcode::Sload32
        | Opcode::LoadComplex
        | Opcode::Uload8Complex
        | Opcode::Sload8Complex
        | Opcode::Uload16Complex
        | Opcode::Sload16Complex
        | Opcode::Uload32Complex
        | Opcode::Sload32Complex
        | Opcode::Sload8x8
        | Opcode::Uload8x8
        | Opcode::Sload16x4
        | Opcode::Uload16x4
        | Opcode::Sload32x2
        | Opcode::Uload32x2
        | Opcode::Uload8x8Complex
        | Opcode::Sload8x8Complex
        | Opcode::Uload16x4Complex
        | Opcode::Sload16x4Complex
        | Opcode::Uload32x2Complex
        | Opcode::Sload32x2Complex => {
            let sign_extend = match op {
                Opcode::Sload8
                | Opcode::Sload8Complex
                | Opcode::Sload16
                | Opcode::Sload16Complex
                | Opcode::Sload32
                | Opcode::Sload32Complex => true,
                _ => false,
            };
            let flags = ctx
                .memflags(insn)
                .expect("Load instruction should have memflags");

            let out_ty = ctx.output_ty(insn, 0);
            if out_ty == I128 {
                let off = ctx.data(insn).load_store_offset().unwrap();
                let mem = lower_pair_address(ctx, &inputs[..], off);
                let dst = get_output_reg(ctx, outputs[0]);
                ctx.emit(Inst::LoadP64 {
                    rt: dst.regs()[0],
                    rt2: dst.regs()[1],
                    mem,
                    flags,
                });
            } else {
                lower_load(
                    ctx,
                    insn,
                    &inputs[..],
                    outputs[0],
                    |ctx, dst, elem_ty, mem| {
                        let rd = dst.only_reg().unwrap();
                        let is_float = ty_has_float_or_vec_representation(elem_ty);
                        ctx.emit(match (ty_bits(elem_ty), sign_extend, is_float) {
                            (1, _, _) => Inst::ULoad8 { rd, mem, flags },
                            (8, false, _) => Inst::ULoad8 { rd, mem, flags },
                            (8, true, _) => Inst::SLoad8 { rd, mem, flags },
                            (16, false, _) => Inst::ULoad16 { rd, mem, flags },
                            (16, true, _) => Inst::SLoad16 { rd, mem, flags },
                            (32, false, false) => Inst::ULoad32 { rd, mem, flags },
                            (32, true, false) => Inst::SLoad32 { rd, mem, flags },
                            (32, _, true) => Inst::FpuLoad32 { rd, mem, flags },
                            (64, _, false) => Inst::ULoad64 { rd, mem, flags },
                            // Note that we treat some of the vector loads as scalar floating-point loads,
                            // which is correct in a little endian environment.
                            (64, _, true) => Inst::FpuLoad64 { rd, mem, flags },
                            (128, _, true) => Inst::FpuLoad128 { rd, mem, flags },
                            _ => panic!("Unsupported size in load"),
                        });

                        let vec_extend = match op {
                            Opcode::Sload8x8 => Some(VecExtendOp::Sxtl8),
                            Opcode::Sload8x8Complex => Some(VecExtendOp::Sxtl8),
                            Opcode::Uload8x8 => Some(VecExtendOp::Uxtl8),
                            Opcode::Uload8x8Complex => Some(VecExtendOp::Uxtl8),
                            Opcode::Sload16x4 => Some(VecExtendOp::Sxtl16),
                            Opcode::Sload16x4Complex => Some(VecExtendOp::Sxtl16),
                            Opcode::Uload16x4 => Some(VecExtendOp::Uxtl16),
                            Opcode::Uload16x4Complex => Some(VecExtendOp::Uxtl16),
                            Opcode::Sload32x2 => Some(VecExtendOp::Sxtl32),
                            Opcode::Sload32x2Complex => Some(VecExtendOp::Sxtl32),
                            Opcode::Uload32x2 => Some(VecExtendOp::Uxtl32),
                            Opcode::Uload32x2Complex => Some(VecExtendOp::Uxtl32),
                            _ => None,
                        };

                        if let Some(t) = vec_extend {
                            let rd = dst.only_reg().unwrap();
                            ctx.emit(Inst::VecExtend {
                                t,
                                rd,
                                rn: rd.to_reg(),
                                high_half: false,
                            });
                        }
                    },
                );
            }
        }

        Opcode::Store
        | Opcode::Istore8
        | Opcode::Istore16
        | Opcode::Istore32
        | Opcode::StoreComplex
        | Opcode::Istore8Complex
        | Opcode::Istore16Complex
        | Opcode::Istore32Complex => {
            let off = ctx.data(insn).load_store_offset().unwrap();
            let elem_ty = match op {
                Opcode::Istore8 | Opcode::Istore8Complex => I8,
                Opcode::Istore16 | Opcode::Istore16Complex => I16,
                Opcode::Istore32 | Opcode::Istore32Complex => I32,
                Opcode::Store | Opcode::StoreComplex => ctx.input_ty(insn, 0),
                _ => unreachable!(),
            };
            let is_float = ty_has_float_or_vec_representation(elem_ty);
            let flags = ctx
                .memflags(insn)
                .expect("Store instruction should have memflags");

            let dst = put_input_in_regs(ctx, inputs[0]);

            if elem_ty == I128 {
                let mem = lower_pair_address(ctx, &inputs[1..], off);
                ctx.emit(Inst::StoreP64 {
                    rt: dst.regs()[0],
                    rt2: dst.regs()[1],
                    mem,
                    flags,
                });
            } else {
                let rd = dst.only_reg().unwrap();
                let mem = lower_address(ctx, elem_ty, &inputs[1..], off);
                ctx.emit(match (ty_bits(elem_ty), is_float) {
                    (1, _) | (8, _) => Inst::Store8 { rd, mem, flags },
                    (16, _) => Inst::Store16 { rd, mem, flags },
                    (32, false) => Inst::Store32 { rd, mem, flags },
                    (32, true) => Inst::FpuStore32 { rd, mem, flags },
                    (64, false) => Inst::Store64 { rd, mem, flags },
                    (64, true) => Inst::FpuStore64 { rd, mem, flags },
                    (128, _) => Inst::FpuStore128 { rd, mem, flags },
                    _ => panic!("Unsupported size in store"),
                });
            }
        }

        Opcode::StackAddr => {
            let (stack_slot, offset) = match *ctx.data(insn) {
                InstructionData::StackLoad {
                    opcode: Opcode::StackAddr,
                    stack_slot,
                    offset,
                } => (stack_slot, offset),
                _ => unreachable!(),
            };
            let rd = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
            let offset: i32 = offset.into();
            let inst = ctx
                .abi()
                .stackslot_addr(stack_slot, u32::try_from(offset).unwrap(), rd);
            ctx.emit(inst);
        }

        Opcode::AtomicRmw => {
            let r_dst = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
            let mut r_addr = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None);
            let mut r_arg2 = put_input_in_reg(ctx, inputs[1], NarrowValueMode::None);
            let ty_access = ty.unwrap();
            assert!(is_valid_atomic_transaction_ty(ty_access));
            // Make sure that both args are in virtual regs, since in effect
            // we have to do a parallel copy to get them safely to the AtomicRMW input
            // regs, and that's not guaranteed safe if either is in a real reg.
            r_addr = ctx.ensure_in_vreg(r_addr, I64);
            r_arg2 = ctx.ensure_in_vreg(r_arg2, I64);
            // Move the args to the preordained AtomicRMW input regs
            ctx.emit(Inst::gen_move(Writable::from_reg(xreg(25)), r_addr, I64));
            ctx.emit(Inst::gen_move(Writable::from_reg(xreg(26)), r_arg2, I64));
            // Now the AtomicRMW insn itself
            let op = inst_common::AtomicRmwOp::from(ctx.data(insn).atomic_rmw_op().unwrap());
            ctx.emit(Inst::AtomicRMW { ty: ty_access, op });
            // And finally, copy the preordained AtomicRMW output reg to its destination.
            ctx.emit(Inst::gen_move(r_dst, xreg(27), I64));
            // Also, x24 and x28 are trashed. `fn aarch64_get_regs` must mention that.
        }

        Opcode::AtomicCas => {
            let r_dst = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
            let mut r_addr = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None);
            let mut r_expected = put_input_in_reg(ctx, inputs[1], NarrowValueMode::None);
            let mut r_replacement = put_input_in_reg(ctx, inputs[2], NarrowValueMode::None);
            let ty_access = ty.unwrap();
            assert!(is_valid_atomic_transaction_ty(ty_access));

            if isa_flags.use_lse() {
                ctx.emit(Inst::gen_move(r_dst, r_expected, ty_access));
                ctx.emit(Inst::AtomicCAS {
                    rs: r_dst,
                    rt: r_replacement,
                    rn: r_addr,
                    ty: ty_access,
                });
            } else {
                // This is very similar to, but not identical to, the AtomicRmw case. Note
                // that the AtomicCASLoop sequence does its own masking, so we don't need to worry
                // about zero-extending narrow (I8/I16/I32) values here.
                // Make sure that all three args are in virtual regs. See corresponding comment
                // for `Opcode::AtomicRmw` above.
                r_addr = ctx.ensure_in_vreg(r_addr, I64);
                r_expected = ctx.ensure_in_vreg(r_expected, I64);
                r_replacement = ctx.ensure_in_vreg(r_replacement, I64);
                // Move the args to the preordained AtomicCASLoop input regs
                ctx.emit(Inst::gen_move(Writable::from_reg(xreg(25)), r_addr, I64));
                ctx.emit(Inst::gen_move(
                    Writable::from_reg(xreg(26)),
                    r_expected,
                    I64,
                ));
                ctx.emit(Inst::gen_move(
                    Writable::from_reg(xreg(28)),
                    r_replacement,
                    I64,
                ));
                // Now the AtomicCASLoop itself, implemented in the normal way, with an LL-SC loop
                ctx.emit(Inst::AtomicCASLoop { ty: ty_access });
                // And finally, copy the preordained AtomicCASLoop output reg to its destination.
                ctx.emit(Inst::gen_move(r_dst, xreg(27), I64));
                // Also, x24 and x28 are trashed. `fn aarch64_get_regs` must mention that.
            }
        }

        Opcode::AtomicLoad => {
            let rt = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
            emit_atomic_load(ctx, rt, insn);
        }

        Opcode::AtomicStore => {
            let rt = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None);
            let rn = put_input_in_reg(ctx, inputs[1], NarrowValueMode::None);
            let access_ty = ctx.input_ty(insn, 0);
            assert!(is_valid_atomic_transaction_ty(access_ty));
            ctx.emit(Inst::StoreRelease { access_ty, rt, rn });
        }

        Opcode::Fence => {
            ctx.emit(Inst::Fence {});
        }

        Opcode::StackLoad | Opcode::StackStore => {
            panic!("Direct stack memory access not supported; should not be used by Wasm");
        }

        Opcode::HeapAddr => {
            panic!("heap_addr should have been removed by legalization!");
        }

        Opcode::TableAddr => {
            panic!("table_addr should have been removed by legalization!");
        }

        Opcode::Nop => {
            // Nothing.
        }

        Opcode::Select => {
            let flag_input = inputs[0];
            let cond = if let Some(icmp_insn) =
                maybe_input_insn_via_conv(ctx, flag_input, Opcode::Icmp, Opcode::Bint)
            {
                let condcode = ctx.data(icmp_insn).cond_code().unwrap();
                lower_icmp(ctx, icmp_insn, condcode, IcmpOutput::CondCode)?.unwrap_cond()
            } else if let Some(fcmp_insn) =
                maybe_input_insn_via_conv(ctx, flag_input, Opcode::Fcmp, Opcode::Bint)
            {
                let condcode = ctx.data(fcmp_insn).fp_cond_code().unwrap();
                let cond = lower_fp_condcode(condcode);
                lower_fcmp_or_ffcmp_to_flags(ctx, fcmp_insn);
                cond
            } else {
                let (cmp_op, narrow_mode) = if ty_bits(ctx.input_ty(insn, 0)) > 32 {
                    (ALUOp::SubS64, NarrowValueMode::ZeroExtend64)
                } else {
                    (ALUOp::SubS32, NarrowValueMode::ZeroExtend32)
                };

                let rcond = put_input_in_reg(ctx, inputs[0], narrow_mode);
                // cmp rcond, #0
                ctx.emit(Inst::AluRRR {
                    alu_op: cmp_op,
                    rd: writable_zero_reg(),
                    rn: rcond,
                    rm: zero_reg(),
                });
                Cond::Ne
            };

            // csel.cond rd, rn, rm
            let ty = ctx.output_ty(insn, 0);
            let bits = ty_bits(ty);
            let is_float = ty_has_float_or_vec_representation(ty);

            let dst = get_output_reg(ctx, outputs[0]);
            let lhs = put_input_in_regs(ctx, inputs[1]);
            let rhs = put_input_in_regs(ctx, inputs[2]);

            let rd = dst.regs()[0];
            let rn = lhs.regs()[0];
            let rm = rhs.regs()[0];

            match (is_float, bits) {
                (true, 32) => ctx.emit(Inst::FpuCSel32 { cond, rd, rn, rm }),
                (true, 64) => ctx.emit(Inst::FpuCSel64 { cond, rd, rn, rm }),
                (true, 128) => ctx.emit(Inst::VecCSel { cond, rd, rn, rm }),
                (false, 128) => {
                    ctx.emit(Inst::CSel {
                        cond,
                        rd: dst.regs()[0],
                        rn: lhs.regs()[0],
                        rm: rhs.regs()[0],
                    });
                    ctx.emit(Inst::CSel {
                        cond,
                        rd: dst.regs()[1],
                        rn: lhs.regs()[1],
                        rm: rhs.regs()[1],
                    });
                }
                (_, _) => ctx.emit(Inst::CSel { cond, rd, rn, rm }),
            }
        }

        Opcode::Selectif | Opcode::SelectifSpectreGuard => {
            let condcode = ctx.data(insn).cond_code().unwrap();
            // Verification ensures that the input is always a
            // single-def ifcmp.
            let ifcmp_insn = maybe_input_insn(ctx, inputs[0], Opcode::Ifcmp).unwrap();
            let cond = lower_icmp(ctx, ifcmp_insn, condcode, IcmpOutput::CondCode)?.unwrap_cond();

            // csel.COND rd, rn, rm
            let rd = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
            let rn = put_input_in_reg(ctx, inputs[1], NarrowValueMode::None);
            let rm = put_input_in_reg(ctx, inputs[2], NarrowValueMode::None);
            let ty = ctx.output_ty(insn, 0);
            let bits = ty_bits(ty);
            let is_float = ty_has_float_or_vec_representation(ty);
            if is_float && bits == 32 {
                ctx.emit(Inst::FpuCSel32 { cond, rd, rn, rm });
            } else if is_float && bits == 64 {
                ctx.emit(Inst::FpuCSel64 { cond, rd, rn, rm });
            } else {
                ctx.emit(Inst::CSel { cond, rd, rn, rm });
            }
        }

        Opcode::Bitselect | Opcode::Vselect => {
            let ty = ty.unwrap();
            if !ty.is_vector() {
                debug_assert_ne!(Opcode::Vselect, op);
                let tmp = ctx.alloc_tmp(I64).only_reg().unwrap();
                let rd = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
                let rcond = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None);
                let rn = put_input_in_reg(ctx, inputs[1], NarrowValueMode::None);
                let rm = put_input_in_reg(ctx, inputs[2], NarrowValueMode::None);
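                // Compute rd = (rn & rcond) | (rm & !rcond):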
                // AND rTmp, rn, rcond
                ctx.emit(Inst::AluRRR {
                    alu_op: ALUOp::And64,
                    rd: tmp,
                    rn,
                    rm: rcond,
                });
                // BIC rd, rm, rcond
                ctx.emit(Inst::AluRRR {
                    alu_op: ALUOp::AndNot64,
                    rd,
                    rn: rm,
                    rm: rcond,
                });
                // ORR rd, rd, rTmp
                ctx.emit(Inst::AluRRR {
                    alu_op: ALUOp::Orr64,
                    rd,
                    rn: rd.to_reg(),
                    rm: tmp.to_reg(),
                });
            } else {
                let rcond = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None);
                let rn = put_input_in_reg(ctx, inputs[1], NarrowValueMode::None);
                let rm = put_input_in_reg(ctx, inputs[2], NarrowValueMode::None);
                let rd = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
                ctx.emit(Inst::gen_move(rd, rcond, ty));

                ctx.emit(Inst::VecRRR {
                    alu_op: VecALUOp::Bsl,
                    rd,
                    rn,
                    rm,
                    size: VectorSize::from_ty(ty),
                });
            }
        }

        Opcode::Trueif => {
            let condcode = ctx.data(insn).cond_code().unwrap();
            // Verification ensures that the input is always a
            // single-def ifcmp.
            let ifcmp_insn = maybe_input_insn(ctx, inputs[0], Opcode::Ifcmp).unwrap();
            let rd = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
            lower_icmp(ctx, ifcmp_insn, condcode, IcmpOutput::Register(rd))?;
        }

        Opcode::Trueff => {
            let condcode = ctx.data(insn).fp_cond_code().unwrap();
            let cond = lower_fp_condcode(condcode);
            let ffcmp_insn = maybe_input_insn(ctx, inputs[0], Opcode::Ffcmp).unwrap();
            lower_fcmp_or_ffcmp_to_flags(ctx, ffcmp_insn);
            let rd = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
            materialize_bool_result(ctx, insn, rd, cond);
        }

        Opcode::IsNull | Opcode::IsInvalid => {
            // Null references are represented by the constant value 0; invalid references are
            // represented by the constant value -1. See `define_reftypes()` in
            // `meta/src/isa/x86/encodings.rs` to confirm.
            let rd = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
            let rn = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None);
            let ty = ctx.input_ty(insn, 0);
            let (alu_op, const_value) = match op {
                Opcode::IsNull => {
                    // cmp rn, #0
                    (choose_32_64(ty, ALUOp::SubS32, ALUOp::SubS64), 0)
                }
                Opcode::IsInvalid => {
                    // cmn rn, #1
                    (choose_32_64(ty, ALUOp::AddS32, ALUOp::AddS64), 1)
                }
                _ => unreachable!(),
            };
            let const_value = ResultRSEImm12::Imm12(Imm12::maybe_from_u64(const_value).unwrap());
            ctx.emit(alu_inst_imm12(alu_op, writable_zero_reg(), rn, const_value));
            materialize_bool_result(ctx, insn, rd, Cond::Eq);
        }

        Opcode::Copy => {
            let rd = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
            let rn = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None);
            let ty = ctx.input_ty(insn, 0);
            ctx.emit(Inst::gen_move(rd, rn, ty));
        }

        Opcode::Breduce | Opcode::Ireduce => {
            // Smaller integers/booleans are stored with high-order bits
            // undefined, so we can simply do a copy.
            let rn = put_input_in_regs(ctx, inputs[0]).regs()[0];
            let rd = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
            let ty = ctx.input_ty(insn, 0);
            ctx.emit(Inst::gen_move(rd, rn, ty));
        }

        Opcode::Bextend | Opcode::Bmask => {
            // Bextend and Bmask both simply sign-extend. This works for:
            // - Bextend, because booleans are stored as 0 / -1, so we
            //   sign-extend the -1 to a -1 in the wider width.
            // - Bmask, because the resulting integer mask value must be
            //   all-ones (-1) if the argument is true.

            let from_ty = ctx.input_ty(insn, 0);
            let to_ty = ctx.output_ty(insn, 0);
            let from_bits = ty_bits(from_ty);
            let to_bits = ty_bits(to_ty);

            assert!(
                from_bits <= 64 && to_bits <= 64,
                "Vector Bextend not supported yet"
            );
            assert!(from_bits <= to_bits);

            if from_bits == to_bits {
                // Nothing.
            } else {
                let rn = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None);
                let rd = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
                let to_bits = if to_bits == 64 {
                    64
                } else {
                    assert!(to_bits <= 32);
                    32
                };
                let from_bits = from_bits as u8;
                ctx.emit(Inst::Extend {
                    rd,
                    rn,
                    signed: true,
                    from_bits,
                    to_bits,
                });
            }
        }

        Opcode::Bint => {
            // Booleans are stored as all-zeroes (0) or all-ones (-1). We AND
            // with #1 to keep only the LSB, giving a 0 / 1-valued integer result.
            let rn = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None);
            let rd = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
            let output_bits = ty_bits(ctx.output_ty(insn, 0));

            let (imm_ty, alu_op) = if output_bits > 32 {
                (I64, ALUOp::And64)
            } else {
                (I32, ALUOp::And32)
            };
            ctx.emit(Inst::AluRRImmLogic {
                alu_op,
                rd,
                rn,
                imml: ImmLogic::maybe_from_u64(1, imm_ty).unwrap(),
            });
        }

        Opcode::Bitcast => {
            let rd = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
            let ity = ctx.input_ty(insn, 0);
            let oty = ctx.output_ty(insn, 0);
            let ity_bits = ty_bits(ity);
            let ity_vec_reg = ty_has_float_or_vec_representation(ity);
            let oty_bits = ty_bits(oty);
            let oty_vec_reg = ty_has_float_or_vec_representation(oty);

            debug_assert_eq!(ity_bits, oty_bits);

            match (ity_vec_reg, oty_vec_reg) {
                (true, true) => {
                    let narrow_mode = if ity_bits <= 32 {
                        NarrowValueMode::ZeroExtend32
                    } else {
                        NarrowValueMode::ZeroExtend64
                    };
                    let rm = put_input_in_reg(ctx, inputs[0], narrow_mode);
                    ctx.emit(Inst::gen_move(rd, rm, oty));
                }
                (false, false) => {
                    let rm = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None);
                    ctx.emit(Inst::gen_move(rd, rm, oty));
                }
                (false, true) => {
                    let rn = put_input_in_reg(ctx, inputs[0], NarrowValueMode::ZeroExtend64);
                    ctx.emit(Inst::MovToFpu {
                        rd,
                        rn,
                        size: ScalarSize::Size64,
                    });
                }
                (true, false) => {
                    let rn = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None);
                    let size = VectorSize::from_lane_size(ScalarSize::from_bits(oty_bits), true);

                    ctx.emit(Inst::MovFromVec {
                        rd,
                        rn,
                        idx: 0,
                        size,
                    });
                }
            }
        }

        Opcode::FallthroughReturn | Opcode::Return => {
            for (i, input) in inputs.iter().enumerate() {
                // N.B.: according to the AArch64 ABI, the top bits of a register
                // (above the bits for the value's type) are undefined, so we
                // need not extend the return values.
                let src_regs = put_input_in_regs(ctx, *input);
                let retval_regs = ctx.retval(i);

                assert_eq!(src_regs.len(), retval_regs.len());
                let ty = ctx.input_ty(insn, i);
                let (_, tys) = Inst::rc_for_type(ty)?;

                src_regs
                    .regs()
                    .iter()
                    .zip(retval_regs.regs().iter())
                    .zip(tys.iter())
                    .for_each(|((&src, &dst), &ty)| {
                        ctx.emit(Inst::gen_move(dst, src, ty));
                    });
            }
            // N.B.: the Ret itself is generated by the ABI.
        }

        Opcode::Ifcmp | Opcode::Ffcmp => {
            // An Ifcmp/Ffcmp must always be seen as a use of a brif/brff or trueif/trueff
            // instruction. This will always be the case as long as the IR uses an Ifcmp/Ffcmp from
            // the same block, or a dominating block. In other words, it cannot pass through a BB
            // param (phi). The flags pass of the verifier will ensure this.
            panic!("Should never reach ifcmp as isel root!");
        }

        Opcode::Icmp => {
            let condcode = ctx.data(insn).cond_code().unwrap();
            let rd = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
            lower_icmp(ctx, insn, condcode, IcmpOutput::Register(rd))?;
        }

        Opcode::Fcmp => {
            let condcode = ctx.data(insn).fp_cond_code().unwrap();
            let cond = lower_fp_condcode(condcode);
            let ty = ctx.input_ty(insn, 0);
            let rn = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None);
            let rm = put_input_in_reg(ctx, inputs[1], NarrowValueMode::None);
            let rd = get_output_reg(ctx, outputs[0]).only_reg().unwrap();

            if !ty.is_vector() {
                match ty_bits(ty) {
                    32 => {
                        ctx.emit(Inst::FpuCmp32 { rn, rm });
                    }
                    64 => {
                        ctx.emit(Inst::FpuCmp64 { rn, rm });
                    }
                    _ => panic!("Bad float size"),
                }
                materialize_bool_result(ctx, insn, rd, cond);
            } else {
                lower_vector_compare(ctx, rd, rn, rm, ty, cond)?;
            }
        }

        Opcode::JumpTableEntry | Opcode::JumpTableBase => {
            panic!("Should not appear: we handle BrTable directly");
        }

        Opcode::Debugtrap => {
            ctx.emit(Inst::Brk);
        }

        Opcode::Trap | Opcode::ResumableTrap => {
            let trap_code = ctx.data(insn).trap_code().unwrap();
            ctx.emit_safepoint(Inst::Udf { trap_code });
        }

        Opcode::Trapif | Opcode::Trapff => {
            let trap_code = ctx.data(insn).trap_code().unwrap();

            let cond = if maybe_input_insn(ctx, inputs[0], Opcode::IaddIfcout).is_some() {
                let condcode = ctx.data(insn).cond_code().unwrap();
                let cond = lower_condcode(condcode);
                // The flags must not have been clobbered by any other
                // instruction between the iadd_ifcout and this instruction, as
                // verified by the CLIF validator; so we can simply use the
                // flags here.
                cond
            } else if op == Opcode::Trapif {
                let condcode = ctx.data(insn).cond_code().unwrap();

                // Verification ensures that the input is always a single-def ifcmp.
                let ifcmp_insn = maybe_input_insn(ctx, inputs[0], Opcode::Ifcmp).unwrap();
                lower_icmp(ctx, ifcmp_insn, condcode, IcmpOutput::CondCode)?.unwrap_cond()
            } else {
                let condcode = ctx.data(insn).fp_cond_code().unwrap();
                let cond = lower_fp_condcode(condcode);

                // Verification ensures that the input is always a
                // single-def ffcmp.
                let ffcmp_insn = maybe_input_insn(ctx, inputs[0], Opcode::Ffcmp).unwrap();
                lower_fcmp_or_ffcmp_to_flags(ctx, ffcmp_insn);
                cond
            };

            ctx.emit_safepoint(Inst::TrapIf {
                trap_code,
                kind: CondBrKind::Cond(cond),
            });
        }

        Opcode::Safepoint => {
            panic!("safepoint instructions not used by new backend's safepoints!");
        }

        Opcode::Trapz | Opcode::Trapnz | Opcode::ResumableTrapnz => {
            panic!("trapz / trapnz / resumable_trapnz should have been removed by legalization!");
        }

        Opcode::FuncAddr => {
            let rd = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
            let (extname, _) = ctx.call_target(insn).unwrap();
            let extname = extname.clone();
            ctx.emit(Inst::LoadExtName {
                rd,
                name: Box::new(extname),
                offset: 0,
            });
        }

        Opcode::GlobalValue => {
            panic!("global_value should have been removed by legalization!");
        }

        Opcode::SymbolValue => {
            let rd = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
            let (extname, _, offset) = ctx.symbol_value(insn).unwrap();
            let extname = extname.clone();
            ctx.emit(Inst::LoadExtName {
                rd,
                name: Box::new(extname),
                offset,
            });
        }

        Opcode::Call | Opcode::CallIndirect => {
            let caller_conv = ctx.abi().call_conv();
            let (mut abi, inputs) = match op {
                Opcode::Call => {
                    let (extname, dist) = ctx.call_target(insn).unwrap();
2020 let extname = extname.clone();
2021 let sig = ctx.call_sig(insn).unwrap();
2022 assert!(inputs.len() == sig.params.len());
2023 assert!(outputs.len() == sig.returns.len());
2024 (
2025 AArch64ABICaller::from_func(sig, &extname, dist, caller_conv, flags)?,
2026 &inputs[..],
2027 )
2028 }
2029 Opcode::CallIndirect => {
2030 let ptr = put_input_in_reg(ctx, inputs[0], NarrowValueMode::ZeroExtend64);
2031 let sig = ctx.call_sig(insn).unwrap();
2032 assert!(inputs.len() - 1 == sig.params.len());
2033 assert!(outputs.len() == sig.returns.len());
2034 (
2035 AArch64ABICaller::from_ptr(sig, ptr, op, caller_conv, flags)?,
2036 &inputs[1..],
2037 )
2038 }
2039 _ => unreachable!(),
2040 };
2041
2042 abi.emit_stack_pre_adjust(ctx);
2043 assert!(inputs.len() == abi.num_args());
2044 for i in abi.get_copy_to_arg_order() {
2045 let input = inputs[i];
2046 let arg_regs = put_input_in_regs(ctx, input);
2047 abi.emit_copy_regs_to_arg(ctx, i, arg_regs);
2048 }
2049 abi.emit_call(ctx);
2050 for (i, output) in outputs.iter().enumerate() {
2051 let retval_regs = get_output_reg(ctx, *output);
2052 abi.emit_copy_retval_to_regs(ctx, i, retval_regs);
2053 }
2054 abi.emit_stack_post_adjust(ctx);
2055 }
2056
2057 Opcode::GetPinnedReg => {
2058 let rd = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
2059 ctx.emit(Inst::gen_move(rd, xreg(PINNED_REG), I64));
2060 }
2061
2062 Opcode::SetPinnedReg => {
2063 let rm = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None);
2064 ctx.emit(Inst::gen_move(writable_xreg(PINNED_REG), rm, I64));
2065 }
2066
2067 Opcode::Spill
2068 | Opcode::Fill
2069 | Opcode::FillNop
2070 | Opcode::Regmove
2071 | Opcode::CopySpecial
2072 | Opcode::CopyToSsa
2073 | Opcode::CopyNop
2074 | Opcode::AdjustSpDown
2075 | Opcode::AdjustSpUpImm
2076 | Opcode::AdjustSpDownImm
2077 | Opcode::IfcmpSp
2078 | Opcode::Regspill
2079 | Opcode::Regfill => {
2080 panic!("Unused opcode should not be encountered.");
2081 }
2082
2083 Opcode::Jump
2084 | Opcode::Fallthrough
2085 | Opcode::Brz
2086 | Opcode::Brnz
2087 | Opcode::BrIcmp
2088 | Opcode::Brif
2089 | Opcode::Brff
2090 | Opcode::IndirectJumpTableBr
2091 | Opcode::BrTable => {
2092 panic!("Branch opcode reached non-branch lowering logic!");
2093 }
2094
2095 Opcode::Vconst => {
2096 let value = const_param_to_u128(ctx, insn).expect("Invalid immediate bytes");
2097 let rd = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
2098 lower_constant_f128(ctx, rd, value);
2099 }
2100
2101 Opcode::RawBitcast => {
2102 let rm = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None);
2103 let rd = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
2104 let ty = ctx.input_ty(insn, 0);
2105 ctx.emit(Inst::gen_move(rd, rm, ty));
2106 }
2107
2108 Opcode::Extractlane => {
2109 if let InstructionData::BinaryImm8 { imm, .. } = ctx.data(insn) {
2110 let idx = *imm;
2111 let rd = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
2112 let rn = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None);
2113 let size = VectorSize::from_ty(ctx.input_ty(insn, 0));
2114 let ty = ty.unwrap();
2115
2116 if ty_has_int_representation(ty) {
2117 ctx.emit(Inst::MovFromVec { rd, rn, idx, size });
2118 // Plain moves are faster on some processors.
2119 } else if idx == 0 {
2120 ctx.emit(Inst::gen_move(rd, rn, ty));
2121 } else {
2122 ctx.emit(Inst::FpuMoveFromVec { rd, rn, idx, size });
2123 }
2124 } else {
2125 unreachable!();
2126 }
2127 }
2128
2129 Opcode::Insertlane => {
2130 let idx = if let InstructionData::TernaryImm8 { imm, .. } = ctx.data(insn) {
2131 *imm
2132 } else {
2133 unreachable!();
2134 };
2135 let input_ty = ctx.input_ty(insn, 1);
2136 let rd = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
2137 let rm = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None);
2138 let rn = put_input_in_reg(ctx, inputs[1], NarrowValueMode::None);
2139 let ty = ty.unwrap();
2140 let size = VectorSize::from_ty(ty);
2141
2142 ctx.emit(Inst::gen_move(rd, rm, ty));
2143
2144 if ty_has_int_representation(input_ty) {
2145 ctx.emit(Inst::MovToVec { rd, rn, idx, size });
2146 } else {
2147 ctx.emit(Inst::VecMovElement {
2148 rd,
2149 rn,
2150 dest_idx: idx,
2151 src_idx: 0,
2152 size,
2153 });
2154 }
2155 }
2156
2157 Opcode::Splat => {
2158 let rd = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
2159 let size = VectorSize::from_ty(ty.unwrap());
2160
2161 if let Some((_, insn)) = maybe_input_insn_multi(
2162 ctx,
2163 inputs[0],
2164 &[
2165 Opcode::Bconst,
2166 Opcode::F32const,
2167 Opcode::F64const,
2168 Opcode::Iconst,
2169 ],
2170 ) {
2171 lower_splat_const(ctx, rd, ctx.get_constant(insn).unwrap(), size);
2172 } else if let Some(insn) =
2173 maybe_input_insn_via_conv(ctx, inputs[0], Opcode::Iconst, Opcode::Ireduce)
2174 {
2175 lower_splat_const(ctx, rd, ctx.get_constant(insn).unwrap(), size);
2176 } else if let Some(insn) =
2177 maybe_input_insn_via_conv(ctx, inputs[0], Opcode::Bconst, Opcode::Breduce)
2178 {
2179 lower_splat_const(ctx, rd, ctx.get_constant(insn).unwrap(), size);
2180 } else if let Some((_, insn)) = maybe_input_insn_multi(
2181 ctx,
2182 inputs[0],
2183 &[
2184 Opcode::Uload8,
2185 Opcode::Sload8,
2186 Opcode::Uload16,
2187 Opcode::Sload16,
2188 Opcode::Uload32,
2189 Opcode::Sload32,
2190 Opcode::Load,
2191 ],
2192 ) {
2193 ctx.sink_inst(insn);
2194 let load_inputs = insn_inputs(ctx, insn);
2195 let load_outputs = insn_outputs(ctx, insn);
2196 lower_load(
2197 ctx,
2198 insn,
2199 &load_inputs[..],
2200 load_outputs[0],
2201 |ctx, _rd, _elem_ty, mem| {
2202 let tmp = ctx.alloc_tmp(I64).only_reg().unwrap();
2203 let (addr, addr_inst) = Inst::gen_load_addr(tmp, mem);
2204 if let Some(addr_inst) = addr_inst {
2205 ctx.emit(addr_inst);
2206 }
2207 ctx.emit(Inst::VecLoadReplicate { rd, rn: addr, size });
2208 },
2209 );
2210 } else {
2211 let input_ty = ctx.input_ty(insn, 0);
2212 let rn = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None);
2213 let inst = if ty_has_int_representation(input_ty) {
2214 Inst::VecDup { rd, rn, size }
2215 } else {
2216 Inst::VecDupFromFpu { rd, rn, size }
2217 };
2218
2219 ctx.emit(inst);
2220 }
2221 }
2222
2223 Opcode::ScalarToVector => {
2224 let rn = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None);
2225 let rd = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
2226 let input_ty = ctx.input_ty(insn, 0);
2227 if (input_ty == I32 && ty.unwrap() == I32X4)
2228 || (input_ty == I64 && ty.unwrap() == I64X2)
2229 {
2230 ctx.emit(Inst::MovToFpu {
2231 rd,
2232 rn,
2233 size: ScalarSize::from_ty(input_ty),
2234 });
2235 } else {
2236 return Err(CodegenError::Unsupported(format!(
2237 "ScalarToVector: unsupported types {:?} -> {:?}",
2238 input_ty, ty
2239 )));
2240 }
2241 }
2242
2243 Opcode::VallTrue if ctx.input_ty(insn, 0) == I64X2 => {
2244 let rd = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
2245 let rm = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None);
2246 let tmp = ctx.alloc_tmp(I64X2).only_reg().unwrap();
2247
2248 // cmeq vtmp.2d, vm.2d, #0
2249 // addp dtmp, vtmp.2d
2250 // fcmp dtmp, dtmp
2251 // cset xd, eq
2252 //
2253 // Note that after the ADDP the value of the temporary register will
2254 // be either 0 when all input elements are true, i.e. non-zero, or a
2255 // NaN otherwise (either -1 or -2 when represented as an integer);
2256 // NaNs are the only floating-point numbers that compare unequal to
2257 // themselves.
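// Worked example (illustrative): for input lanes [1, 1] (all true), cmeq
// yields [0, 0] and addp yields 0, which compares equal to itself, so cset
// writes 1. For [1, 0], cmeq yields [0, -1] and addp yields -1 =
// 0xFFFF_FFFF_FFFF_FFFF, a NaN bit pattern, so the comparison is unordered
// and cset writes 0.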
2258
2259 ctx.emit(Inst::VecMisc {
2260 op: VecMisc2::Cmeq0,
2261 rd: tmp,
2262 rn: rm,
2263 size: VectorSize::Size64x2,
2264 });
2265 ctx.emit(Inst::VecRRPair {
2266 op: VecPairOp::Addp,
2267 rd: tmp,
2268 rn: tmp.to_reg(),
2269 });
2270 ctx.emit(Inst::FpuCmp64 {
2271 rn: tmp.to_reg(),
2272 rm: tmp.to_reg(),
2273 });
2274 materialize_bool_result(ctx, insn, rd, Cond::Eq);
2275 }
2276
2277 Opcode::VanyTrue | Opcode::VallTrue => {
2278 let rd = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
2279 let rm = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None);
2280 let src_ty = ctx.input_ty(insn, 0);
2281 let tmp = ctx.alloc_tmp(src_ty).only_reg().unwrap();
2282
2283 // This operation is implemented by using umaxp or uminv to
2284 // create a scalar value, which is then compared against zero.
2285 //
2286 // umaxp vn.16b, vm.16b, vm.16b / uminv bn, vm.16b
2287 // mov xm, vn.d[0]
2288 // cmp xm, #0
2289 // cset xm, ne
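// Illustrative reading of the sequence: for vany_true, umaxp folds each
// pair of bytes into their maximum, so any nonzero byte survives into the
// low 64 bits extracted by the mov, making the final compare nonzero. For
// vall_true, uminv takes the minimum across all lanes, which is zero iff
// at least one lane is zero (i.e. false).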
2290
2291 let size = VectorSize::from_ty(ctx.input_ty(insn, 0));
2292
2293 if op == Opcode::VanyTrue {
2294 ctx.emit(Inst::VecRRR {
2295 alu_op: VecALUOp::Umaxp,
2296 rd: tmp,
2297 rn: rm,
2298 rm,
2299 size,
2300 });
2301 } else {
2302 ctx.emit(Inst::VecLanes {
2303 op: VecLanesOp::Uminv,
2304 rd: tmp,
2305 rn: rm,
2306 size,
2307 });
2308 };
2309
2310 ctx.emit(Inst::MovFromVec {
2311 rd,
2312 rn: tmp.to_reg(),
2313 idx: 0,
2314 size: VectorSize::Size64x2,
2315 });
2316
2317 ctx.emit(Inst::AluRRImm12 {
2318 alu_op: ALUOp::SubS64,
2319 rd: writable_zero_reg(),
2320 rn: rd.to_reg(),
2321 imm12: Imm12::zero(),
2322 });
2323
2324 materialize_bool_result(ctx, insn, rd, Cond::Ne);
2325 }
2326
2327 Opcode::VhighBits => {
2328 let dst_r = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
2329 let src_v = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None);
2330 let ty = ctx.input_ty(insn, 0);
2331 // All three sequences use one integer temporary and two vector temporaries. The
2332 // shift is done early so as to give the register allocator the possibility of using
2333 // the same reg for `tmp_v1` and `src_v` in the case that this is the last use of
2334 // `src_v`. See https://github.com/WebAssembly/simd/pull/201 for the background and
2335 // derivation of these sequences. Alternative sequences are discussed in
2336 // https://github.com/bytecodealliance/wasmtime/issues/2296, although they are not
2337 // used here.
2338 let tmp_r0 = ctx.alloc_tmp(I64).only_reg().unwrap();
2339 let tmp_v0 = ctx.alloc_tmp(I8X16).only_reg().unwrap();
2340 let tmp_v1 = ctx.alloc_tmp(I8X16).only_reg().unwrap();
2341 match ty {
2342 I8X16 => {
2343 // sshr tmp_v1.16b, src_v.16b, #7
2344 // mov tmp_r0, #0x0201
2345 // movk tmp_r0, #0x0804, lsl 16
2346 // movk tmp_r0, #0x2010, lsl 32
2347 // movk tmp_r0, #0x8040, lsl 48
2348 // dup tmp_v0.2d, tmp_r0
2349 // and tmp_v1.16b, tmp_v1.16b, tmp_v0.16b
2350 // ext tmp_v0.16b, tmp_v1.16b, tmp_v1.16b, #8
2351 // zip1 tmp_v0.16b, tmp_v1.16b, tmp_v0.16b
2352 // addv tmp_v0h, tmp_v0.8h
2353 // mov dst_r, tmp_v0.h[0]
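// Worked example (illustrative): if only lane 3 has its high bit set, the
// sshr makes byte 3 all-ones; the and with the constant leaves 0x08 in
// that byte, and the ext/zip1/addv reduction then yields 0x0008, i.e. bit
// 3 of the result.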
2354 ctx.emit(Inst::VecShiftImm {
2355 op: VecShiftImmOp::Sshr,
2356 rd: tmp_v1,
2357 rn: src_v,
2358 size: VectorSize::Size8x16,
2359 imm: 7,
2360 });
2361 lower_splat_const(ctx, tmp_v0, 0x8040201008040201u64, VectorSize::Size64x2);
2362 ctx.emit(Inst::VecRRR {
2363 alu_op: VecALUOp::And,
2364 rd: tmp_v1,
2365 rn: tmp_v1.to_reg(),
2366 rm: tmp_v0.to_reg(),
2367 size: VectorSize::Size8x16,
2368 });
2369 ctx.emit(Inst::VecExtract {
2370 rd: tmp_v0,
2371 rn: tmp_v1.to_reg(),
2372 rm: tmp_v1.to_reg(),
2373 imm4: 8,
2374 });
2375 ctx.emit(Inst::VecRRR {
2376 alu_op: VecALUOp::Zip1,
2377 rd: tmp_v0,
2378 rn: tmp_v1.to_reg(),
2379 rm: tmp_v0.to_reg(),
2380 size: VectorSize::Size8x16,
2381 });
2382 ctx.emit(Inst::VecLanes {
2383 op: VecLanesOp::Addv,
2384 rd: tmp_v0,
2385 rn: tmp_v0.to_reg(),
2386 size: VectorSize::Size16x8,
2387 });
2388 ctx.emit(Inst::MovFromVec {
2389 rd: dst_r,
2390 rn: tmp_v0.to_reg(),
2391 idx: 0,
2392 size: VectorSize::Size16x8,
2393 });
2394 }
2395 I16X8 => {
2396 // sshr tmp_v1.8h, src_v.8h, #15
2397 // mov tmp_r0, #0x1
2398 // movk tmp_r0, #0x2, lsl 16
2399 // movk tmp_r0, #0x4, lsl 32
2400 // movk tmp_r0, #0x8, lsl 48
2401 // dup tmp_v0.2d, tmp_r0
2402 // lsl tmp_r0, tmp_r0, #4
2403 // mov tmp_v0.d[1], tmp_r0
2404 // and tmp_v0.16b, tmp_v1.16b, tmp_v0.16b
2405 // addv tmp_v0h, tmp_v0.8h
2406 // mov dst_r, tmp_v0.h[0]
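// Illustrative: the constant below holds lane bits 0..3 (0x1, 0x2, 0x4,
// 0x8) for the low four lanes; shifting it left by 4 yields 0x10, 0x20,
// 0x40, 0x80 for lanes 4..7, which is then inserted as the high
// double-word of the mask.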
2407 ctx.emit(Inst::VecShiftImm {
2408 op: VecShiftImmOp::Sshr,
2409 rd: tmp_v1,
2410 rn: src_v,
2411 size: VectorSize::Size16x8,
2412 imm: 15,
2413 });
2414 lower_constant_u64(ctx, tmp_r0, 0x0008000400020001u64);
2415 ctx.emit(Inst::VecDup {
2416 rd: tmp_v0,
2417 rn: tmp_r0.to_reg(),
2418 size: VectorSize::Size64x2,
2419 });
2420 ctx.emit(Inst::AluRRImmShift {
2421 alu_op: ALUOp::Lsl64,
2422 rd: tmp_r0,
2423 rn: tmp_r0.to_reg(),
2424 immshift: ImmShift { imm: 4 },
2425 });
2426 ctx.emit(Inst::MovToVec {
2427 rd: tmp_v0,
2428 rn: tmp_r0.to_reg(),
2429 idx: 1,
2430 size: VectorSize::Size64x2,
2431 });
2432 ctx.emit(Inst::VecRRR {
2433 alu_op: VecALUOp::And,
2434 rd: tmp_v0,
2435 rn: tmp_v1.to_reg(),
2436 rm: tmp_v0.to_reg(),
2437 size: VectorSize::Size8x16,
2438 });
2439 ctx.emit(Inst::VecLanes {
2440 op: VecLanesOp::Addv,
2441 rd: tmp_v0,
2442 rn: tmp_v0.to_reg(),
2443 size: VectorSize::Size16x8,
2444 });
2445 ctx.emit(Inst::MovFromVec {
2446 rd: dst_r,
2447 rn: tmp_v0.to_reg(),
2448 idx: 0,
2449 size: VectorSize::Size16x8,
2450 });
2451 }
2452 I32X4 => {
2453 // sshr tmp_v1.4s, src_v.4s, #31
2454 // mov tmp_r0, #0x1
2455 // movk tmp_r0, #0x2, lsl 32
2456 // dup tmp_v0.2d, tmp_r0
2457 // lsl tmp_r0, tmp_r0, #2
2458 // mov tmp_v0.d[1], tmp_r0
2459 // and tmp_v0.16b, tmp_v1.16b, tmp_v0.16b
2460 // addv tmp_v0s, tmp_v0.4s
2461 // mov dst_r, tmp_v0.s[0]
2462 ctx.emit(Inst::VecShiftImm {
2463 op: VecShiftImmOp::Sshr,
2464 rd: tmp_v1,
2465 rn: src_v,
2466 size: VectorSize::Size32x4,
2467 imm: 31,
2468 });
2469 lower_constant_u64(ctx, tmp_r0, 0x0000000200000001u64);
2470 ctx.emit(Inst::VecDup {
2471 rd: tmp_v0,
2472 rn: tmp_r0.to_reg(),
2473 size: VectorSize::Size64x2,
2474 });
2475 ctx.emit(Inst::AluRRImmShift {
2476 alu_op: ALUOp::Lsl64,
2477 rd: tmp_r0,
2478 rn: tmp_r0.to_reg(),
2479 immshift: ImmShift { imm: 2 },
2480 });
2481 ctx.emit(Inst::MovToVec {
2482 rd: tmp_v0,
2483 rn: tmp_r0.to_reg(),
2484 idx: 1,
2485 size: VectorSize::Size64x2,
2486 });
2487 ctx.emit(Inst::VecRRR {
2488 alu_op: VecALUOp::And,
2489 rd: tmp_v0,
2490 rn: tmp_v1.to_reg(),
2491 rm: tmp_v0.to_reg(),
2492 size: VectorSize::Size8x16,
2493 });
2494 ctx.emit(Inst::VecLanes {
2495 op: VecLanesOp::Addv,
2496 rd: tmp_v0,
2497 rn: tmp_v0.to_reg(),
2498 size: VectorSize::Size32x4,
2499 });
2500 ctx.emit(Inst::MovFromVec {
2501 rd: dst_r,
2502 rn: tmp_v0.to_reg(),
2503 idx: 0,
2504 size: VectorSize::Size32x4,
2505 });
2506 }
2507 I64X2 => {
2508 // mov dst_r, src_v.d[0]
2509 // mov tmp_r0, src_v.d[1]
2510 // lsr dst_r, dst_r, #63
2511 // lsr tmp_r0, tmp_r0, #63
2512 // add dst_r, dst_r, tmp_r0, lsl #1
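// Illustrative: for lanes [negative, non-negative], the two lsr
// instructions produce 1 and 0 respectively, and the final add with
// `lsl #1` combines them into 0b01 = 1.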
2513 ctx.emit(Inst::MovFromVec {
2514 rd: dst_r,
2515 rn: src_v,
2516 idx: 0,
2517 size: VectorSize::Size64x2,
2518 });
2519 ctx.emit(Inst::MovFromVec {
2520 rd: tmp_r0,
2521 rn: src_v,
2522 idx: 1,
2523 size: VectorSize::Size64x2,
2524 });
2525 ctx.emit(Inst::AluRRImmShift {
2526 alu_op: ALUOp::Lsr64,
2527 rd: dst_r,
2528 rn: dst_r.to_reg(),
2529 immshift: ImmShift::maybe_from_u64(63).unwrap(),
2530 });
2531 ctx.emit(Inst::AluRRImmShift {
2532 alu_op: ALUOp::Lsr64,
2533 rd: tmp_r0,
2534 rn: tmp_r0.to_reg(),
2535 immshift: ImmShift::maybe_from_u64(63).unwrap(),
2536 });
2537 ctx.emit(Inst::AluRRRShift {
2538 alu_op: ALUOp::Add32,
2539 rd: dst_r,
2540 rn: dst_r.to_reg(),
2541 rm: tmp_r0.to_reg(),
2542 shiftop: ShiftOpAndAmt::new(
2543 ShiftOp::LSL,
2544 ShiftOpShiftImm::maybe_from_shift(1).unwrap(),
2545 ),
2546 });
2547 }
2548 _ => panic!("arm64 isel: VhighBits unhandled, ty = {:?}", ty),
2549 }
2550 }
2551
2552 Opcode::Shuffle => {
2553 let mask = const_param_to_u128(ctx, insn).expect("Invalid immediate mask bytes");
2554 let rd = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
2555 let rn = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None);
2556 let rn2 = put_input_in_reg(ctx, inputs[1], NarrowValueMode::None);
2557 // 2 register table vector lookups require consecutive table registers;
2558 // we satisfy this constraint by hardcoding the usage of v29 and v30.
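// A two-register tbl treats {v29, v30} as a single 32-byte table: each
// mask byte selects the table byte at that index, and out-of-range
// indices yield 0, which matches shuffle semantics.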
2559 let temp = writable_vreg(29);
2560 let temp2 = writable_vreg(30);
2561 let input_ty = ctx.input_ty(insn, 0);
2562 assert_eq!(input_ty, ctx.input_ty(insn, 1));
2563 // Make sure that both inputs are in virtual registers, since it is
2564 // not guaranteed that we can get them safely to the temporaries if
2565 // either is in a real register.
2566 let rn = ctx.ensure_in_vreg(rn, input_ty);
2567 let rn2 = ctx.ensure_in_vreg(rn2, input_ty);
2568
2569 lower_constant_f128(ctx, rd, mask);
2570 ctx.emit(Inst::gen_move(temp, rn, input_ty));
2571 ctx.emit(Inst::gen_move(temp2, rn2, input_ty));
2572 ctx.emit(Inst::VecTbl2 {
2573 rd,
2574 rn: temp.to_reg(),
2575 rn2: temp2.to_reg(),
2576 rm: rd.to_reg(),
2577 is_extension: false,
2578 });
2579 }
2580
2581 Opcode::Swizzle => {
2582 let rd = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
2583 let rm = put_input_in_reg(ctx, inputs[1], NarrowValueMode::None);
2584 let rn = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None);
2585
2586 ctx.emit(Inst::VecTbl {
2587 rd,
2588 rn,
2589 rm,
2590 is_extension: false,
2591 });
2592 }
2593
2594 Opcode::Isplit => {
2595 assert_eq!(
2596 ctx.input_ty(insn, 0),
2597 I128,
2598 "Isplit only implemented for i128's"
2599 );
2600 assert_eq!(ctx.output_ty(insn, 0), I64);
2601 assert_eq!(ctx.output_ty(insn, 1), I64);
2602
2603 let src_regs = put_input_in_regs(ctx, inputs[0]);
2604 let dst_lo = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
2605 let dst_hi = get_output_reg(ctx, outputs[1]).only_reg().unwrap();
2606
2607 ctx.emit(Inst::gen_move(dst_lo, src_regs.regs()[0], I64));
2608 ctx.emit(Inst::gen_move(dst_hi, src_regs.regs()[1], I64));
2609 }
2610
2611 Opcode::Iconcat => {
2612 assert_eq!(
2613 ctx.output_ty(insn, 0),
2614 I128,
2615 "Iconcat only implemented for i128's"
2616 );
2617 assert_eq!(ctx.input_ty(insn, 0), I64);
2618 assert_eq!(ctx.input_ty(insn, 1), I64);
2619
2620 let src_lo = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None);
2621 let src_hi = put_input_in_reg(ctx, inputs[1], NarrowValueMode::None);
2622 let dst = get_output_reg(ctx, outputs[0]);
2623
2624 ctx.emit(Inst::gen_move(dst.regs()[0], src_lo, I64));
2625 ctx.emit(Inst::gen_move(dst.regs()[1], src_hi, I64));
2626 }
2627
2628 Opcode::Imax | Opcode::Umax | Opcode::Umin | Opcode::Imin => {
2629 let alu_op = match op {
2630 Opcode::Umin => VecALUOp::Umin,
2631 Opcode::Imin => VecALUOp::Smin,
2632 Opcode::Umax => VecALUOp::Umax,
2633 Opcode::Imax => VecALUOp::Smax,
2634 _ => unreachable!(),
2635 };
2636 let rd = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
2637 let rn = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None);
2638 let rm = put_input_in_reg(ctx, inputs[1], NarrowValueMode::None);
2639 let ty = ty.unwrap();
2640 ctx.emit(Inst::VecRRR {
2641 alu_op,
2642 rd,
2643 rn,
2644 rm,
2645 size: VectorSize::from_ty(ty),
2646 });
2647 }
2648
2649 Opcode::IaddPairwise => {
2650 let ty = ty.unwrap();
2651 let lane_type = ty.lane_type();
2652 let rd = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
2653
2654 let mut match_long_pair =
2655 |ext_low_op, ext_high_op| -> Option<(VecRRPairLongOp, regalloc::Reg)> {
2656 if let Some(lhs) = maybe_input_insn(ctx, inputs[0], ext_low_op) {
2657 if let Some(rhs) = maybe_input_insn(ctx, inputs[1], ext_high_op) {
2658 let lhs_inputs = insn_inputs(ctx, lhs);
2659 let rhs_inputs = insn_inputs(ctx, rhs);
2660 let low = put_input_in_reg(ctx, lhs_inputs[0], NarrowValueMode::None);
2661 let high = put_input_in_reg(ctx, rhs_inputs[0], NarrowValueMode::None);
2662 if low == high {
2663 match (lane_type, ext_low_op) {
2664 (I16, Opcode::SwidenLow) => {
2665 return Some((VecRRPairLongOp::Saddlp8, low))
2666 }
2667 (I32, Opcode::SwidenLow) => {
2668 return Some((VecRRPairLongOp::Saddlp16, low))
2669 }
2670 (I16, Opcode::UwidenLow) => {
2671 return Some((VecRRPairLongOp::Uaddlp8, low))
2672 }
2673 (I32, Opcode::UwidenLow) => {
2674 return Some((VecRRPairLongOp::Uaddlp16, low))
2675 }
2676 _ => (),
2677 };
2678 }
2679 }
2680 }
2681 None
2682 };
2683
2684 if let Some((op, rn)) = match_long_pair(Opcode::SwidenLow, Opcode::SwidenHigh) {
2685 ctx.emit(Inst::VecRRPairLong { op, rd, rn });
2686 } else if let Some((op, rn)) = match_long_pair(Opcode::UwidenLow, Opcode::UwidenHigh) {
2687 ctx.emit(Inst::VecRRPairLong { op, rd, rn });
2688 } else {
2689 let rn = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None);
2690 let rm = put_input_in_reg(ctx, inputs[1], NarrowValueMode::None);
2691 ctx.emit(Inst::VecRRR {
2692 alu_op: VecALUOp::Addp,
2693 rd,
2694 rn,
2695 rm,
2696 size: VectorSize::from_ty(ty),
2697 });
2698 }
2699 }
2700
2701 Opcode::WideningPairwiseDotProductS => {
2702 let r_y = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
2703 let r_a = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None);
2704 let r_b = put_input_in_reg(ctx, inputs[1], NarrowValueMode::None);
2705 let ty = ty.unwrap();
2706 if ty == I32X4 {
2707 let tmp = ctx.alloc_tmp(I8X16).only_reg().unwrap();
2708 // The args have type I16X8.
2709 // "y = i32x4.dot_i16x8_s(a, b)"
2710 // => smull tmp, a, b
2711 // smull2 y, a, b
2712 // addp y, tmp, y
2713 ctx.emit(Inst::VecRRRLong {
2714 alu_op: VecRRRLongOp::Smull16,
2715 rd: tmp,
2716 rn: r_a,
2717 rm: r_b,
2718 high_half: false,
2719 });
2720 ctx.emit(Inst::VecRRRLong {
2721 alu_op: VecRRRLongOp::Smull16,
2722 rd: r_y,
2723 rn: r_a,
2724 rm: r_b,
2725 high_half: true,
2726 });
2727 ctx.emit(Inst::VecRRR {
2728 alu_op: VecALUOp::Addp,
2729 rd: r_y,
2730 rn: tmp.to_reg(),
2731 rm: r_y.to_reg(),
2732 size: VectorSize::Size32x4,
2733 });
2734 } else {
2735 return Err(CodegenError::Unsupported(format!(
2736 "Opcode::WideningPairwiseDotProductS: unsupported lane arrangement: {:?}",
2737 ty
2738 )));
2739 }
2740 }
2741
2742 Opcode::Fadd | Opcode::Fsub | Opcode::Fmul | Opcode::Fdiv | Opcode::Fmin | Opcode::Fmax => {
2743 let ty = ty.unwrap();
2744 let bits = ty_bits(ty);
2745 let rn = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None);
2746 let rm = put_input_in_reg(ctx, inputs[1], NarrowValueMode::None);
2747 let rd = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
2748 if !ty.is_vector() {
2749 let fpu_op = match (op, bits) {
2750 (Opcode::Fadd, 32) => FPUOp2::Add32,
2751 (Opcode::Fadd, 64) => FPUOp2::Add64,
2752 (Opcode::Fsub, 32) => FPUOp2::Sub32,
2753 (Opcode::Fsub, 64) => FPUOp2::Sub64,
2754 (Opcode::Fmul, 32) => FPUOp2::Mul32,
2755 (Opcode::Fmul, 64) => FPUOp2::Mul64,
2756 (Opcode::Fdiv, 32) => FPUOp2::Div32,
2757 (Opcode::Fdiv, 64) => FPUOp2::Div64,
2758 (Opcode::Fmin, 32) => FPUOp2::Min32,
2759 (Opcode::Fmin, 64) => FPUOp2::Min64,
2760 (Opcode::Fmax, 32) => FPUOp2::Max32,
2761 (Opcode::Fmax, 64) => FPUOp2::Max64,
2762 _ => panic!("Unknown op/bits combination"),
2763 };
2764 ctx.emit(Inst::FpuRRR { fpu_op, rd, rn, rm });
2765 } else {
2766 let alu_op = match op {
2767 Opcode::Fadd => VecALUOp::Fadd,
2768 Opcode::Fsub => VecALUOp::Fsub,
2769 Opcode::Fdiv => VecALUOp::Fdiv,
2770 Opcode::Fmax => VecALUOp::Fmax,
2771 Opcode::Fmin => VecALUOp::Fmin,
2772 Opcode::Fmul => VecALUOp::Fmul,
2773 _ => unreachable!(),
2774 };
2775
2776 ctx.emit(Inst::VecRRR {
2777 rd,
2778 rn,
2779 rm,
2780 alu_op,
2781 size: VectorSize::from_ty(ty),
2782 });
2783 }
2784 }
2785
2786 Opcode::FminPseudo | Opcode::FmaxPseudo => {
2787 let ty = ctx.input_ty(insn, 0);
2788 if ty == F32X4 || ty == F64X2 {
2789 // pmin(a,b) => bitsel(b, a, cmpgt(a, b))
2790 // pmax(a,b) => bitsel(b, a, cmpgt(b, a))
2791 let r_dst = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
2792 let r_a = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None);
2793 let r_b = put_input_in_reg(ctx, inputs[1], NarrowValueMode::None);
2794 // Since we're going to write the output register `r_dst` anyway, we might as
2795 // well first use it to hold the comparison result. This has the slightly unusual
2796 // effect that we modify the output register in the first instruction (`fcmgt`)
2797 // but read both the inputs again in the second instruction (`bsl`), which means
2798 // that the output register can't be either of the input registers. Regalloc
2799 // should handle this correctly, nevertheless.
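// (bsl computes rd = (rd & rn) | (!rd & rm); with rd holding the
// all-ones/all-zeroes comparison mask, lanes where the fcmgt was true
// select r_b and the remaining lanes select r_a.)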
2800 ctx.emit(Inst::VecRRR {
2801 alu_op: VecALUOp::Fcmgt,
2802 rd: r_dst,
2803 rn: if op == Opcode::FminPseudo { r_a } else { r_b },
2804 rm: if op == Opcode::FminPseudo { r_b } else { r_a },
2805 size: if ty == F32X4 {
2806 VectorSize::Size32x4
2807 } else {
2808 VectorSize::Size64x2
2809 },
2810 });
2811 ctx.emit(Inst::VecRRR {
2812 alu_op: VecALUOp::Bsl,
2813 rd: r_dst,
2814 rn: r_b,
2815 rm: r_a,
2816 size: VectorSize::Size8x16,
2817 });
2818 } else {
2819 panic!("Opcode::FminPseudo | Opcode::FmaxPseudo: unhandled type");
2820 }
2821 }
2822
2823 Opcode::Sqrt | Opcode::Fneg | Opcode::Fabs | Opcode::Fpromote | Opcode::Fdemote => {
2824 let ty = ty.unwrap();
2825 let bits = ty_bits(ty);
2826 let rn = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None);
2827 let rd = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
2828 if !ty.is_vector() {
2829 let fpu_op = match (op, bits) {
2830 (Opcode::Sqrt, 32) => FPUOp1::Sqrt32,
2831 (Opcode::Sqrt, 64) => FPUOp1::Sqrt64,
2832 (Opcode::Fneg, 32) => FPUOp1::Neg32,
2833 (Opcode::Fneg, 64) => FPUOp1::Neg64,
2834 (Opcode::Fabs, 32) => FPUOp1::Abs32,
2835 (Opcode::Fabs, 64) => FPUOp1::Abs64,
2836 (Opcode::Fpromote, 32) => panic!("Cannot promote to 32 bits"),
2837 (Opcode::Fpromote, 64) => FPUOp1::Cvt32To64,
2838 (Opcode::Fdemote, 32) => FPUOp1::Cvt64To32,
2839 (Opcode::Fdemote, 64) => panic!("Cannot demote to 64 bits"),
2840 _ => panic!("Unknown op/bits combination"),
2841 };
2842 ctx.emit(Inst::FpuRR { fpu_op, rd, rn });
2843 } else {
2844 let op = match op {
2845 Opcode::Fabs => VecMisc2::Fabs,
2846 Opcode::Fneg => VecMisc2::Fneg,
2847 Opcode::Sqrt => VecMisc2::Fsqrt,
2848 _ => unimplemented!(),
2849 };
2850
2851 ctx.emit(Inst::VecMisc {
2852 op,
2853 rd,
2854 rn,
2855 size: VectorSize::from_ty(ty),
2856 });
2857 }
2858 }
2859
2860 Opcode::Ceil | Opcode::Floor | Opcode::Trunc | Opcode::Nearest => {
2861 let ty = ctx.output_ty(insn, 0);
2862 if !ty.is_vector() {
2863 let bits = ty_bits(ty);
2864 let op = match (op, bits) {
2865 (Opcode::Ceil, 32) => FpuRoundMode::Plus32,
2866 (Opcode::Ceil, 64) => FpuRoundMode::Plus64,
2867 (Opcode::Floor, 32) => FpuRoundMode::Minus32,
2868 (Opcode::Floor, 64) => FpuRoundMode::Minus64,
2869 (Opcode::Trunc, 32) => FpuRoundMode::Zero32,
2870 (Opcode::Trunc, 64) => FpuRoundMode::Zero64,
2871 (Opcode::Nearest, 32) => FpuRoundMode::Nearest32,
2872 (Opcode::Nearest, 64) => FpuRoundMode::Nearest64,
2873 _ => panic!("Unknown op/bits combination (scalar)"),
2874 };
2875 let rn = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None);
2876 let rd = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
2877 ctx.emit(Inst::FpuRound { op, rd, rn });
2878 } else {
2879 let (op, size) = match (op, ty) {
2880 (Opcode::Ceil, F32X4) => (VecMisc2::Frintp, VectorSize::Size32x4),
2881 (Opcode::Ceil, F64X2) => (VecMisc2::Frintp, VectorSize::Size64x2),
2882 (Opcode::Floor, F32X4) => (VecMisc2::Frintm, VectorSize::Size32x4),
2883 (Opcode::Floor, F64X2) => (VecMisc2::Frintm, VectorSize::Size64x2),
2884 (Opcode::Trunc, F32X4) => (VecMisc2::Frintz, VectorSize::Size32x4),
2885 (Opcode::Trunc, F64X2) => (VecMisc2::Frintz, VectorSize::Size64x2),
2886 (Opcode::Nearest, F32X4) => (VecMisc2::Frintn, VectorSize::Size32x4),
2887 (Opcode::Nearest, F64X2) => (VecMisc2::Frintn, VectorSize::Size64x2),
2888 _ => panic!("Unknown op/ty combination (vector): {:?}", ty),
2889 };
2890 let rn = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None);
2891 let rd = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
2892 ctx.emit(Inst::VecMisc { op, rd, rn, size });
2893 }
2894 }
2895
2896 Opcode::Fma => {
2897 let bits = ty_bits(ctx.output_ty(insn, 0));
2898 let fpu_op = match bits {
2899 32 => FPUOp3::MAdd32,
2900 64 => FPUOp3::MAdd64,
2901 _ => panic!("Unknown op size"),
2902 };
2903 let rn = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None);
2904 let rm = put_input_in_reg(ctx, inputs[1], NarrowValueMode::None);
2905 let ra = put_input_in_reg(ctx, inputs[2], NarrowValueMode::None);
2906 let rd = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
2907 ctx.emit(Inst::FpuRRRR {
2908 fpu_op,
2909 rn,
2910 rm,
2911 ra,
2912 rd,
2913 });
2914 }
2915
2916 Opcode::Fcopysign => {
2917 // Copy the sign bit from inputs[1] to inputs[0]. We use the sequence below.
2918 //
2919 // This is a scalar Fcopysign: it uses scalar NEON operations for 64-bit
2920 // values and vector operations (2S) for 32-bit values. In the latter case
2921 // the sequence still clears all bits above the lowest 32.
2922 //
2923 // mov vd, vn
2924 // ushr vtmp, vm, #63 / #31
2925 // sli vd, vtmp, #63 / #31
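// For example (illustrative, f64 case): ushr moves the sign bit of vm
// down to bit 0 of vtmp; sli then shifts it back up by 63 and inserts it
// into vd, leaving vd's other 63 bits untouched.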
2926
2927 let ty = ctx.output_ty(insn, 0);
2928 let bits = ty_bits(ty) as u8;
2929 assert!(bits == 32 || bits == 64);
2930 let rn = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None);
2931 let rm = put_input_in_reg(ctx, inputs[1], NarrowValueMode::None);
2932 let rd = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
2933 let tmp = ctx.alloc_tmp(F64).only_reg().unwrap();
2934
2935 // Copy LHS to rd.
2936 ctx.emit(Inst::gen_move(rd, rn, ty));
2937
2938 // Copy the sign bit to the lowest bit in tmp.
2939 let imm = FPURightShiftImm::maybe_from_u8(bits - 1, bits).unwrap();
2940 ctx.emit(Inst::FpuRRI {
2941 fpu_op: choose_32_64(ty, FPUOpRI::UShr32(imm), FPUOpRI::UShr64(imm)),
2942 rd: tmp,
2943 rn: rm,
2944 });
2945
2946 // Insert the bit from tmp into the sign bit of rd.
2947 let imm = FPULeftShiftImm::maybe_from_u8(bits - 1, bits).unwrap();
2948 ctx.emit(Inst::FpuRRI {
2949 fpu_op: choose_32_64(ty, FPUOpRI::Sli32(imm), FPUOpRI::Sli64(imm)),
2950 rd,
2951 rn: tmp.to_reg(),
2952 });
2953 }
2954
2955 Opcode::FcvtToUint | Opcode::FcvtToSint => {
2956 let in_bits = ty_bits(ctx.input_ty(insn, 0));
2957 let out_bits = ty_bits(ctx.output_ty(insn, 0));
2958 let signed = op == Opcode::FcvtToSint;
2959 let op = match (signed, in_bits, out_bits) {
2960 (false, 32, 8) | (false, 32, 16) | (false, 32, 32) => FpuToIntOp::F32ToU32,
2961 (true, 32, 8) | (true, 32, 16) | (true, 32, 32) => FpuToIntOp::F32ToI32,
2962 (false, 32, 64) => FpuToIntOp::F32ToU64,
2963 (true, 32, 64) => FpuToIntOp::F32ToI64,
2964 (false, 64, 8) | (false, 64, 16) | (false, 64, 32) => FpuToIntOp::F64ToU32,
2965 (true, 64, 8) | (true, 64, 16) | (true, 64, 32) => FpuToIntOp::F64ToI32,
2966 (false, 64, 64) => FpuToIntOp::F64ToU64,
2967 (true, 64, 64) => FpuToIntOp::F64ToI64,
2968 _ => panic!("Unknown input/output-bits combination"),
2969 };
2970
2971 let rn = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None);
2972 let rd = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
2973
2974 // First, check the input: per wasm semantics, the NaN check must be
2975 // performed before the in-bounds (range) check.
2976
2977 // Check that the input is not a NaN.
2978 if in_bits == 32 {
2979 ctx.emit(Inst::FpuCmp32 { rn, rm: rn });
2980 } else {
2981 ctx.emit(Inst::FpuCmp64 { rn, rm: rn });
2982 }
2983 let trap_code = TrapCode::BadConversionToInteger;
2984 ctx.emit(Inst::TrapIf {
2985 trap_code,
2986 kind: CondBrKind::Cond(lower_fp_condcode(FloatCC::Unordered)),
2987 });
2988
2989 let tmp = ctx.alloc_tmp(I8X16).only_reg().unwrap();
2990
2991 // Check that the input is in range, with "truncate towards zero" semantics. This means
2992 // we allow values that are slightly out of range:
2993 // - for signed conversions, we allow values strictly greater than INT_MIN-1 (when this
2994 // can be represented), and strictly less than INT_MAX+1 (when this can be
2995 // represented).
2996 // - for unsigned conversions, we allow values strictly greater than -1, and strictly
2997 // less than UINT_MAX+1 (when this can be represented).
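// For example (illustrative, f32 -> i32): i32::MIN itself must be
// accepted, but i32::MIN - 1 is not representable as an f32, so the low
// bound below is i32::MIN with GreaterThanOrEqual rather than an
// exclusive GreaterThan bound.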
2998
2999 if in_bits == 32 {
3000 // From float32.
3001 let (low_bound, low_cond, high_bound) = match (signed, out_bits) {
3002 (true, 8) => (
3003 i8::min_value() as f32 - 1.,
3004 FloatCC::GreaterThan,
3005 i8::max_value() as f32 + 1.,
3006 ),
3007 (true, 16) => (
3008 i16::min_value() as f32 - 1.,
3009 FloatCC::GreaterThan,
3010 i16::max_value() as f32 + 1.,
3011 ),
3012 (true, 32) => (
3013 i32::min_value() as f32, // I32_MIN - 1 isn't precisely representable as an f32.
3014 FloatCC::GreaterThanOrEqual,
3015 i32::max_value() as f32 + 1.,
3016 ),
3017 (true, 64) => (
3018 i64::min_value() as f32, // I64_MIN - 1 isn't precisely representable as an f32.
3019 FloatCC::GreaterThanOrEqual,
3020 i64::max_value() as f32 + 1.,
3021 ),
3022 (false, 8) => (-1., FloatCC::GreaterThan, u8::max_value() as f32 + 1.),
3023 (false, 16) => (-1., FloatCC::GreaterThan, u16::max_value() as f32 + 1.),
3024 (false, 32) => (-1., FloatCC::GreaterThan, u32::max_value() as f32 + 1.),
3025 (false, 64) => (-1., FloatCC::GreaterThan, u64::max_value() as f32 + 1.),
3026 _ => panic!("Unknown input/output-bits combination"),
3027 };
3028
3029 // >= low_bound
3030 lower_constant_f32(ctx, tmp, low_bound);
3031 ctx.emit(Inst::FpuCmp32 {
3032 rn,
3033 rm: tmp.to_reg(),
3034 });
3035 let trap_code = TrapCode::IntegerOverflow;
3036 ctx.emit(Inst::TrapIf {
3037 trap_code,
3038 kind: CondBrKind::Cond(lower_fp_condcode(low_cond).invert()),
3039 });
3040
3041 // <= high_bound
3042 lower_constant_f32(ctx, tmp, high_bound);
3043 ctx.emit(Inst::FpuCmp32 {
3044 rn,
3045 rm: tmp.to_reg(),
3046 });
3047 let trap_code = TrapCode::IntegerOverflow;
3048 ctx.emit(Inst::TrapIf {
3049 trap_code,
3050 kind: CondBrKind::Cond(lower_fp_condcode(FloatCC::LessThan).invert()),
3051 });
3052 } else {
3053 // From float64.
3054 let (low_bound, low_cond, high_bound) = match (signed, out_bits) {
3055 (true, 8) => (
3056 i8::min_value() as f64 - 1.,
3057 FloatCC::GreaterThan,
3058 i8::max_value() as f64 + 1.,
3059 ),
3060 (true, 16) => (
3061 i16::min_value() as f64 - 1.,
3062 FloatCC::GreaterThan,
3063 i16::max_value() as f64 + 1.,
3064 ),
3065 (true, 32) => (
3066 i32::min_value() as f64 - 1.,
3067 FloatCC::GreaterThan,
3068 i32::max_value() as f64 + 1.,
3069 ),
3070 (true, 64) => (
3071 i64::min_value() as f64, // I64_MIN - 1 is not precisely representable as an f64.
3072 FloatCC::GreaterThanOrEqual,
3073 i64::max_value() as f64 + 1.,
3074 ),
3075 (false, 8) => (-1., FloatCC::GreaterThan, u8::max_value() as f64 + 1.),
3076 (false, 16) => (-1., FloatCC::GreaterThan, u16::max_value() as f64 + 1.),
3077 (false, 32) => (-1., FloatCC::GreaterThan, u32::max_value() as f64 + 1.),
3078 (false, 64) => (-1., FloatCC::GreaterThan, u64::max_value() as f64 + 1.),
3079 _ => panic!("Unknown input/output-bits combination"),
3080 };
3081
3082 // >= low_bound
3083 lower_constant_f64(ctx, tmp, low_bound);
3084 ctx.emit(Inst::FpuCmp64 {
3085 rn,
3086 rm: tmp.to_reg(),
3087 });
3088 let trap_code = TrapCode::IntegerOverflow;
3089 ctx.emit(Inst::TrapIf {
3090 trap_code,
3091 kind: CondBrKind::Cond(lower_fp_condcode(low_cond).invert()),
3092 });
3093
3094 // <= high_bound
3095 lower_constant_f64(ctx, tmp, high_bound);
3096 ctx.emit(Inst::FpuCmp64 {
3097 rn,
3098 rm: tmp.to_reg(),
3099 });
3100 let trap_code = TrapCode::IntegerOverflow;
3101 ctx.emit(Inst::TrapIf {
3102 trap_code,
3103 kind: CondBrKind::Cond(lower_fp_condcode(FloatCC::LessThan).invert()),
3104 });
3105 };
3106
3107 // Do the conversion.
3108 ctx.emit(Inst::FpuToInt { op, rd, rn });
3109 }
3110
3111 Opcode::FcvtFromUint | Opcode::FcvtFromSint => {
3112 let ty = ty.unwrap();
3113 let signed = op == Opcode::FcvtFromSint;
3114 let rd = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
3115
3116 if ty.is_vector() {
3117 let op = if signed {
3118 VecMisc2::Scvtf
3119 } else {
3120 VecMisc2::Ucvtf
3121 };
3122 let rn = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None);
3123
3124 ctx.emit(Inst::VecMisc {
3125 op,
3126 rd,
3127 rn,
3128 size: VectorSize::from_ty(ty),
3129 });
3130 } else {
3131 let in_bits = ty_bits(ctx.input_ty(insn, 0));
3132 let out_bits = ty_bits(ty);
3133 let op = match (signed, in_bits, out_bits) {
3134 (false, 8, 32) | (false, 16, 32) | (false, 32, 32) => IntToFpuOp::U32ToF32,
3135 (true, 8, 32) | (true, 16, 32) | (true, 32, 32) => IntToFpuOp::I32ToF32,
3136 (false, 8, 64) | (false, 16, 64) | (false, 32, 64) => IntToFpuOp::U32ToF64,
3137 (true, 8, 64) | (true, 16, 64) | (true, 32, 64) => IntToFpuOp::I32ToF64,
3138 (false, 64, 32) => IntToFpuOp::U64ToF32,
3139 (true, 64, 32) => IntToFpuOp::I64ToF32,
3140 (false, 64, 64) => IntToFpuOp::U64ToF64,
3141 (true, 64, 64) => IntToFpuOp::I64ToF64,
3142 _ => panic!("Unknown input/output-bits combination"),
3143 };
3144 let narrow_mode = match (signed, in_bits) {
3145 (false, 8) | (false, 16) | (false, 32) => NarrowValueMode::ZeroExtend32,
3146 (true, 8) | (true, 16) | (true, 32) => NarrowValueMode::SignExtend32,
3147 (false, 64) => NarrowValueMode::ZeroExtend64,
3148 (true, 64) => NarrowValueMode::SignExtend64,
3149 _ => panic!("Unknown input size"),
3150 };
3151 let rn = put_input_in_reg(ctx, inputs[0], narrow_mode);
3152 ctx.emit(Inst::IntToFpu { op, rd, rn });
3153 }
3154 }
3155
3156 Opcode::FcvtToUintSat | Opcode::FcvtToSintSat => {
3157 let ty = ty.unwrap();
3158 let out_signed = op == Opcode::FcvtToSintSat;
3159 let rn = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None);
3160 let rd = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
3161
3162 if ty.is_vector() {
3163 let op = if out_signed {
3164 VecMisc2::Fcvtzs
3165 } else {
3166 VecMisc2::Fcvtzu
3167 };
3168
3169 ctx.emit(Inst::VecMisc {
3170 op,
3171 rd,
3172 rn,
3173 size: VectorSize::from_ty(ty),
3174 });
3175 } else {
3176 let in_ty = ctx.input_ty(insn, 0);
3177 let in_bits = ty_bits(in_ty);
3178 let out_bits = ty_bits(ty);
3179 // FIMM Vtmp1, u32::MAX or u64::MAX or i32::MAX or i64::MAX
3180 // FMIN Vtmp2, Vin, Vtmp1
3181 // FIMM Vtmp1, 0 or 0 or i32::MIN or i64::MIN
3182 // FMAX Vtmp2, Vtmp2, Vtmp1
3183 // (if signed) FIMM Vtmp1, 0
3184 // FCMP Vin, Vin
3185 // FCSEL Vtmp2, Vtmp1, Vtmp2, NE // on NaN, select 0
3186 // convert Rout, Vtmp2
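// Illustrative behaviour of this sequence: a NaN input makes the FCMP
// unordered, so the FCSEL (cond NE) selects the zero in Vtmp1, while
// out-of-range inputs are clamped by the FMIN/FMAX pair; the final
// conversion therefore always sees an in-range (or zero) value.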
3187
3188 assert!(in_bits == 32 || in_bits == 64);
3189 assert!(out_bits == 32 || out_bits == 64);
3190
3191 let min: f64 = match (out_bits, out_signed) {
3192 (32, true) => std::i32::MIN as f64,
3193 (32, false) => 0.0,
3194 (64, true) => std::i64::MIN as f64,
3195 (64, false) => 0.0,
3196 _ => unreachable!(),
3197 };
3198
3199 let max = match (out_bits, out_signed) {
3200 (32, true) => std::i32::MAX as f64,
3201 (32, false) => std::u32::MAX as f64,
3202 (64, true) => std::i64::MAX as f64,
3203 (64, false) => std::u64::MAX as f64,
3204 _ => unreachable!(),
3205 };
3206
3207 let rtmp1 = ctx.alloc_tmp(in_ty).only_reg().unwrap();
3208 let rtmp2 = ctx.alloc_tmp(in_ty).only_reg().unwrap();
3209
3210 if in_bits == 32 {
3211 lower_constant_f32(ctx, rtmp1, max as f32);
3212 } else {
3213 lower_constant_f64(ctx, rtmp1, max);
3214 }
3215 ctx.emit(Inst::FpuRRR {
3216 fpu_op: choose_32_64(in_ty, FPUOp2::Min32, FPUOp2::Min64),
3217 rd: rtmp2,
3218 rn,
3219 rm: rtmp1.to_reg(),
3220 });
3221 if in_bits == 32 {
3222 lower_constant_f32(ctx, rtmp1, min as f32);
3223 } else {
3224 lower_constant_f64(ctx, rtmp1, min);
3225 }
3226 ctx.emit(Inst::FpuRRR {
3227 fpu_op: choose_32_64(in_ty, FPUOp2::Max32, FPUOp2::Max64),
3228 rd: rtmp2,
3229 rn: rtmp2.to_reg(),
3230 rm: rtmp1.to_reg(),
3231 });
3232 if out_signed {
3233 if in_bits == 32 {
3234 lower_constant_f32(ctx, rtmp1, 0.0);
3235 } else {
3236 lower_constant_f64(ctx, rtmp1, 0.0);
3237 }
3238 }
3239 if in_bits == 32 {
3240 ctx.emit(Inst::FpuCmp32 { rn, rm: rn });
3241 ctx.emit(Inst::FpuCSel32 {
3242 rd: rtmp2,
3243 rn: rtmp1.to_reg(),
3244 rm: rtmp2.to_reg(),
3245 cond: Cond::Ne,
3246 });
3247 } else {
3248 ctx.emit(Inst::FpuCmp64 { rn, rm: rn });
3249 ctx.emit(Inst::FpuCSel64 {
3250 rd: rtmp2,
3251 rn: rtmp1.to_reg(),
3252 rm: rtmp2.to_reg(),
3253 cond: Cond::Ne,
3254 });
3255 }
3256
3257 let cvt = match (in_bits, out_bits, out_signed) {
3258 (32, 32, false) => FpuToIntOp::F32ToU32,
3259 (32, 32, true) => FpuToIntOp::F32ToI32,
3260 (32, 64, false) => FpuToIntOp::F32ToU64,
3261 (32, 64, true) => FpuToIntOp::F32ToI64,
3262 (64, 32, false) => FpuToIntOp::F64ToU32,
3263 (64, 32, true) => FpuToIntOp::F64ToI32,
3264 (64, 64, false) => FpuToIntOp::F64ToU64,
3265 (64, 64, true) => FpuToIntOp::F64ToI64,
3266 _ => unreachable!(),
3267 };
3268 ctx.emit(Inst::FpuToInt {
3269 op: cvt,
3270 rd,
3271 rn: rtmp2.to_reg(),
3272 });
3273 }
3274 }
3275
3276 Opcode::IaddIfcout => {
3277 // This is a two-output instruction that is needed for the
3278 // legalizer's explicit heap-check sequence, among possible other
3279 // uses. Its second output is a flags output only ever meant to
3280 // check for overflow using the
3281 // `backend.unsigned_add_overflow_condition()` condition.
3282 //
3283 // Note that the CLIF validation will ensure that no flag-setting
3284 // operation comes between this IaddIfcout and its use (e.g., a
3285 // Trapif). Thus, we can rely on implicit communication through the
3286 // processor flags rather than explicitly generating flags into a
3287 // register. We simply use the variant of the add instruction that
3288 // sets flags (`adds`) here.
3289
3290 // Note that the second output (the flags) need not be generated,
3291 // because flags are never materialized into a register; the only
3292 // instructions that can use a value of type `iflags` or `fflags`
3293 // will look directly for the flags-producing instruction (which can
3294 // always be found, by construction) and merge it.
3295
3296 // Now handle the iadd as above, except use an AddS opcode that sets
3297 // flags.
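// For example (illustrative): adding 1 to u64::MAX with `adds` wraps to 0
// and sets the carry flag, so a following `trapif` using the
// unsigned-overflow condition (HS) will trap.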
3298 let rd = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
3299 let rn = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None);
3300 let rm = put_input_in_rse_imm12(ctx, inputs[1], NarrowValueMode::None);
3301 let ty = ty.unwrap();
3302 let alu_op = choose_32_64(ty, ALUOp::AddS32, ALUOp::AddS64);
3303 ctx.emit(alu_inst_imm12(alu_op, rd, rn, rm));
3304 }
3305
3306 Opcode::IaddImm
3307 | Opcode::ImulImm
3308 | Opcode::UdivImm
3309 | Opcode::SdivImm
3310 | Opcode::UremImm
3311 | Opcode::SremImm
3312 | Opcode::IrsubImm
3313 | Opcode::IaddCin
3314 | Opcode::IaddIfcin
3315 | Opcode::IaddCout
3316 | Opcode::IaddCarry
3317 | Opcode::IaddIfcarry
3318 | Opcode::IsubBin
3319 | Opcode::IsubIfbin
3320 | Opcode::IsubBout
3321 | Opcode::IsubIfbout
3322 | Opcode::IsubBorrow
3323 | Opcode::IsubIfborrow
3324 | Opcode::BandImm
3325 | Opcode::BorImm
3326 | Opcode::BxorImm
3327 | Opcode::RotlImm
3328 | Opcode::RotrImm
3329 | Opcode::IshlImm
3330 | Opcode::UshrImm
3331 | Opcode::SshrImm
3332 | Opcode::IcmpImm
3333 | Opcode::IfcmpImm => {
3334 panic!("ALU+imm and ALU+carry ops should not appear here!");
3335 }
3336
3337 #[cfg(feature = "x86")]
3338 Opcode::X86Udivmodx
3339 | Opcode::X86Sdivmodx
3340 | Opcode::X86Umulx
3341 | Opcode::X86Smulx
3342 | Opcode::X86Cvtt2si
3343 | Opcode::X86Fmin
3344 | Opcode::X86Fmax
3345 | Opcode::X86Push
3346 | Opcode::X86Pop
3347 | Opcode::X86Bsr
3348 | Opcode::X86Bsf
3349 | Opcode::X86Pblendw
3350 | Opcode::X86Pshufd
3351 | Opcode::X86Pshufb
3352 | Opcode::X86Pextr
3353 | Opcode::X86Pinsr
3354 | Opcode::X86Insertps
3355 | Opcode::X86Movsd
3356 | Opcode::X86Movlhps
3357 | Opcode::X86Palignr
3358 | Opcode::X86Psll
3359 | Opcode::X86Psrl
3360 | Opcode::X86Psra
3361 | Opcode::X86Ptest
3362 | Opcode::X86Pmaxs
3363 | Opcode::X86Pmaxu
3364 | Opcode::X86Pmins
3365 | Opcode::X86Pminu
3366 | Opcode::X86Pmullq
3367 | Opcode::X86Pmuludq
3368 | Opcode::X86Punpckh
3369 | Opcode::X86Punpckl
3370 | Opcode::X86Vcvtudq2ps
3371 | Opcode::X86ElfTlsGetAddr
3372 | Opcode::X86MachoTlsGetAddr => {
3373 panic!("x86-specific opcode in supposedly arch-neutral IR!");
3374 }
3375
3376 Opcode::DummySargT => unreachable!(),
3377
3378 Opcode::Iabs => {
3379 let rd = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
3380 let rn = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None);
3381 let ty = ty.unwrap();
3382 ctx.emit(Inst::VecMisc {
3383 op: VecMisc2::Abs,
3384 rd,
3385 rn,
3386 size: VectorSize::from_ty(ty),
3387 });
3388 }
3389 Opcode::AvgRound => {
3390 let rd = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
3391 let rn = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None);
3392 let rm = put_input_in_reg(ctx, inputs[1], NarrowValueMode::None);
3393 let ty = ty.unwrap();
3394 ctx.emit(Inst::VecRRR {
3395 alu_op: VecALUOp::Urhadd,
3396 rd,
3397 rn,
3398 rm,
3399 size: VectorSize::from_ty(ty),
3400 });
3401 }
3402
3403 Opcode::Snarrow | Opcode::Unarrow | Opcode::Uunarrow => {
3404 let nonzero_high_half = maybe_input_insn(ctx, inputs[1], Opcode::Vconst)
3405 .map_or(true, |insn| {
3406 const_param_to_u128(ctx, insn).expect("Invalid immediate bytes") != 0
3407 });
3408 let op = match (op, ty.unwrap().lane_type()) {
3409 (Opcode::Snarrow, I8) => VecRRNarrowOp::Sqxtn16,
3410 (Opcode::Snarrow, I16) => VecRRNarrowOp::Sqxtn32,
3411 (Opcode::Snarrow, I32) => VecRRNarrowOp::Sqxtn64,
3412 (Opcode::Unarrow, I8) => VecRRNarrowOp::Sqxtun16,
3413 (Opcode::Unarrow, I16) => VecRRNarrowOp::Sqxtun32,
3414 (Opcode::Unarrow, I32) => VecRRNarrowOp::Sqxtun64,
3415 (Opcode::Uunarrow, I8) => VecRRNarrowOp::Uqxtn16,
3416 (Opcode::Uunarrow, I16) => VecRRNarrowOp::Uqxtn32,
3417 (Opcode::Uunarrow, I32) => VecRRNarrowOp::Uqxtn64,
3418 (_, lane_type) => {
3419 return Err(CodegenError::Unsupported(format!(
3420 "Unsupported SIMD vector lane type: {:?}",
3421 lane_type
3422 )))
3423 }
3424 };
3425 let rd = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
3426 let rn = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None);
3427
3428 ctx.emit(Inst::VecRRNarrow {
3429 op,
3430 rd,
3431 rn,
3432 high_half: false,
3433 });
3434
3435 if nonzero_high_half {
3436 let rn = put_input_in_reg(ctx, inputs[1], NarrowValueMode::None);
3437
3438 ctx.emit(Inst::VecRRNarrow {
3439 op,
3440 rd,
3441 rn,
3442 high_half: true,
3443 });
3444 }
3445 }
3446
3447 Opcode::SwidenLow | Opcode::SwidenHigh | Opcode::UwidenLow | Opcode::UwidenHigh => {
3448 let lane_type = ty.unwrap().lane_type();
3449 let rd = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
3450 let rn = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None);
3451 let (t, high_half) = match (lane_type, op) {
3452 (I16, Opcode::SwidenLow) => (VecExtendOp::Sxtl8, false),
3453 (I16, Opcode::SwidenHigh) => (VecExtendOp::Sxtl8, true),
3454 (I16, Opcode::UwidenLow) => (VecExtendOp::Uxtl8, false),
3455 (I16, Opcode::UwidenHigh) => (VecExtendOp::Uxtl8, true),
3456 (I32, Opcode::SwidenLow) => (VecExtendOp::Sxtl16, false),
3457 (I32, Opcode::SwidenHigh) => (VecExtendOp::Sxtl16, true),
3458 (I32, Opcode::UwidenLow) => (VecExtendOp::Uxtl16, false),
3459 (I32, Opcode::UwidenHigh) => (VecExtendOp::Uxtl16, true),
3460 (I64, Opcode::SwidenLow) => (VecExtendOp::Sxtl32, false),
3461 (I64, Opcode::SwidenHigh) => (VecExtendOp::Sxtl32, true),
3462 (I64, Opcode::UwidenLow) => (VecExtendOp::Uxtl32, false),
3463 (I64, Opcode::UwidenHigh) => (VecExtendOp::Uxtl32, true),
3464 _ => {
3465 return Err(CodegenError::Unsupported(format!(
3466 "Unsupported SIMD vector lane type: {:?}",
3467 lane_type
3468 )));
3469 }
3470 };
3471
3472 ctx.emit(Inst::VecExtend {
3473 t,
3474 rd,
3475 rn,
3476 high_half,
3477 });
3478 }
3479
3480 Opcode::TlsValue => match flags.tls_model() {
3481 TlsModel::ElfGd => {
3482 let dst = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
3483 let (name, _, _) = ctx.symbol_value(insn).unwrap();
3484 let symbol = name.clone();
3485 ctx.emit(Inst::ElfTlsGetAddr { symbol });
3486
3487 let x0 = xreg(0);
3488 ctx.emit(Inst::gen_move(dst, x0, I64));
3489 }
3490 _ => {
3491 todo!(
3492 "Unimplemented TLS model in AArch64 backend: {:?}",
3493 flags.tls_model()
3494 );
3495 }
3496 },
3497
3498 Opcode::SqmulRoundSat => {
3499 let ty = ty.unwrap();
3500
3501 if !ty.is_vector() || (ty.lane_type() != I16 && ty.lane_type() != I32) {
3502 return Err(CodegenError::Unsupported(format!(
3503 "Unsupported type: {:?}",
3504 ty
3505 )));
3506 }
3507
3508 let rd = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
3509 let rn = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None);
3510 let rm = put_input_in_reg(ctx, inputs[1], NarrowValueMode::None);
3511
3512 ctx.emit(Inst::VecRRR {
3513 alu_op: VecALUOp::Sqrdmulh,
3514 rd,
3515 rn,
3516 rm,
3517 size: VectorSize::from_ty(ty),
3518 });
3519 }
3520
3521 Opcode::FcvtLowFromSint => {
3522 let ty = ty.unwrap();
3523
3524 if ty != F64X2 {
3525 return Err(CodegenError::Unsupported(format!(
3526 "FcvtLowFromSint: Unsupported type: {:?}",
3527 ty
3528 )));
3529 }
3530
3531 let rd = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
3532 let rn = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None);
3533
3534 ctx.emit(Inst::VecExtend {
3535 t: VecExtendOp::Sxtl32,
3536 rd,
3537 rn,
3538 high_half: false,
3539 });
3540 ctx.emit(Inst::VecMisc {
3541 op: VecMisc2::Scvtf,
3542 rd,
3543 rn: rd.to_reg(),
3544 size: VectorSize::Size64x2,
3545 });
3546 }
3547
3548 Opcode::FvpromoteLow => {
3549 debug_assert_eq!(ty.unwrap(), F64X2);
3550
3551 let rd = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
3552 let rn = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None);
3553
3554 ctx.emit(Inst::VecRRLong {
3555 op: VecRRLongOp::Fcvtl32,
3556 rd,
3557 rn,
3558 high_half: false,
3559 });
3560 }
3561
3562 Opcode::Fvdemote => {
3563 debug_assert_eq!(ty.unwrap(), F32X4);
3564
3565 let rd = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
3566 let rn = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None);
3567
3568 ctx.emit(Inst::VecRRNarrow {
3569 op: VecRRNarrowOp::Fcvtn64,
3570 rd,
3571 rn,
3572 high_half: false,
3573 });
3574 }
3575
3576 Opcode::ConstAddr | Opcode::Vconcat | Opcode::Vsplit => {
3577 unimplemented!("lowering {}", op)
3578 }
3579 }
3580
3581 Ok(())
3582 }
3583
3584 pub(crate) fn lower_branch<C: LowerCtx<I = Inst>>(
3585 ctx: &mut C,
3586 branches: &[IRInst],
3587 targets: &[MachLabel],
3588 ) -> CodegenResult<()> {
3589 // A block should end with at most two branches. The first may be a
3590 // conditional branch; a conditional branch can be followed only by an
3591 // unconditional branch or fallthrough. Otherwise, if only one branch,
3592 // it may be an unconditional branch, a fallthrough, a return, or a
3593 // trap. These conditions are verified by `is_ebb_basic()` during the
3594 // verifier pass.
3595 assert!(branches.len() <= 2);
3596
3597 if branches.len() == 2 {
3598 // Must be a conditional branch followed by an unconditional branch.
3599 let op0 = ctx.data(branches[0]).opcode();
3600 let op1 = ctx.data(branches[1]).opcode();
3601
3602 assert!(op1 == Opcode::Jump || op1 == Opcode::Fallthrough);
3603 let taken = BranchTarget::Label(targets[0]);
3604 // not_taken target is the target of the second branch, even if it is a Fallthrough
3605 // instruction: because we reorder blocks while we lower, the fallthrough in the new
3606 // order is not (necessarily) the same as the fallthrough in CLIF. So we use the
3607 // explicitly-provided target.
3608 let not_taken = BranchTarget::Label(targets[1]);
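// For example (illustrative): `brnz v0, block2; jump block3` lowers with
// taken = label(block2) and not_taken = label(block3), regardless of
// which block physically follows in the final ordering.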
3609
3610 match op0 {
3611 Opcode::Brz | Opcode::Brnz => {
3612 let ty = ctx.input_ty(branches[0], 0);
3613 let flag_input = InsnInput {
3614 insn: branches[0],
3615 input: 0,
3616 };
3617 if let Some(icmp_insn) =
3618 maybe_input_insn_via_conv(ctx, flag_input, Opcode::Icmp, Opcode::Bint)
3619 {
3620 let condcode = ctx.data(icmp_insn).cond_code().unwrap();
3621 let cond =
3622 lower_icmp(ctx, icmp_insn, condcode, IcmpOutput::CondCode)?.unwrap_cond();
3623 let negated = op0 == Opcode::Brz;
3624 let cond = if negated { cond.invert() } else { cond };
3625
3626 ctx.emit(Inst::CondBr {
3627 taken,
3628 not_taken,
3629 kind: CondBrKind::Cond(cond),
3630 });
3631 } else if let Some(fcmp_insn) =
3632 maybe_input_insn_via_conv(ctx, flag_input, Opcode::Fcmp, Opcode::Bint)
3633 {
3634 let condcode = ctx.data(fcmp_insn).fp_cond_code().unwrap();
3635 let cond = lower_fp_condcode(condcode);
3636 let negated = op0 == Opcode::Brz;
3637 let cond = if negated { cond.invert() } else { cond };
3638
3639 lower_fcmp_or_ffcmp_to_flags(ctx, fcmp_insn);
3640 ctx.emit(Inst::CondBr {
3641 taken,
3642 not_taken,
3643 kind: CondBrKind::Cond(cond),
3644 });
3645 } else {
3646 let rt = if ty == I128 {
3647 let tmp = ctx.alloc_tmp(I64).only_reg().unwrap();
3648 let input = put_input_in_regs(ctx, flag_input);
                        ctx.emit(Inst::AluRRR {
                            alu_op: ALUOp::Orr64,
                            rd: tmp,
                            rn: input.regs()[0],
                            rm: input.regs()[1],
                        });
                        tmp.to_reg()
                    } else {
                        put_input_in_reg(ctx, flag_input, NarrowValueMode::ZeroExtend64)
                    };
                    let kind = match op0 {
                        Opcode::Brz => CondBrKind::Zero(rt),
                        Opcode::Brnz => CondBrKind::NotZero(rt),
                        _ => unreachable!(),
                    };
                    ctx.emit(Inst::CondBr {
                        taken,
                        not_taken,
                        kind,
                    });
                }
            }
            Opcode::BrIcmp => {
                let condcode = ctx.data(branches[0]).cond_code().unwrap();
                let cond =
                    lower_icmp(ctx, branches[0], condcode, IcmpOutput::CondCode)?.unwrap_cond();

                ctx.emit(Inst::CondBr {
                    taken,
                    not_taken,
                    kind: CondBrKind::Cond(cond),
                });
            }

            Opcode::Brif => {
                let condcode = ctx.data(branches[0]).cond_code().unwrap();

                let flag_input = InsnInput {
                    insn: branches[0],
                    input: 0,
                };
                if let Some(ifcmp_insn) = maybe_input_insn(ctx, flag_input, Opcode::Ifcmp) {
                    let cond =
                        lower_icmp(ctx, ifcmp_insn, condcode, IcmpOutput::CondCode)?.unwrap_cond();
                    ctx.emit(Inst::CondBr {
                        taken,
                        not_taken,
                        kind: CondBrKind::Cond(cond),
                    });
                } else {
                    // If the ifcmp result is actually placed in a
                    // register, we need to move it back into the flags.
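                    // (`MovToNZCV` performs an `msr nzcv, xN`, loading the
                    // flags from bits 31:28 of the register.)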
                    let rn = put_input_in_reg(ctx, flag_input, NarrowValueMode::None);
                    ctx.emit(Inst::MovToNZCV { rn });
                    ctx.emit(Inst::CondBr {
                        taken,
                        not_taken,
                        kind: CondBrKind::Cond(lower_condcode(condcode)),
                    });
                }
            }

            Opcode::Brff => {
                let condcode = ctx.data(branches[0]).fp_cond_code().unwrap();
                let cond = lower_fp_condcode(condcode);
                let kind = CondBrKind::Cond(cond);
                let flag_input = InsnInput {
                    insn: branches[0],
                    input: 0,
                };
                if let Some(ffcmp_insn) = maybe_input_insn(ctx, flag_input, Opcode::Ffcmp) {
                    lower_fcmp_or_ffcmp_to_flags(ctx, ffcmp_insn);
                    ctx.emit(Inst::CondBr {
                        taken,
                        not_taken,
                        kind,
                    });
                } else {
                    // If the ffcmp result is actually placed in a
                    // register, we need to move it back into the flags.
                    let rn = put_input_in_reg(ctx, flag_input, NarrowValueMode::None);
                    ctx.emit(Inst::MovToNZCV { rn });
                    ctx.emit(Inst::CondBr {
                        taken,
                        not_taken,
                        kind,
                    });
                }
            }

            _ => unimplemented!(),
        }
    } else {
        // Must be an unconditional branch or an indirect branch.
        let op = ctx.data(branches[0]).opcode();
        match op {
            Opcode::Jump | Opcode::Fallthrough => {
                assert!(branches.len() == 1);
                // In the Fallthrough case, the machine-independent driver
                // fills in `targets[0]` with our fallthrough block, so this
                // is valid for both Jump and Fallthrough.
                ctx.emit(Inst::Jump {
                    dest: BranchTarget::Label(targets[0]),
                });
            }

            Opcode::BrTable => {
                // Expand `br_table index, default, JT` to:
                //
                //   emit_island  // this forces an island at this point
                //                // if the jumptable would push us past
                //                // the deadline
                //   subs idx, #jt_size
                //   b.hs default
                //   adr vTmp1, PC+16
                //   ldr vTmp2, [vTmp1, idx, lsl #2]
                //   add vTmp2, vTmp2, vTmp1
                //   br vTmp2
                //   [jumptable offsets relative to JT base]
                let jt_size = targets.len() - 1;
                assert!(jt_size <= std::u32::MAX as usize);

                ctx.emit(Inst::EmitIsland {
                    needed_space: 4 * (6 + jt_size) as CodeOffset,
                });
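                // Each AArch64 instruction is four bytes, and the expansion
                // is six instructions plus one 32-bit entry per jump-table
                // target, hence `4 * (6 + jt_size)` bytes of needed space.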

                let ridx = put_input_in_reg(
                    ctx,
                    InsnInput {
                        insn: branches[0],
                        input: 0,
                    },
                    NarrowValueMode::ZeroExtend32,
                );

                let rtmp1 = ctx.alloc_tmp(I32).only_reg().unwrap();
                let rtmp2 = ctx.alloc_tmp(I32).only_reg().unwrap();
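                // `rtmp1` is also used below to hold the table size when it
                // does not fit in an `Imm12`; both temporaries are then
                // consumed by the `JTSequence`.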

                // Bounds-check, leaving condition codes for JTSequence's
                // branch to default target below.
                if let Some(imm12) = Imm12::maybe_from_u64(jt_size as u64) {
                    ctx.emit(Inst::AluRRImm12 {
                        alu_op: ALUOp::SubS32,
                        rd: writable_zero_reg(),
                        rn: ridx,
                        imm12,
                    });
                } else {
                    lower_constant_u64(ctx, rtmp1, jt_size as u64);
                    ctx.emit(Inst::AluRRR {
                        alu_op: ALUOp::SubS32,
                        rd: writable_zero_reg(),
                        rn: ridx,
                        rm: rtmp1.to_reg(),
                    });
                }
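                // In both cases the `subs` discards its result (the
                // destination is the zero register) and only sets the flags;
                // the `b.hs` inside the `JTSequence` below takes the default
                // target when the index is greater than or equal to
                // `jt_size` (unsigned).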

                // Emit the compound instruction that does:
                //
                //   b.hs default
                //   adr rA, jt
                //   ldrsw rB, [rA, rIndex, UXTW 2]
                //   add rA, rA, rB
                //   br rA
                //   [jt entries]
                //
                // This must be *one* instruction in the vcode because
                // we cannot allow regalloc to insert any spills/fills
                // in the middle of the sequence; otherwise, the ADR's
                // PC-rel offset to the jumptable would be incorrect.
                // (The alternative is to introduce a relocation pass
                // for inlined jumptables, which is much worse, IMHO.)

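                // `targets[0]` is the default target; the labels at indices
                // 1.. are the table entries, in order (hence `jt_size =
                // targets.len() - 1` above).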
                let jt_targets: Vec<BranchTarget> = targets
                    .iter()
                    .skip(1)
                    .map(|bix| BranchTarget::Label(*bix))
                    .collect();
                let default_target = BranchTarget::Label(targets[0]);
                let targets_for_term: Vec<MachLabel> = targets.to_vec();
                ctx.emit(Inst::JTSequence {
                    ridx,
                    rtmp1,
                    rtmp2,
                    info: Box::new(JTSequenceInfo {
                        targets: jt_targets,
                        default_target,
                        targets_for_term,
                    }),
                });
            }

            _ => panic!("Unknown branch type!"),
        }
    }

    Ok(())
}