1 use crate::cdsl::ast::{constant, var, ExprBuilder, Literal};
2 use crate::cdsl::instructions::{vector, Bindable, InstructionGroup};
3 use crate::cdsl::types::{LaneType, ValueType};
4 use crate::cdsl::xform::TransformGroupBuilder;
5 use crate::shared::types::Float::{F32, F64};
6 use crate::shared::types::Int::{I16, I32, I64, I8};
7 use crate::shared::Definitions as SharedDefinitions;
8
9 #[allow(clippy::many_single_char_names)]
define(shared: &mut SharedDefinitions, x86_instructions: &InstructionGroup)10 pub(crate) fn define(shared: &mut SharedDefinitions, x86_instructions: &InstructionGroup) {
11 let mut expand = TransformGroupBuilder::new(
12 "x86_expand",
13 r#"
14 Legalize instructions by expansion.
15
16 Use x86-specific instructions if needed."#,
17 )
18 .isa("x86")
19 .chain_with(shared.transform_groups.by_name("expand_flags").id);
20
21 let mut narrow = TransformGroupBuilder::new(
22 "x86_narrow",
23 r#"
24 Legalize instructions by narrowing.
25
26 Use x86-specific instructions if needed."#,
27 )
28 .isa("x86")
29 .chain_with(shared.transform_groups.by_name("narrow_flags").id);
30
31 let mut narrow_avx = TransformGroupBuilder::new(
32 "x86_narrow_avx",
33 r#"
34 Legalize instructions by narrowing with CPU feature checks.
35
36 This special case converts using x86 AVX instructions where available."#,
37 )
38 .isa("x86");
39 // We cannot chain with the x86_narrow group until this group is built, see bottom of this
40 // function for where this is chained.
41
42 let mut widen = TransformGroupBuilder::new(
43 "x86_widen",
44 r#"
45 Legalize instructions by widening.
46
47 Use x86-specific instructions if needed."#,
48 )
49 .isa("x86")
50 .chain_with(shared.transform_groups.by_name("widen").id);
51
52 // List of instructions.
53 let insts = &shared.instructions;
54 let band = insts.by_name("band");
55 let bor = insts.by_name("bor");
56 let clz = insts.by_name("clz");
57 let ctz = insts.by_name("ctz");
58 let fcmp = insts.by_name("fcmp");
59 let fcvt_from_uint = insts.by_name("fcvt_from_uint");
60 let fcvt_to_sint = insts.by_name("fcvt_to_sint");
61 let fcvt_to_uint = insts.by_name("fcvt_to_uint");
62 let fcvt_to_sint_sat = insts.by_name("fcvt_to_sint_sat");
63 let fcvt_to_uint_sat = insts.by_name("fcvt_to_uint_sat");
64 let fmax = insts.by_name("fmax");
65 let fmin = insts.by_name("fmin");
66 let iadd = insts.by_name("iadd");
67 let iconst = insts.by_name("iconst");
68 let imul = insts.by_name("imul");
69 let ineg = insts.by_name("ineg");
70 let isub = insts.by_name("isub");
71 let ishl = insts.by_name("ishl");
72 let ireduce = insts.by_name("ireduce");
73 let popcnt = insts.by_name("popcnt");
74 let sdiv = insts.by_name("sdiv");
75 let selectif = insts.by_name("selectif");
76 let smulhi = insts.by_name("smulhi");
77 let srem = insts.by_name("srem");
78 let tls_value = insts.by_name("tls_value");
79 let udiv = insts.by_name("udiv");
80 let umulhi = insts.by_name("umulhi");
81 let ushr = insts.by_name("ushr");
82 let ushr_imm = insts.by_name("ushr_imm");
83 let urem = insts.by_name("urem");
84
85 let x86_bsf = x86_instructions.by_name("x86_bsf");
86 let x86_bsr = x86_instructions.by_name("x86_bsr");
87 let x86_umulx = x86_instructions.by_name("x86_umulx");
88 let x86_smulx = x86_instructions.by_name("x86_smulx");
89
90 let imm = &shared.imm;
91
92 // Shift by a 64-bit amount is equivalent to a shift by that amount mod 32, so we can reduce
93 // the size of the shift amount. This is useful for x86_32, where an I64 shift amount is
94 // not encodable.
95 let a = var("a");
96 let x = var("x");
97 let y = var("y");
98 let z = var("z");
99
100 for &ty in &[I8, I16, I32] {
101 let ishl_by_i64 = ishl.bind(ty).bind(I64);
102 let ireduce = ireduce.bind(I32);
103 expand.legalize(
104 def!(a = ishl_by_i64(x, y)),
105 vec![def!(z = ireduce(y)), def!(a = ishl(x, z))],
106 );
107 }
108
109 for &ty in &[I8, I16, I32] {
110 let ushr_by_i64 = ushr.bind(ty).bind(I64);
111 let ireduce = ireduce.bind(I32);
112 expand.legalize(
113 def!(a = ushr_by_i64(x, y)),
114 vec![def!(z = ireduce(y)), def!(a = ishl(x, z))],
115 );
116 }
117
118 // Division and remainder.
119 //
120 // The srem expansion requires custom code because srem INT_MIN, -1 is not
121 // allowed to trap. The other ops need to check avoid_div_traps.
122 expand.custom_legalize(sdiv, "expand_sdivrem");
123 expand.custom_legalize(srem, "expand_sdivrem");
124 expand.custom_legalize(udiv, "expand_udivrem");
125 expand.custom_legalize(urem, "expand_udivrem");
126
127 // Double length (widening) multiplication.
128 let a = var("a");
129 let x = var("x");
130 let y = var("y");
131 let a1 = var("a1");
132 let a2 = var("a2");
133 let res_lo = var("res_lo");
134 let res_hi = var("res_hi");
135
136 expand.legalize(
137 def!(res_hi = umulhi(x, y)),
138 vec![def!((res_lo, res_hi) = x86_umulx(x, y))],
139 );
140
141 expand.legalize(
142 def!(res_hi = smulhi(x, y)),
143 vec![def!((res_lo, res_hi) = x86_smulx(x, y))],
144 );
145
146 // Floating point condition codes.
147 //
148 // The 8 condition codes in `supported_floatccs` are directly supported by a
149 // `ucomiss` or `ucomisd` instruction. The remaining codes need legalization
150 // patterns.
151
152 let floatcc_eq = Literal::enumerator_for(&imm.floatcc, "eq");
153 let floatcc_ord = Literal::enumerator_for(&imm.floatcc, "ord");
154 let floatcc_ueq = Literal::enumerator_for(&imm.floatcc, "ueq");
155 let floatcc_ne = Literal::enumerator_for(&imm.floatcc, "ne");
156 let floatcc_uno = Literal::enumerator_for(&imm.floatcc, "uno");
157 let floatcc_one = Literal::enumerator_for(&imm.floatcc, "one");
158
159 // Equality needs an explicit `ord` test which checks the parity bit.
160 expand.legalize(
161 def!(a = fcmp(floatcc_eq, x, y)),
162 vec![
163 def!(a1 = fcmp(floatcc_ord, x, y)),
164 def!(a2 = fcmp(floatcc_ueq, x, y)),
165 def!(a = band(a1, a2)),
166 ],
167 );
168 expand.legalize(
169 def!(a = fcmp(floatcc_ne, x, y)),
170 vec![
171 def!(a1 = fcmp(floatcc_uno, x, y)),
172 def!(a2 = fcmp(floatcc_one, x, y)),
173 def!(a = bor(a1, a2)),
174 ],
175 );
176
177 let floatcc_lt = &Literal::enumerator_for(&imm.floatcc, "lt");
178 let floatcc_gt = &Literal::enumerator_for(&imm.floatcc, "gt");
179 let floatcc_le = &Literal::enumerator_for(&imm.floatcc, "le");
180 let floatcc_ge = &Literal::enumerator_for(&imm.floatcc, "ge");
181 let floatcc_ugt = &Literal::enumerator_for(&imm.floatcc, "ugt");
182 let floatcc_ult = &Literal::enumerator_for(&imm.floatcc, "ult");
183 let floatcc_uge = &Literal::enumerator_for(&imm.floatcc, "uge");
184 let floatcc_ule = &Literal::enumerator_for(&imm.floatcc, "ule");
185
186 // Inequalities that need to be reversed.
187 for &(cc, rev_cc) in &[
188 (floatcc_lt, floatcc_gt),
189 (floatcc_le, floatcc_ge),
190 (floatcc_ugt, floatcc_ult),
191 (floatcc_uge, floatcc_ule),
192 ] {
193 expand.legalize(def!(a = fcmp(cc, x, y)), vec![def!(a = fcmp(rev_cc, y, x))]);
194 }
195
196 // We need to modify the CFG for min/max legalization.
197 expand.custom_legalize(fmin, "expand_minmax");
198 expand.custom_legalize(fmax, "expand_minmax");
199
200 // Conversions from unsigned need special handling.
201 expand.custom_legalize(fcvt_from_uint, "expand_fcvt_from_uint");
202 // Conversions from float to int can trap and modify the control flow graph.
203 expand.custom_legalize(fcvt_to_sint, "expand_fcvt_to_sint");
204 expand.custom_legalize(fcvt_to_uint, "expand_fcvt_to_uint");
205 expand.custom_legalize(fcvt_to_sint_sat, "expand_fcvt_to_sint_sat");
206 expand.custom_legalize(fcvt_to_uint_sat, "expand_fcvt_to_uint_sat");
207
208 // Count leading and trailing zeroes, for baseline x86_64
209 let c_minus_one = var("c_minus_one");
210 let c_thirty_one = var("c_thirty_one");
211 let c_thirty_two = var("c_thirty_two");
212 let c_sixty_three = var("c_sixty_three");
213 let c_sixty_four = var("c_sixty_four");
214 let index1 = var("index1");
215 let r2flags = var("r2flags");
216 let index2 = var("index2");
217
218 let intcc_eq = Literal::enumerator_for(&imm.intcc, "eq");
219 let imm64_minus_one = Literal::constant(&imm.imm64, -1);
220 let imm64_63 = Literal::constant(&imm.imm64, 63);
221 expand.legalize(
222 def!(a = clz.I64(x)),
223 vec![
224 def!(c_minus_one = iconst(imm64_minus_one)),
225 def!(c_sixty_three = iconst(imm64_63)),
226 def!((index1, r2flags) = x86_bsr(x)),
227 def!(index2 = selectif(intcc_eq, r2flags, c_minus_one, index1)),
228 def!(a = isub(c_sixty_three, index2)),
229 ],
230 );
231
232 let imm64_31 = Literal::constant(&imm.imm64, 31);
233 expand.legalize(
234 def!(a = clz.I32(x)),
235 vec![
236 def!(c_minus_one = iconst(imm64_minus_one)),
237 def!(c_thirty_one = iconst(imm64_31)),
238 def!((index1, r2flags) = x86_bsr(x)),
239 def!(index2 = selectif(intcc_eq, r2flags, c_minus_one, index1)),
240 def!(a = isub(c_thirty_one, index2)),
241 ],
242 );
243
244 let imm64_64 = Literal::constant(&imm.imm64, 64);
245 expand.legalize(
246 def!(a = ctz.I64(x)),
247 vec![
248 def!(c_sixty_four = iconst(imm64_64)),
249 def!((index1, r2flags) = x86_bsf(x)),
250 def!(a = selectif(intcc_eq, r2flags, c_sixty_four, index1)),
251 ],
252 );
253
254 let imm64_32 = Literal::constant(&imm.imm64, 32);
255 expand.legalize(
256 def!(a = ctz.I32(x)),
257 vec![
258 def!(c_thirty_two = iconst(imm64_32)),
259 def!((index1, r2flags) = x86_bsf(x)),
260 def!(a = selectif(intcc_eq, r2flags, c_thirty_two, index1)),
261 ],
262 );
263
264 // Population count for baseline x86_64
265 let x = var("x");
266 let r = var("r");
267
268 let qv3 = var("qv3");
269 let qv4 = var("qv4");
270 let qv5 = var("qv5");
271 let qv6 = var("qv6");
272 let qv7 = var("qv7");
273 let qv8 = var("qv8");
274 let qv9 = var("qv9");
275 let qv10 = var("qv10");
276 let qv11 = var("qv11");
277 let qv12 = var("qv12");
278 let qv13 = var("qv13");
279 let qv14 = var("qv14");
280 let qv15 = var("qv15");
281 let qc77 = var("qc77");
282 #[allow(non_snake_case)]
283 let qc0F = var("qc0F");
284 let qc01 = var("qc01");
285
286 let imm64_1 = Literal::constant(&imm.imm64, 1);
287 let imm64_4 = Literal::constant(&imm.imm64, 4);
288 expand.legalize(
289 def!(r = popcnt.I64(x)),
290 vec![
291 def!(qv3 = ushr_imm(x, imm64_1)),
292 def!(qc77 = iconst(Literal::constant(&imm.imm64, 0x7777_7777_7777_7777))),
293 def!(qv4 = band(qv3, qc77)),
294 def!(qv5 = isub(x, qv4)),
295 def!(qv6 = ushr_imm(qv4, imm64_1)),
296 def!(qv7 = band(qv6, qc77)),
297 def!(qv8 = isub(qv5, qv7)),
298 def!(qv9 = ushr_imm(qv7, imm64_1)),
299 def!(qv10 = band(qv9, qc77)),
300 def!(qv11 = isub(qv8, qv10)),
301 def!(qv12 = ushr_imm(qv11, imm64_4)),
302 def!(qv13 = iadd(qv11, qv12)),
303 def!(qc0F = iconst(Literal::constant(&imm.imm64, 0x0F0F_0F0F_0F0F_0F0F))),
304 def!(qv14 = band(qv13, qc0F)),
305 def!(qc01 = iconst(Literal::constant(&imm.imm64, 0x0101_0101_0101_0101))),
306 def!(qv15 = imul(qv14, qc01)),
307 def!(r = ushr_imm(qv15, Literal::constant(&imm.imm64, 56))),
308 ],
309 );
310
311 let lv3 = var("lv3");
312 let lv4 = var("lv4");
313 let lv5 = var("lv5");
314 let lv6 = var("lv6");
315 let lv7 = var("lv7");
316 let lv8 = var("lv8");
317 let lv9 = var("lv9");
318 let lv10 = var("lv10");
319 let lv11 = var("lv11");
320 let lv12 = var("lv12");
321 let lv13 = var("lv13");
322 let lv14 = var("lv14");
323 let lv15 = var("lv15");
324 let lc77 = var("lc77");
325 #[allow(non_snake_case)]
326 let lc0F = var("lc0F");
327 let lc01 = var("lc01");
328
329 expand.legalize(
330 def!(r = popcnt.I32(x)),
331 vec![
332 def!(lv3 = ushr_imm(x, imm64_1)),
333 def!(lc77 = iconst(Literal::constant(&imm.imm64, 0x7777_7777))),
334 def!(lv4 = band(lv3, lc77)),
335 def!(lv5 = isub(x, lv4)),
336 def!(lv6 = ushr_imm(lv4, imm64_1)),
337 def!(lv7 = band(lv6, lc77)),
338 def!(lv8 = isub(lv5, lv7)),
339 def!(lv9 = ushr_imm(lv7, imm64_1)),
340 def!(lv10 = band(lv9, lc77)),
341 def!(lv11 = isub(lv8, lv10)),
342 def!(lv12 = ushr_imm(lv11, imm64_4)),
343 def!(lv13 = iadd(lv11, lv12)),
344 def!(lc0F = iconst(Literal::constant(&imm.imm64, 0x0F0F_0F0F))),
345 def!(lv14 = band(lv13, lc0F)),
346 def!(lc01 = iconst(Literal::constant(&imm.imm64, 0x0101_0101))),
347 def!(lv15 = imul(lv14, lc01)),
348 def!(r = ushr_imm(lv15, Literal::constant(&imm.imm64, 24))),
349 ],
350 );
351
352 expand.custom_legalize(ineg, "convert_ineg");
353 expand.custom_legalize(tls_value, "expand_tls_value");
354 widen.custom_legalize(ineg, "convert_ineg");
355
356 // To reduce compilation times, separate out large blocks of legalizations by theme.
357 define_simd(shared, x86_instructions, &mut narrow, &mut narrow_avx);
358
359 expand.build_and_add_to(&mut shared.transform_groups);
360 let narrow_id = narrow.build_and_add_to(&mut shared.transform_groups);
361 narrow_avx
362 .chain_with(narrow_id)
363 .build_and_add_to(&mut shared.transform_groups);
364 widen.build_and_add_to(&mut shared.transform_groups);
365 }
366
/// Registers the x86 SIMD legalization patterns into the `x86_narrow` and
/// `x86_narrow_avx` groups being built by `define`.
///
/// Patterns here cover splats, swizzle, bitwise ops, shifts, selects,
/// vector-wide boolean reductions, integer/float comparisons, min/max,
/// fneg/fabs, and widening; anything requiring CFG changes or CPU feature
/// checks is deferred to the custom legalizations listed at the bottom.
fn define_simd(
    shared: &mut SharedDefinitions,
    x86_instructions: &InstructionGroup,
    narrow: &mut TransformGroupBuilder,
    narrow_avx: &mut TransformGroupBuilder,
) {
    // Shared (target-independent) instructions referenced by the patterns below.
    let insts = &shared.instructions;
    let band = insts.by_name("band");
    let band_not = insts.by_name("band_not");
    let bitcast = insts.by_name("bitcast");
    let bitselect = insts.by_name("bitselect");
    let bor = insts.by_name("bor");
    let bnot = insts.by_name("bnot");
    let bxor = insts.by_name("bxor");
    let extractlane = insts.by_name("extractlane");
    let fabs = insts.by_name("fabs");
    let fcmp = insts.by_name("fcmp");
    let fcvt_from_uint = insts.by_name("fcvt_from_uint");
    let fcvt_to_sint_sat = insts.by_name("fcvt_to_sint_sat");
    let fcvt_to_uint_sat = insts.by_name("fcvt_to_uint_sat");
    let fmax = insts.by_name("fmax");
    let fmin = insts.by_name("fmin");
    let fneg = insts.by_name("fneg");
    let iadd_imm = insts.by_name("iadd_imm");
    let icmp = insts.by_name("icmp");
    let imax = insts.by_name("imax");
    let imin = insts.by_name("imin");
    let imul = insts.by_name("imul");
    let ineg = insts.by_name("ineg");
    let insertlane = insts.by_name("insertlane");
    let ishl = insts.by_name("ishl");
    let ishl_imm = insts.by_name("ishl_imm");
    let raw_bitcast = insts.by_name("raw_bitcast");
    let scalar_to_vector = insts.by_name("scalar_to_vector");
    let splat = insts.by_name("splat");
    let shuffle = insts.by_name("shuffle");
    let sshr = insts.by_name("sshr");
    let swizzle = insts.by_name("swizzle");
    let trueif = insts.by_name("trueif");
    let uadd_sat = insts.by_name("uadd_sat");
    let umax = insts.by_name("umax");
    let umin = insts.by_name("umin");
    let snarrow = insts.by_name("snarrow");
    let swiden_high = insts.by_name("swiden_high");
    let swiden_low = insts.by_name("swiden_low");
    let ushr_imm = insts.by_name("ushr_imm");
    let ushr = insts.by_name("ushr");
    let uwiden_high = insts.by_name("uwiden_high");
    let uwiden_low = insts.by_name("uwiden_low");
    let vconst = insts.by_name("vconst");
    let vall_true = insts.by_name("vall_true");
    let vany_true = insts.by_name("vany_true");
    let vselect = insts.by_name("vselect");

    // x86-specific instructions used as legalization targets.
    let x86_palignr = x86_instructions.by_name("x86_palignr");
    let x86_pmaxs = x86_instructions.by_name("x86_pmaxs");
    let x86_pmaxu = x86_instructions.by_name("x86_pmaxu");
    let x86_pmins = x86_instructions.by_name("x86_pmins");
    let x86_pminu = x86_instructions.by_name("x86_pminu");
    let x86_pshufb = x86_instructions.by_name("x86_pshufb");
    let x86_pshufd = x86_instructions.by_name("x86_pshufd");
    let x86_psra = x86_instructions.by_name("x86_psra");
    let x86_ptest = x86_instructions.by_name("x86_ptest");
    let x86_punpckh = x86_instructions.by_name("x86_punpckh");
    let x86_punpckl = x86_instructions.by_name("x86_punpckl");

    let imm = &shared.imm;

    // Set up variables and immediates.
    let uimm8_zero = Literal::constant(&imm.uimm8, 0x00);
    let uimm8_one = Literal::constant(&imm.uimm8, 0x01);
    let uimm8_eight = Literal::constant(&imm.uimm8, 8);
    let u128_zeroes = constant(vec![0x00; 16]);
    let u128_ones = constant(vec![0xff; 16]);
    let u128_seventies = constant(vec![0x70; 16]);
    let a = var("a");
    let b = var("b");
    let c = var("c");
    let d = var("d");
    let e = var("e");
    let f = var("f");
    let g = var("g");
    let h = var("h");
    let x = var("x");
    let y = var("y");
    let z = var("z");

    // Limit the SIMD vector size: eventually multiple vector sizes may be supported
    // but for now only SSE-sized vectors are available.
    let sse_vector_size: u64 = 128;
    // Lane-type filter used below: 8..64-bit lanes (excludes 128-bit lanes).
    let allowed_simd_type = |t: &LaneType| t.lane_bits() >= 8 && t.lane_bits() < 128;

    // SIMD splat: 8-bits
    for ty in ValueType::all_lane_types().filter(|t| t.lane_bits() == 8) {
        let splat_any8x16 = splat.bind(vector(ty, sse_vector_size));
        narrow.legalize(
            def!(y = splat_any8x16(x)),
            vec![
                // Move into the lowest 8 bits of an XMM register.
                def!(a = scalar_to_vector(x)),
                // Zero out a different XMM register; the shuffle mask for moving the lowest byte
                // to all other byte lanes is 0x0.
                def!(b = vconst(u128_zeroes)),
                // PSHUFB takes two XMM operands, one of which is a shuffle mask (i.e. b).
                def!(y = x86_pshufb(a, b)),
            ],
        );
    }

    // SIMD splat: 16-bits
    for ty in ValueType::all_lane_types().filter(|t| t.lane_bits() == 16) {
        let splat_x16x8 = splat.bind(vector(ty, sse_vector_size));
        let raw_bitcast_any16x8_to_i32x4 = raw_bitcast
            .bind(vector(I32, sse_vector_size))
            .bind(vector(ty, sse_vector_size));
        let raw_bitcast_i32x4_to_any16x8 = raw_bitcast
            .bind(vector(ty, sse_vector_size))
            .bind(vector(I32, sse_vector_size));
        narrow.legalize(
            def!(y = splat_x16x8(x)),
            vec![
                // Move into the lowest 16 bits of an XMM register.
                def!(a = scalar_to_vector(x)),
                // Insert the value again but in the next lowest 16 bits.
                def!(b = insertlane(a, x, uimm8_one)),
                // No instruction emitted; pretend this is an I32x4 so we can use PSHUFD.
                def!(c = raw_bitcast_any16x8_to_i32x4(b)),
                // Broadcast the bytes in the XMM register with PSHUFD.
                def!(d = x86_pshufd(c, uimm8_zero)),
                // No instruction emitted; pretend this is an X16x8 again.
                def!(y = raw_bitcast_i32x4_to_any16x8(d)),
            ],
        );
    }

    // SIMD splat: 32-bits
    for ty in ValueType::all_lane_types().filter(|t| t.lane_bits() == 32) {
        let splat_any32x4 = splat.bind(vector(ty, sse_vector_size));
        narrow.legalize(
            def!(y = splat_any32x4(x)),
            vec![
                // Translate to an x86 MOV to get the value in an XMM register.
                def!(a = scalar_to_vector(x)),
                // Broadcast the bytes in the XMM register with PSHUFD.
                def!(y = x86_pshufd(a, uimm8_zero)),
            ],
        );
    }

    // SIMD splat: 64-bits
    for ty in ValueType::all_lane_types().filter(|t| t.lane_bits() == 64) {
        let splat_any64x2 = splat.bind(vector(ty, sse_vector_size));
        narrow.legalize(
            def!(y = splat_any64x2(x)),
            vec![
                // Move into the lowest 64 bits of an XMM register.
                def!(a = scalar_to_vector(x)),
                // Move into the highest 64 bits of the same XMM register.
                def!(y = insertlane(a, x, uimm8_one)),
            ],
        );
    }

    // SIMD swizzle; the following inefficient implementation is due to the Wasm SIMD spec requiring
    // mask indexes greater than 15 to have the same semantics as a 0 index. For the spec discussion,
    // see https://github.com/WebAssembly/simd/issues/93.
    {
        let swizzle = swizzle.bind(vector(I8, sse_vector_size));
        narrow.legalize(
            def!(a = swizzle(x, y)),
            vec![
                // Saturating-add 0x70 to each mask byte: indexes > 15 overflow to >= 0x80,
                // which PSHUFB treats as "write zero to this lane".
                def!(b = vconst(u128_seventies)),
                def!(c = uadd_sat(y, b)),
                def!(a = x86_pshufb(x, c)),
            ],
        );
    }

    // SIMD bnot: XOR against all-ones.
    for ty in ValueType::all_lane_types().filter(allowed_simd_type) {
        let bnot = bnot.bind(vector(ty, sse_vector_size));
        narrow.legalize(
            def!(y = bnot(x)),
            vec![def!(a = vconst(u128_ones)), def!(y = bxor(a, x))],
        );
    }

    // SIMD shift right (arithmetic, i16x8 and i32x4)
    for ty in &[I16, I32] {
        let sshr = sshr.bind(vector(*ty, sse_vector_size));
        let bitcast_i64x2 = bitcast.bind(vector(I64, sse_vector_size));
        narrow.legalize(
            def!(a = sshr(x, y)),
            vec![def!(b = bitcast_i64x2(y)), def!(a = x86_psra(x, b))],
        );
    }
    // SIMD shift right (arithmetic, i8x16): there is no PSRA variant for 8-bit lanes,
    // so widen each half to 16-bit lanes, shift, and re-pack.
    {
        let sshr = sshr.bind(vector(I8, sse_vector_size));
        let bitcast_i64x2 = bitcast.bind(vector(I64, sse_vector_size));
        let raw_bitcast_i16x8 = raw_bitcast.bind(vector(I16, sse_vector_size));
        let raw_bitcast_i16x8_again = raw_bitcast.bind(vector(I16, sse_vector_size));
        narrow.legalize(
            def!(z = sshr(x, y)),
            vec![
                // Since we will use the high byte of each 16x8 lane, shift an extra 8 bits.
                def!(a = iadd_imm(y, uimm8_eight)),
                def!(b = bitcast_i64x2(a)),
                // Take the low 8 bytes of x, duplicate them in 16x8 lanes, then shift right.
                def!(c = x86_punpckl(x, x)),
                def!(d = raw_bitcast_i16x8(c)),
                def!(e = x86_psra(d, b)),
                // Take the high 8 bytes of x, duplicate them in 16x8 lanes, then shift right.
                def!(f = x86_punpckh(x, x)),
                def!(g = raw_bitcast_i16x8_again(f)),
                def!(h = x86_psra(g, b)),
                // Re-pack the vector.
                def!(z = snarrow(e, h)),
            ],
        );
    }
    // SIMD shift right (arithmetic, i64x2): no PSRA variant for 64-bit lanes either;
    // fall back to scalar shifts on each extracted lane.
    {
        let sshr_vector = sshr.bind(vector(I64, sse_vector_size));
        let sshr_scalar_lane0 = sshr.bind(I64);
        let sshr_scalar_lane1 = sshr.bind(I64);
        narrow.legalize(
            def!(z = sshr_vector(x, y)),
            vec![
                // Use scalar operations to shift the first lane.
                def!(a = extractlane(x, uimm8_zero)),
                def!(b = sshr_scalar_lane0(a, y)),
                def!(c = insertlane(x, b, uimm8_zero)),
                // Do the same for the second lane.
                def!(d = extractlane(x, uimm8_one)),
                def!(e = sshr_scalar_lane1(d, y)),
                def!(z = insertlane(c, e, uimm8_one)),
            ],
        );
    }

    // SIMD select: (x & c) | (y & !c).
    for ty in ValueType::all_lane_types().filter(allowed_simd_type) {
        let bitselect = bitselect.bind(vector(ty, sse_vector_size)); // must bind both x/y and c
        narrow.legalize(
            def!(d = bitselect(c, x, y)),
            vec![
                def!(a = band(x, c)),
                def!(b = band_not(y, c)),
                def!(d = bor(a, b)),
            ],
        );
    }

    // SIMD vselect; replace with bitselect if BLEND* instructions are not available.
    // This works, because each lane of boolean vector is filled with zeroes or ones.
    for ty in ValueType::all_lane_types().filter(allowed_simd_type) {
        let vselect = vselect.bind(vector(ty, sse_vector_size));
        let raw_bitcast = raw_bitcast.bind(vector(ty, sse_vector_size));
        narrow.legalize(
            def!(d = vselect(c, x, y)),
            vec![def!(a = raw_bitcast(c)), def!(d = bitselect(a, x, y))],
        );
    }

    // SIMD vany_true: PTEST sets ZF iff x & x is all zeroes, so "ne" means some bit is set.
    let ne = Literal::enumerator_for(&imm.intcc, "ne");
    for ty in ValueType::all_lane_types().filter(allowed_simd_type) {
        let vany_true = vany_true.bind(vector(ty, sse_vector_size));
        narrow.legalize(
            def!(y = vany_true(x)),
            vec![def!(a = x86_ptest(x, x)), def!(y = trueif(ne, a))],
        );
    }

    // SIMD vall_true: compare every lane against zero, then check that no lane compared equal.
    let eq = Literal::enumerator_for(&imm.intcc, "eq");
    for ty in ValueType::all_lane_types().filter(allowed_simd_type) {
        let vall_true = vall_true.bind(vector(ty, sse_vector_size));
        if ty.is_int() {
            // In the common case (Wasm's integer-only all_true), we do not require a
            // bitcast.
            narrow.legalize(
                def!(y = vall_true(x)),
                vec![
                    def!(a = vconst(u128_zeroes)),
                    def!(c = icmp(eq, x, a)),
                    def!(d = x86_ptest(c, c)),
                    def!(y = trueif(eq, d)),
                ],
            );
        } else {
            // However, to support other types we must bitcast them to an integer vector to
            // use icmp.
            let lane_type_as_int = LaneType::int_from_bits(ty.lane_bits() as u16);
            let raw_bitcast_to_int = raw_bitcast.bind(vector(lane_type_as_int, sse_vector_size));
            narrow.legalize(
                def!(y = vall_true(x)),
                vec![
                    def!(a = vconst(u128_zeroes)),
                    def!(b = raw_bitcast_to_int(x)),
                    def!(c = icmp(eq, b, a)),
                    def!(d = x86_ptest(c, c)),
                    def!(y = trueif(eq, d)),
                ],
            );
        }
    }

    // SIMD icmp ne: no PCMPNE* exists, so compute eq and invert.
    let ne = Literal::enumerator_for(&imm.intcc, "ne");
    for ty in ValueType::all_lane_types().filter(|ty| allowed_simd_type(ty) && ty.is_int()) {
        let icmp_ = icmp.bind(vector(ty, sse_vector_size));
        narrow.legalize(
            def!(c = icmp_(ne, a, b)),
            vec![def!(x = icmp(eq, a, b)), def!(c = bnot(x))],
        );
    }

    // SIMD icmp greater-/less-than: only sgt has direct hardware support (PCMPGT*);
    // the other orderings are built from min/max + eq, or by swapping operands.
    let sgt = Literal::enumerator_for(&imm.intcc, "sgt");
    let ugt = Literal::enumerator_for(&imm.intcc, "ugt");
    let sge = Literal::enumerator_for(&imm.intcc, "sge");
    let uge = Literal::enumerator_for(&imm.intcc, "uge");
    let slt = Literal::enumerator_for(&imm.intcc, "slt");
    let ult = Literal::enumerator_for(&imm.intcc, "ult");
    let sle = Literal::enumerator_for(&imm.intcc, "sle");
    let ule = Literal::enumerator_for(&imm.intcc, "ule");
    for ty in &[I8, I16, I32] {
        // greater-than
        let icmp_ = icmp.bind(vector(*ty, sse_vector_size));
        narrow.legalize(
            def!(c = icmp_(ugt, a, b)),
            vec![
                // a >u b  iff  max_u(a, b) != b.
                def!(x = x86_pmaxu(a, b)),
                def!(y = icmp(eq, x, b)),
                def!(c = bnot(y)),
            ],
        );
        let icmp_ = icmp.bind(vector(*ty, sse_vector_size));
        narrow.legalize(
            // a >=s b  iff  min_s(a, b) == b.
            def!(c = icmp_(sge, a, b)),
            vec![def!(x = x86_pmins(a, b)), def!(c = icmp(eq, x, b))],
        );
        let icmp_ = icmp.bind(vector(*ty, sse_vector_size));
        narrow.legalize(
            // a >=u b  iff  min_u(a, b) == b.
            def!(c = icmp_(uge, a, b)),
            vec![def!(x = x86_pminu(a, b)), def!(c = icmp(eq, x, b))],
        );

        // less-than: swap the operands and use the mirrored greater-than condition.
        let icmp_ = icmp.bind(vector(*ty, sse_vector_size));
        narrow.legalize(def!(c = icmp_(slt, a, b)), vec![def!(c = icmp(sgt, b, a))]);
        let icmp_ = icmp.bind(vector(*ty, sse_vector_size));
        narrow.legalize(def!(c = icmp_(ult, a, b)), vec![def!(c = icmp(ugt, b, a))]);
        let icmp_ = icmp.bind(vector(*ty, sse_vector_size));
        narrow.legalize(def!(c = icmp_(sle, a, b)), vec![def!(c = icmp(sge, b, a))]);
        let icmp_ = icmp.bind(vector(*ty, sse_vector_size));
        narrow.legalize(def!(c = icmp_(ule, a, b)), vec![def!(c = icmp(uge, b, a))]);
    }

    // SIMD integer min/max: map directly onto PMINS/PMINU/PMAXS/PMAXU.
    for ty in &[I8, I16, I32] {
        let imin = imin.bind(vector(*ty, sse_vector_size));
        narrow.legalize(def!(c = imin(a, b)), vec![def!(c = x86_pmins(a, b))]);
        let umin = umin.bind(vector(*ty, sse_vector_size));
        narrow.legalize(def!(c = umin(a, b)), vec![def!(c = x86_pminu(a, b))]);
        let imax = imax.bind(vector(*ty, sse_vector_size));
        narrow.legalize(def!(c = imax(a, b)), vec![def!(c = x86_pmaxs(a, b))]);
        let umax = umax.bind(vector(*ty, sse_vector_size));
        narrow.legalize(def!(c = umax(a, b)), vec![def!(c = x86_pmaxu(a, b))]);
    }

    // SIMD fcmp greater-/less-than: swap the operands and use the mirrored condition.
    let gt = Literal::enumerator_for(&imm.floatcc, "gt");
    let lt = Literal::enumerator_for(&imm.floatcc, "lt");
    let ge = Literal::enumerator_for(&imm.floatcc, "ge");
    let le = Literal::enumerator_for(&imm.floatcc, "le");
    let ugt = Literal::enumerator_for(&imm.floatcc, "ugt");
    let ult = Literal::enumerator_for(&imm.floatcc, "ult");
    let uge = Literal::enumerator_for(&imm.floatcc, "uge");
    let ule = Literal::enumerator_for(&imm.floatcc, "ule");
    for ty in &[F32, F64] {
        let fcmp_ = fcmp.bind(vector(*ty, sse_vector_size));
        narrow.legalize(def!(c = fcmp_(gt, a, b)), vec![def!(c = fcmp(lt, b, a))]);
        let fcmp_ = fcmp.bind(vector(*ty, sse_vector_size));
        narrow.legalize(def!(c = fcmp_(ge, a, b)), vec![def!(c = fcmp(le, b, a))]);
        let fcmp_ = fcmp.bind(vector(*ty, sse_vector_size));
        narrow.legalize(def!(c = fcmp_(ult, a, b)), vec![def!(c = fcmp(ugt, b, a))]);
        let fcmp_ = fcmp.bind(vector(*ty, sse_vector_size));
        narrow.legalize(def!(c = fcmp_(ule, a, b)), vec![def!(c = fcmp(uge, b, a))]);
    }

    // SIMD fneg: XOR against a sign-bit-only mask.
    for ty in &[F32, F64] {
        let fneg = fneg.bind(vector(*ty, sse_vector_size));
        let lane_type_as_int = LaneType::int_from_bits(LaneType::from(*ty).lane_bits() as u16);
        let uimm8_shift = Literal::constant(&imm.uimm8, lane_type_as_int.lane_bits() as i64 - 1);
        let vconst = vconst.bind(vector(lane_type_as_int, sse_vector_size));
        let bitcast_to_float = raw_bitcast.bind(vector(*ty, sse_vector_size));
        narrow.legalize(
            def!(b = fneg(a)),
            vec![
                def!(c = vconst(u128_ones)),
                def!(d = ishl_imm(c, uimm8_shift)), // Create a mask of all 0s except the MSB.
                def!(e = bitcast_to_float(d)), // Cast mask to the floating-point type.
                def!(b = bxor(a, e)), // Flip the MSB.
            ],
        );
    }

    // SIMD fabs: AND against an everything-but-the-sign-bit mask.
    for ty in &[F32, F64] {
        let fabs = fabs.bind(vector(*ty, sse_vector_size));
        let lane_type_as_int = LaneType::int_from_bits(LaneType::from(*ty).lane_bits() as u16);
        let vconst = vconst.bind(vector(lane_type_as_int, sse_vector_size));
        let bitcast_to_float = raw_bitcast.bind(vector(*ty, sse_vector_size));
        narrow.legalize(
            def!(b = fabs(a)),
            vec![
                def!(c = vconst(u128_ones)),
                def!(d = ushr_imm(c, uimm8_one)), // Create a mask of all 1s except the MSB.
                def!(e = bitcast_to_float(d)), // Cast mask to the floating-point type.
                def!(b = band(a, e)), // Unset the MSB.
            ],
        );
    }

    // SIMD widen (high half): rotate the high 8 bytes into the low half with PALIGNR,
    // then defer to the corresponding *_low widening.
    for ty in &[I8, I16] {
        let swiden_high = swiden_high.bind(vector(*ty, sse_vector_size));
        narrow.legalize(
            def!(b = swiden_high(a)),
            vec![
                def!(c = x86_palignr(a, a, uimm8_eight)),
                def!(b = swiden_low(c)),
            ],
        );
        let uwiden_high = uwiden_high.bind(vector(*ty, sse_vector_size));
        narrow.legalize(
            def!(b = uwiden_high(a)),
            vec![
                def!(c = x86_palignr(a, a, uimm8_eight)),
                def!(b = uwiden_low(c)),
            ],
        );
    }

    // Patterns that need Rust code (CFG edits, immediate inspection, etc.).
    narrow.custom_legalize(shuffle, "convert_shuffle");
    narrow.custom_legalize(extractlane, "convert_extractlane");
    narrow.custom_legalize(insertlane, "convert_insertlane");
    narrow.custom_legalize(ineg, "convert_ineg");
    narrow.custom_legalize(ushr, "convert_ushr");
    narrow.custom_legalize(ishl, "convert_ishl");
    narrow.custom_legalize(fcvt_to_sint_sat, "expand_fcvt_to_sint_sat_vector");
    narrow.custom_legalize(fmin, "expand_minmax_vector");
    narrow.custom_legalize(fmax, "expand_minmax_vector");

    // These conversions are checked for AVX availability at legalization time.
    narrow_avx.custom_legalize(imul, "convert_i64x2_imul");
    narrow_avx.custom_legalize(fcvt_from_uint, "expand_fcvt_from_uint_vector");
    narrow_avx.custom_legalize(fcvt_to_uint_sat, "expand_fcvt_to_uint_sat_vector");
}
828