1 use crate::cdsl::ast::{constant, var, ExprBuilder, Literal};
2 use crate::cdsl::instructions::{vector, Bindable, InstructionGroup};
3 use crate::cdsl::types::{LaneType, ValueType};
4 use crate::cdsl::xform::TransformGroupBuilder;
5 use crate::shared::types::Float::{F32, F64};
6 use crate::shared::types::Int::{I16, I32, I64, I8};
7 use crate::shared::Definitions as SharedDefinitions;
8 
9 #[allow(clippy::many_single_char_names)]
define(shared: &mut SharedDefinitions, x86_instructions: &InstructionGroup)10 pub(crate) fn define(shared: &mut SharedDefinitions, x86_instructions: &InstructionGroup) {
11     let mut expand = TransformGroupBuilder::new(
12         "x86_expand",
13         r#"
14     Legalize instructions by expansion.
15 
16     Use x86-specific instructions if needed."#,
17     )
18     .isa("x86")
19     .chain_with(shared.transform_groups.by_name("expand_flags").id);
20 
21     let mut narrow = TransformGroupBuilder::new(
22         "x86_narrow",
23         r#"
24     Legalize instructions by narrowing.
25 
26     Use x86-specific instructions if needed."#,
27     )
28     .isa("x86")
29     .chain_with(shared.transform_groups.by_name("narrow_flags").id);
30 
31     let mut narrow_avx = TransformGroupBuilder::new(
32         "x86_narrow_avx",
33         r#"
34     Legalize instructions by narrowing with CPU feature checks.
35 
36     This special case converts using x86 AVX instructions where available."#,
37     )
38     .isa("x86");
39     // We cannot chain with the x86_narrow group until this group is built, see bottom of this
40     // function for where this is chained.
41 
42     let mut widen = TransformGroupBuilder::new(
43         "x86_widen",
44         r#"
45     Legalize instructions by widening.
46 
47     Use x86-specific instructions if needed."#,
48     )
49     .isa("x86")
50     .chain_with(shared.transform_groups.by_name("widen").id);
51 
52     // List of instructions.
53     let insts = &shared.instructions;
54     let band = insts.by_name("band");
55     let bor = insts.by_name("bor");
56     let clz = insts.by_name("clz");
57     let ctz = insts.by_name("ctz");
58     let fcmp = insts.by_name("fcmp");
59     let fcvt_from_uint = insts.by_name("fcvt_from_uint");
60     let fcvt_to_sint = insts.by_name("fcvt_to_sint");
61     let fcvt_to_uint = insts.by_name("fcvt_to_uint");
62     let fcvt_to_sint_sat = insts.by_name("fcvt_to_sint_sat");
63     let fcvt_to_uint_sat = insts.by_name("fcvt_to_uint_sat");
64     let fmax = insts.by_name("fmax");
65     let fmin = insts.by_name("fmin");
66     let iadd = insts.by_name("iadd");
67     let iconst = insts.by_name("iconst");
68     let imul = insts.by_name("imul");
69     let ineg = insts.by_name("ineg");
70     let isub = insts.by_name("isub");
71     let ishl = insts.by_name("ishl");
72     let ireduce = insts.by_name("ireduce");
73     let popcnt = insts.by_name("popcnt");
74     let sdiv = insts.by_name("sdiv");
75     let selectif = insts.by_name("selectif");
76     let smulhi = insts.by_name("smulhi");
77     let srem = insts.by_name("srem");
78     let tls_value = insts.by_name("tls_value");
79     let udiv = insts.by_name("udiv");
80     let umulhi = insts.by_name("umulhi");
81     let ushr = insts.by_name("ushr");
82     let ushr_imm = insts.by_name("ushr_imm");
83     let urem = insts.by_name("urem");
84 
85     let x86_bsf = x86_instructions.by_name("x86_bsf");
86     let x86_bsr = x86_instructions.by_name("x86_bsr");
87     let x86_umulx = x86_instructions.by_name("x86_umulx");
88     let x86_smulx = x86_instructions.by_name("x86_smulx");
89 
90     let imm = &shared.imm;
91 
92     // Shift by a 64-bit amount is equivalent to a shift by that amount mod 32, so we can reduce
93     // the size of the shift amount. This is useful for x86_32, where an I64 shift amount is
94     // not encodable.
95     let a = var("a");
96     let x = var("x");
97     let y = var("y");
98     let z = var("z");
99 
100     for &ty in &[I8, I16, I32] {
101         let ishl_by_i64 = ishl.bind(ty).bind(I64);
102         let ireduce = ireduce.bind(I32);
103         expand.legalize(
104             def!(a = ishl_by_i64(x, y)),
105             vec![def!(z = ireduce(y)), def!(a = ishl(x, z))],
106         );
107     }
108 
109     for &ty in &[I8, I16, I32] {
110         let ushr_by_i64 = ushr.bind(ty).bind(I64);
111         let ireduce = ireduce.bind(I32);
112         expand.legalize(
113             def!(a = ushr_by_i64(x, y)),
114             vec![def!(z = ireduce(y)), def!(a = ishl(x, z))],
115         );
116     }
117 
118     // Division and remainder.
119     //
120     // The srem expansion requires custom code because srem INT_MIN, -1 is not
121     // allowed to trap. The other ops need to check avoid_div_traps.
122     expand.custom_legalize(sdiv, "expand_sdivrem");
123     expand.custom_legalize(srem, "expand_sdivrem");
124     expand.custom_legalize(udiv, "expand_udivrem");
125     expand.custom_legalize(urem, "expand_udivrem");
126 
127     // Double length (widening) multiplication.
128     let a = var("a");
129     let x = var("x");
130     let y = var("y");
131     let a1 = var("a1");
132     let a2 = var("a2");
133     let res_lo = var("res_lo");
134     let res_hi = var("res_hi");
135 
136     expand.legalize(
137         def!(res_hi = umulhi(x, y)),
138         vec![def!((res_lo, res_hi) = x86_umulx(x, y))],
139     );
140 
141     expand.legalize(
142         def!(res_hi = smulhi(x, y)),
143         vec![def!((res_lo, res_hi) = x86_smulx(x, y))],
144     );
145 
146     // Floating point condition codes.
147     //
148     // The 8 condition codes in `supported_floatccs` are directly supported by a
149     // `ucomiss` or `ucomisd` instruction. The remaining codes need legalization
150     // patterns.
151 
152     let floatcc_eq = Literal::enumerator_for(&imm.floatcc, "eq");
153     let floatcc_ord = Literal::enumerator_for(&imm.floatcc, "ord");
154     let floatcc_ueq = Literal::enumerator_for(&imm.floatcc, "ueq");
155     let floatcc_ne = Literal::enumerator_for(&imm.floatcc, "ne");
156     let floatcc_uno = Literal::enumerator_for(&imm.floatcc, "uno");
157     let floatcc_one = Literal::enumerator_for(&imm.floatcc, "one");
158 
159     // Equality needs an explicit `ord` test which checks the parity bit.
160     expand.legalize(
161         def!(a = fcmp(floatcc_eq, x, y)),
162         vec![
163             def!(a1 = fcmp(floatcc_ord, x, y)),
164             def!(a2 = fcmp(floatcc_ueq, x, y)),
165             def!(a = band(a1, a2)),
166         ],
167     );
168     expand.legalize(
169         def!(a = fcmp(floatcc_ne, x, y)),
170         vec![
171             def!(a1 = fcmp(floatcc_uno, x, y)),
172             def!(a2 = fcmp(floatcc_one, x, y)),
173             def!(a = bor(a1, a2)),
174         ],
175     );
176 
177     let floatcc_lt = &Literal::enumerator_for(&imm.floatcc, "lt");
178     let floatcc_gt = &Literal::enumerator_for(&imm.floatcc, "gt");
179     let floatcc_le = &Literal::enumerator_for(&imm.floatcc, "le");
180     let floatcc_ge = &Literal::enumerator_for(&imm.floatcc, "ge");
181     let floatcc_ugt = &Literal::enumerator_for(&imm.floatcc, "ugt");
182     let floatcc_ult = &Literal::enumerator_for(&imm.floatcc, "ult");
183     let floatcc_uge = &Literal::enumerator_for(&imm.floatcc, "uge");
184     let floatcc_ule = &Literal::enumerator_for(&imm.floatcc, "ule");
185 
186     // Inequalities that need to be reversed.
187     for &(cc, rev_cc) in &[
188         (floatcc_lt, floatcc_gt),
189         (floatcc_le, floatcc_ge),
190         (floatcc_ugt, floatcc_ult),
191         (floatcc_uge, floatcc_ule),
192     ] {
193         expand.legalize(def!(a = fcmp(cc, x, y)), vec![def!(a = fcmp(rev_cc, y, x))]);
194     }
195 
196     // We need to modify the CFG for min/max legalization.
197     expand.custom_legalize(fmin, "expand_minmax");
198     expand.custom_legalize(fmax, "expand_minmax");
199 
200     // Conversions from unsigned need special handling.
201     expand.custom_legalize(fcvt_from_uint, "expand_fcvt_from_uint");
202     // Conversions from float to int can trap and modify the control flow graph.
203     expand.custom_legalize(fcvt_to_sint, "expand_fcvt_to_sint");
204     expand.custom_legalize(fcvt_to_uint, "expand_fcvt_to_uint");
205     expand.custom_legalize(fcvt_to_sint_sat, "expand_fcvt_to_sint_sat");
206     expand.custom_legalize(fcvt_to_uint_sat, "expand_fcvt_to_uint_sat");
207 
208     // Count leading and trailing zeroes, for baseline x86_64
209     let c_minus_one = var("c_minus_one");
210     let c_thirty_one = var("c_thirty_one");
211     let c_thirty_two = var("c_thirty_two");
212     let c_sixty_three = var("c_sixty_three");
213     let c_sixty_four = var("c_sixty_four");
214     let index1 = var("index1");
215     let r2flags = var("r2flags");
216     let index2 = var("index2");
217 
218     let intcc_eq = Literal::enumerator_for(&imm.intcc, "eq");
219     let imm64_minus_one = Literal::constant(&imm.imm64, -1);
220     let imm64_63 = Literal::constant(&imm.imm64, 63);
221     expand.legalize(
222         def!(a = clz.I64(x)),
223         vec![
224             def!(c_minus_one = iconst(imm64_minus_one)),
225             def!(c_sixty_three = iconst(imm64_63)),
226             def!((index1, r2flags) = x86_bsr(x)),
227             def!(index2 = selectif(intcc_eq, r2flags, c_minus_one, index1)),
228             def!(a = isub(c_sixty_three, index2)),
229         ],
230     );
231 
232     let imm64_31 = Literal::constant(&imm.imm64, 31);
233     expand.legalize(
234         def!(a = clz.I32(x)),
235         vec![
236             def!(c_minus_one = iconst(imm64_minus_one)),
237             def!(c_thirty_one = iconst(imm64_31)),
238             def!((index1, r2flags) = x86_bsr(x)),
239             def!(index2 = selectif(intcc_eq, r2flags, c_minus_one, index1)),
240             def!(a = isub(c_thirty_one, index2)),
241         ],
242     );
243 
244     let imm64_64 = Literal::constant(&imm.imm64, 64);
245     expand.legalize(
246         def!(a = ctz.I64(x)),
247         vec![
248             def!(c_sixty_four = iconst(imm64_64)),
249             def!((index1, r2flags) = x86_bsf(x)),
250             def!(a = selectif(intcc_eq, r2flags, c_sixty_four, index1)),
251         ],
252     );
253 
254     let imm64_32 = Literal::constant(&imm.imm64, 32);
255     expand.legalize(
256         def!(a = ctz.I32(x)),
257         vec![
258             def!(c_thirty_two = iconst(imm64_32)),
259             def!((index1, r2flags) = x86_bsf(x)),
260             def!(a = selectif(intcc_eq, r2flags, c_thirty_two, index1)),
261         ],
262     );
263 
264     // Population count for baseline x86_64
265     let x = var("x");
266     let r = var("r");
267 
268     let qv3 = var("qv3");
269     let qv4 = var("qv4");
270     let qv5 = var("qv5");
271     let qv6 = var("qv6");
272     let qv7 = var("qv7");
273     let qv8 = var("qv8");
274     let qv9 = var("qv9");
275     let qv10 = var("qv10");
276     let qv11 = var("qv11");
277     let qv12 = var("qv12");
278     let qv13 = var("qv13");
279     let qv14 = var("qv14");
280     let qv15 = var("qv15");
281     let qc77 = var("qc77");
282     #[allow(non_snake_case)]
283     let qc0F = var("qc0F");
284     let qc01 = var("qc01");
285 
286     let imm64_1 = Literal::constant(&imm.imm64, 1);
287     let imm64_4 = Literal::constant(&imm.imm64, 4);
288     expand.legalize(
289         def!(r = popcnt.I64(x)),
290         vec![
291             def!(qv3 = ushr_imm(x, imm64_1)),
292             def!(qc77 = iconst(Literal::constant(&imm.imm64, 0x7777_7777_7777_7777))),
293             def!(qv4 = band(qv3, qc77)),
294             def!(qv5 = isub(x, qv4)),
295             def!(qv6 = ushr_imm(qv4, imm64_1)),
296             def!(qv7 = band(qv6, qc77)),
297             def!(qv8 = isub(qv5, qv7)),
298             def!(qv9 = ushr_imm(qv7, imm64_1)),
299             def!(qv10 = band(qv9, qc77)),
300             def!(qv11 = isub(qv8, qv10)),
301             def!(qv12 = ushr_imm(qv11, imm64_4)),
302             def!(qv13 = iadd(qv11, qv12)),
303             def!(qc0F = iconst(Literal::constant(&imm.imm64, 0x0F0F_0F0F_0F0F_0F0F))),
304             def!(qv14 = band(qv13, qc0F)),
305             def!(qc01 = iconst(Literal::constant(&imm.imm64, 0x0101_0101_0101_0101))),
306             def!(qv15 = imul(qv14, qc01)),
307             def!(r = ushr_imm(qv15, Literal::constant(&imm.imm64, 56))),
308         ],
309     );
310 
311     let lv3 = var("lv3");
312     let lv4 = var("lv4");
313     let lv5 = var("lv5");
314     let lv6 = var("lv6");
315     let lv7 = var("lv7");
316     let lv8 = var("lv8");
317     let lv9 = var("lv9");
318     let lv10 = var("lv10");
319     let lv11 = var("lv11");
320     let lv12 = var("lv12");
321     let lv13 = var("lv13");
322     let lv14 = var("lv14");
323     let lv15 = var("lv15");
324     let lc77 = var("lc77");
325     #[allow(non_snake_case)]
326     let lc0F = var("lc0F");
327     let lc01 = var("lc01");
328 
329     expand.legalize(
330         def!(r = popcnt.I32(x)),
331         vec![
332             def!(lv3 = ushr_imm(x, imm64_1)),
333             def!(lc77 = iconst(Literal::constant(&imm.imm64, 0x7777_7777))),
334             def!(lv4 = band(lv3, lc77)),
335             def!(lv5 = isub(x, lv4)),
336             def!(lv6 = ushr_imm(lv4, imm64_1)),
337             def!(lv7 = band(lv6, lc77)),
338             def!(lv8 = isub(lv5, lv7)),
339             def!(lv9 = ushr_imm(lv7, imm64_1)),
340             def!(lv10 = band(lv9, lc77)),
341             def!(lv11 = isub(lv8, lv10)),
342             def!(lv12 = ushr_imm(lv11, imm64_4)),
343             def!(lv13 = iadd(lv11, lv12)),
344             def!(lc0F = iconst(Literal::constant(&imm.imm64, 0x0F0F_0F0F))),
345             def!(lv14 = band(lv13, lc0F)),
346             def!(lc01 = iconst(Literal::constant(&imm.imm64, 0x0101_0101))),
347             def!(lv15 = imul(lv14, lc01)),
348             def!(r = ushr_imm(lv15, Literal::constant(&imm.imm64, 24))),
349         ],
350     );
351 
352     expand.custom_legalize(ineg, "convert_ineg");
353     expand.custom_legalize(tls_value, "expand_tls_value");
354     widen.custom_legalize(ineg, "convert_ineg");
355 
356     // To reduce compilation times, separate out large blocks of legalizations by theme.
357     define_simd(shared, x86_instructions, &mut narrow, &mut narrow_avx);
358 
359     expand.build_and_add_to(&mut shared.transform_groups);
360     let narrow_id = narrow.build_and_add_to(&mut shared.transform_groups);
361     narrow_avx
362         .chain_with(narrow_id)
363         .build_and_add_to(&mut shared.transform_groups);
364     widen.build_and_add_to(&mut shared.transform_groups);
365 }
366 
define_simd( shared: &mut SharedDefinitions, x86_instructions: &InstructionGroup, narrow: &mut TransformGroupBuilder, narrow_avx: &mut TransformGroupBuilder, )367 fn define_simd(
368     shared: &mut SharedDefinitions,
369     x86_instructions: &InstructionGroup,
370     narrow: &mut TransformGroupBuilder,
371     narrow_avx: &mut TransformGroupBuilder,
372 ) {
373     let insts = &shared.instructions;
374     let band = insts.by_name("band");
375     let band_not = insts.by_name("band_not");
376     let bitcast = insts.by_name("bitcast");
377     let bitselect = insts.by_name("bitselect");
378     let bor = insts.by_name("bor");
379     let bnot = insts.by_name("bnot");
380     let bxor = insts.by_name("bxor");
381     let extractlane = insts.by_name("extractlane");
382     let fabs = insts.by_name("fabs");
383     let fcmp = insts.by_name("fcmp");
384     let fcvt_from_uint = insts.by_name("fcvt_from_uint");
385     let fcvt_to_sint_sat = insts.by_name("fcvt_to_sint_sat");
386     let fcvt_to_uint_sat = insts.by_name("fcvt_to_uint_sat");
387     let fmax = insts.by_name("fmax");
388     let fmin = insts.by_name("fmin");
389     let fneg = insts.by_name("fneg");
390     let iadd_imm = insts.by_name("iadd_imm");
391     let icmp = insts.by_name("icmp");
392     let imax = insts.by_name("imax");
393     let imin = insts.by_name("imin");
394     let imul = insts.by_name("imul");
395     let ineg = insts.by_name("ineg");
396     let insertlane = insts.by_name("insertlane");
397     let ishl = insts.by_name("ishl");
398     let ishl_imm = insts.by_name("ishl_imm");
399     let raw_bitcast = insts.by_name("raw_bitcast");
400     let scalar_to_vector = insts.by_name("scalar_to_vector");
401     let splat = insts.by_name("splat");
402     let shuffle = insts.by_name("shuffle");
403     let sshr = insts.by_name("sshr");
404     let swizzle = insts.by_name("swizzle");
405     let trueif = insts.by_name("trueif");
406     let uadd_sat = insts.by_name("uadd_sat");
407     let umax = insts.by_name("umax");
408     let umin = insts.by_name("umin");
409     let snarrow = insts.by_name("snarrow");
410     let swiden_high = insts.by_name("swiden_high");
411     let swiden_low = insts.by_name("swiden_low");
412     let ushr_imm = insts.by_name("ushr_imm");
413     let ushr = insts.by_name("ushr");
414     let uwiden_high = insts.by_name("uwiden_high");
415     let uwiden_low = insts.by_name("uwiden_low");
416     let vconst = insts.by_name("vconst");
417     let vall_true = insts.by_name("vall_true");
418     let vany_true = insts.by_name("vany_true");
419     let vselect = insts.by_name("vselect");
420 
421     let x86_palignr = x86_instructions.by_name("x86_palignr");
422     let x86_pmaxs = x86_instructions.by_name("x86_pmaxs");
423     let x86_pmaxu = x86_instructions.by_name("x86_pmaxu");
424     let x86_pmins = x86_instructions.by_name("x86_pmins");
425     let x86_pminu = x86_instructions.by_name("x86_pminu");
426     let x86_pshufb = x86_instructions.by_name("x86_pshufb");
427     let x86_pshufd = x86_instructions.by_name("x86_pshufd");
428     let x86_psra = x86_instructions.by_name("x86_psra");
429     let x86_ptest = x86_instructions.by_name("x86_ptest");
430     let x86_punpckh = x86_instructions.by_name("x86_punpckh");
431     let x86_punpckl = x86_instructions.by_name("x86_punpckl");
432 
433     let imm = &shared.imm;
434 
435     // Set up variables and immediates.
436     let uimm8_zero = Literal::constant(&imm.uimm8, 0x00);
437     let uimm8_one = Literal::constant(&imm.uimm8, 0x01);
438     let uimm8_eight = Literal::constant(&imm.uimm8, 8);
439     let u128_zeroes = constant(vec![0x00; 16]);
440     let u128_ones = constant(vec![0xff; 16]);
441     let u128_seventies = constant(vec![0x70; 16]);
442     let a = var("a");
443     let b = var("b");
444     let c = var("c");
445     let d = var("d");
446     let e = var("e");
447     let f = var("f");
448     let g = var("g");
449     let h = var("h");
450     let x = var("x");
451     let y = var("y");
452     let z = var("z");
453 
454     // Limit the SIMD vector size: eventually multiple vector sizes may be supported
455     // but for now only SSE-sized vectors are available.
456     let sse_vector_size: u64 = 128;
457     let allowed_simd_type = |t: &LaneType| t.lane_bits() >= 8 && t.lane_bits() < 128;
458 
459     // SIMD splat: 8-bits
460     for ty in ValueType::all_lane_types().filter(|t| t.lane_bits() == 8) {
461         let splat_any8x16 = splat.bind(vector(ty, sse_vector_size));
462         narrow.legalize(
463             def!(y = splat_any8x16(x)),
464             vec![
465                 // Move into the lowest 8 bits of an XMM register.
466                 def!(a = scalar_to_vector(x)),
467                 // Zero out a different XMM register; the shuffle mask for moving the lowest byte
468                 // to all other byte lanes is 0x0.
469                 def!(b = vconst(u128_zeroes)),
470                 // PSHUFB takes two XMM operands, one of which is a shuffle mask (i.e. b).
471                 def!(y = x86_pshufb(a, b)),
472             ],
473         );
474     }
475 
476     // SIMD splat: 16-bits
477     for ty in ValueType::all_lane_types().filter(|t| t.lane_bits() == 16) {
478         let splat_x16x8 = splat.bind(vector(ty, sse_vector_size));
479         let raw_bitcast_any16x8_to_i32x4 = raw_bitcast
480             .bind(vector(I32, sse_vector_size))
481             .bind(vector(ty, sse_vector_size));
482         let raw_bitcast_i32x4_to_any16x8 = raw_bitcast
483             .bind(vector(ty, sse_vector_size))
484             .bind(vector(I32, sse_vector_size));
485         narrow.legalize(
486             def!(y = splat_x16x8(x)),
487             vec![
488                 // Move into the lowest 16 bits of an XMM register.
489                 def!(a = scalar_to_vector(x)),
490                 // Insert the value again but in the next lowest 16 bits.
491                 def!(b = insertlane(a, x, uimm8_one)),
492                 // No instruction emitted; pretend this is an I32x4 so we can use PSHUFD.
493                 def!(c = raw_bitcast_any16x8_to_i32x4(b)),
494                 // Broadcast the bytes in the XMM register with PSHUFD.
495                 def!(d = x86_pshufd(c, uimm8_zero)),
496                 // No instruction emitted; pretend this is an X16x8 again.
497                 def!(y = raw_bitcast_i32x4_to_any16x8(d)),
498             ],
499         );
500     }
501 
502     // SIMD splat: 32-bits
503     for ty in ValueType::all_lane_types().filter(|t| t.lane_bits() == 32) {
504         let splat_any32x4 = splat.bind(vector(ty, sse_vector_size));
505         narrow.legalize(
506             def!(y = splat_any32x4(x)),
507             vec![
508                 // Translate to an x86 MOV to get the value in an XMM register.
509                 def!(a = scalar_to_vector(x)),
510                 // Broadcast the bytes in the XMM register with PSHUFD.
511                 def!(y = x86_pshufd(a, uimm8_zero)),
512             ],
513         );
514     }
515 
516     // SIMD splat: 64-bits
517     for ty in ValueType::all_lane_types().filter(|t| t.lane_bits() == 64) {
518         let splat_any64x2 = splat.bind(vector(ty, sse_vector_size));
519         narrow.legalize(
520             def!(y = splat_any64x2(x)),
521             vec![
522                 // Move into the lowest 64 bits of an XMM register.
523                 def!(a = scalar_to_vector(x)),
524                 // Move into the highest 64 bits of the same XMM register.
525                 def!(y = insertlane(a, x, uimm8_one)),
526             ],
527         );
528     }
529 
530     // SIMD swizzle; the following inefficient implementation is due to the Wasm SIMD spec requiring
531     // mask indexes greater than 15 to have the same semantics as a 0 index. For the spec discussion,
532     // see https://github.com/WebAssembly/simd/issues/93.
533     {
534         let swizzle = swizzle.bind(vector(I8, sse_vector_size));
535         narrow.legalize(
536             def!(a = swizzle(x, y)),
537             vec![
538                 def!(b = vconst(u128_seventies)),
539                 def!(c = uadd_sat(y, b)),
540                 def!(a = x86_pshufb(x, c)),
541             ],
542         );
543     }
544 
545     // SIMD bnot
546     for ty in ValueType::all_lane_types().filter(allowed_simd_type) {
547         let bnot = bnot.bind(vector(ty, sse_vector_size));
548         narrow.legalize(
549             def!(y = bnot(x)),
550             vec![def!(a = vconst(u128_ones)), def!(y = bxor(a, x))],
551         );
552     }
553 
554     // SIMD shift right (arithmetic, i16x8 and i32x4)
555     for ty in &[I16, I32] {
556         let sshr = sshr.bind(vector(*ty, sse_vector_size));
557         let bitcast_i64x2 = bitcast.bind(vector(I64, sse_vector_size));
558         narrow.legalize(
559             def!(a = sshr(x, y)),
560             vec![def!(b = bitcast_i64x2(y)), def!(a = x86_psra(x, b))],
561         );
562     }
563     // SIMD shift right (arithmetic, i8x16)
564     {
565         let sshr = sshr.bind(vector(I8, sse_vector_size));
566         let bitcast_i64x2 = bitcast.bind(vector(I64, sse_vector_size));
567         let raw_bitcast_i16x8 = raw_bitcast.bind(vector(I16, sse_vector_size));
568         let raw_bitcast_i16x8_again = raw_bitcast.bind(vector(I16, sse_vector_size));
569         narrow.legalize(
570             def!(z = sshr(x, y)),
571             vec![
572                 // Since we will use the high byte of each 16x8 lane, shift an extra 8 bits.
573                 def!(a = iadd_imm(y, uimm8_eight)),
574                 def!(b = bitcast_i64x2(a)),
575                 // Take the low 8 bytes of x, duplicate them in 16x8 lanes, then shift right.
576                 def!(c = x86_punpckl(x, x)),
577                 def!(d = raw_bitcast_i16x8(c)),
578                 def!(e = x86_psra(d, b)),
579                 // Take the high 8 bytes of x, duplicate them in 16x8 lanes, then shift right.
580                 def!(f = x86_punpckh(x, x)),
581                 def!(g = raw_bitcast_i16x8_again(f)),
582                 def!(h = x86_psra(g, b)),
583                 // Re-pack the vector.
584                 def!(z = snarrow(e, h)),
585             ],
586         );
587     }
588     // SIMD shift right (arithmetic, i64x2)
589     {
590         let sshr_vector = sshr.bind(vector(I64, sse_vector_size));
591         let sshr_scalar_lane0 = sshr.bind(I64);
592         let sshr_scalar_lane1 = sshr.bind(I64);
593         narrow.legalize(
594             def!(z = sshr_vector(x, y)),
595             vec![
596                 // Use scalar operations to shift the first lane.
597                 def!(a = extractlane(x, uimm8_zero)),
598                 def!(b = sshr_scalar_lane0(a, y)),
599                 def!(c = insertlane(x, b, uimm8_zero)),
600                 // Do the same for the second lane.
601                 def!(d = extractlane(x, uimm8_one)),
602                 def!(e = sshr_scalar_lane1(d, y)),
603                 def!(z = insertlane(c, e, uimm8_one)),
604             ],
605         );
606     }
607 
608     // SIMD select
609     for ty in ValueType::all_lane_types().filter(allowed_simd_type) {
610         let bitselect = bitselect.bind(vector(ty, sse_vector_size)); // must bind both x/y and c
611         narrow.legalize(
612             def!(d = bitselect(c, x, y)),
613             vec![
614                 def!(a = band(x, c)),
615                 def!(b = band_not(y, c)),
616                 def!(d = bor(a, b)),
617             ],
618         );
619     }
620 
621     // SIMD vselect; replace with bitselect if BLEND* instructions are not available.
622     // This works, because each lane of boolean vector is filled with zeroes or ones.
623     for ty in ValueType::all_lane_types().filter(allowed_simd_type) {
624         let vselect = vselect.bind(vector(ty, sse_vector_size));
625         let raw_bitcast = raw_bitcast.bind(vector(ty, sse_vector_size));
626         narrow.legalize(
627             def!(d = vselect(c, x, y)),
628             vec![def!(a = raw_bitcast(c)), def!(d = bitselect(a, x, y))],
629         );
630     }
631 
632     // SIMD vany_true
633     let ne = Literal::enumerator_for(&imm.intcc, "ne");
634     for ty in ValueType::all_lane_types().filter(allowed_simd_type) {
635         let vany_true = vany_true.bind(vector(ty, sse_vector_size));
636         narrow.legalize(
637             def!(y = vany_true(x)),
638             vec![def!(a = x86_ptest(x, x)), def!(y = trueif(ne, a))],
639         );
640     }
641 
642     // SIMD vall_true
643     let eq = Literal::enumerator_for(&imm.intcc, "eq");
644     for ty in ValueType::all_lane_types().filter(allowed_simd_type) {
645         let vall_true = vall_true.bind(vector(ty, sse_vector_size));
646         if ty.is_int() {
647             // In the common case (Wasm's integer-only all_true), we do not require a
648             // bitcast.
649             narrow.legalize(
650                 def!(y = vall_true(x)),
651                 vec![
652                     def!(a = vconst(u128_zeroes)),
653                     def!(c = icmp(eq, x, a)),
654                     def!(d = x86_ptest(c, c)),
655                     def!(y = trueif(eq, d)),
656                 ],
657             );
658         } else {
659             // However, to support other types we must bitcast them to an integer vector to
660             // use icmp.
661             let lane_type_as_int = LaneType::int_from_bits(ty.lane_bits() as u16);
662             let raw_bitcast_to_int = raw_bitcast.bind(vector(lane_type_as_int, sse_vector_size));
663             narrow.legalize(
664                 def!(y = vall_true(x)),
665                 vec![
666                     def!(a = vconst(u128_zeroes)),
667                     def!(b = raw_bitcast_to_int(x)),
668                     def!(c = icmp(eq, b, a)),
669                     def!(d = x86_ptest(c, c)),
670                     def!(y = trueif(eq, d)),
671                 ],
672             );
673         }
674     }
675 
676     // SIMD icmp ne
677     let ne = Literal::enumerator_for(&imm.intcc, "ne");
678     for ty in ValueType::all_lane_types().filter(|ty| allowed_simd_type(ty) && ty.is_int()) {
679         let icmp_ = icmp.bind(vector(ty, sse_vector_size));
680         narrow.legalize(
681             def!(c = icmp_(ne, a, b)),
682             vec![def!(x = icmp(eq, a, b)), def!(c = bnot(x))],
683         );
684     }
685 
686     // SIMD icmp greater-/less-than
687     let sgt = Literal::enumerator_for(&imm.intcc, "sgt");
688     let ugt = Literal::enumerator_for(&imm.intcc, "ugt");
689     let sge = Literal::enumerator_for(&imm.intcc, "sge");
690     let uge = Literal::enumerator_for(&imm.intcc, "uge");
691     let slt = Literal::enumerator_for(&imm.intcc, "slt");
692     let ult = Literal::enumerator_for(&imm.intcc, "ult");
693     let sle = Literal::enumerator_for(&imm.intcc, "sle");
694     let ule = Literal::enumerator_for(&imm.intcc, "ule");
695     for ty in &[I8, I16, I32] {
696         // greater-than
697         let icmp_ = icmp.bind(vector(*ty, sse_vector_size));
698         narrow.legalize(
699             def!(c = icmp_(ugt, a, b)),
700             vec![
701                 def!(x = x86_pmaxu(a, b)),
702                 def!(y = icmp(eq, x, b)),
703                 def!(c = bnot(y)),
704             ],
705         );
706         let icmp_ = icmp.bind(vector(*ty, sse_vector_size));
707         narrow.legalize(
708             def!(c = icmp_(sge, a, b)),
709             vec![def!(x = x86_pmins(a, b)), def!(c = icmp(eq, x, b))],
710         );
711         let icmp_ = icmp.bind(vector(*ty, sse_vector_size));
712         narrow.legalize(
713             def!(c = icmp_(uge, a, b)),
714             vec![def!(x = x86_pminu(a, b)), def!(c = icmp(eq, x, b))],
715         );
716 
717         // less-than
718         let icmp_ = icmp.bind(vector(*ty, sse_vector_size));
719         narrow.legalize(def!(c = icmp_(slt, a, b)), vec![def!(c = icmp(sgt, b, a))]);
720         let icmp_ = icmp.bind(vector(*ty, sse_vector_size));
721         narrow.legalize(def!(c = icmp_(ult, a, b)), vec![def!(c = icmp(ugt, b, a))]);
722         let icmp_ = icmp.bind(vector(*ty, sse_vector_size));
723         narrow.legalize(def!(c = icmp_(sle, a, b)), vec![def!(c = icmp(sge, b, a))]);
724         let icmp_ = icmp.bind(vector(*ty, sse_vector_size));
725         narrow.legalize(def!(c = icmp_(ule, a, b)), vec![def!(c = icmp(uge, b, a))]);
726     }
727 
    // SIMD integer min/max: map the generic CLIF min/max instructions directly
    // onto the x86-specific packed min/max instructions for each lane type.
    // NOTE(review): I64 lanes are absent from this list — presumably no packed
    // 64-bit min/max is available at the targeted SSE level; confirm elsewhere.
    for ty in &[I8, I16, I32] {
        let imin = imin.bind(vector(*ty, sse_vector_size));
        narrow.legalize(def!(c = imin(a, b)), vec![def!(c = x86_pmins(a, b))]);
        let umin = umin.bind(vector(*ty, sse_vector_size));
        narrow.legalize(def!(c = umin(a, b)), vec![def!(c = x86_pminu(a, b))]);
        let imax = imax.bind(vector(*ty, sse_vector_size));
        narrow.legalize(def!(c = imax(a, b)), vec![def!(c = x86_pmaxs(a, b))]);
        let umax = umax.bind(vector(*ty, sse_vector_size));
        narrow.legalize(def!(c = umax(a, b)), vec![def!(c = x86_pmaxu(a, b))]);
    }
739 
    // SIMD fcmp greater-/less-than: rewrite each (un)ordered greater-than-style
    // comparison in terms of its mirrored condition with swapped operands
    // (e.g. `a > b` becomes `b < a`), leaving only the conditions the backend
    // supports directly.
    let gt = Literal::enumerator_for(&imm.floatcc, "gt");
    let lt = Literal::enumerator_for(&imm.floatcc, "lt");
    let ge = Literal::enumerator_for(&imm.floatcc, "ge");
    let le = Literal::enumerator_for(&imm.floatcc, "le");
    let ugt = Literal::enumerator_for(&imm.floatcc, "ugt");
    let ult = Literal::enumerator_for(&imm.floatcc, "ult");
    let uge = Literal::enumerator_for(&imm.floatcc, "uge");
    let ule = Literal::enumerator_for(&imm.floatcc, "ule");
    for ty in &[F32, F64] {
        let fcmp_ = fcmp.bind(vector(*ty, sse_vector_size));
        narrow.legalize(def!(c = fcmp_(gt, a, b)), vec![def!(c = fcmp(lt, b, a))]);
        let fcmp_ = fcmp.bind(vector(*ty, sse_vector_size));
        narrow.legalize(def!(c = fcmp_(ge, a, b)), vec![def!(c = fcmp(le, b, a))]);
        let fcmp_ = fcmp.bind(vector(*ty, sse_vector_size));
        narrow.legalize(def!(c = fcmp_(ult, a, b)), vec![def!(c = fcmp(ugt, b, a))]);
        let fcmp_ = fcmp.bind(vector(*ty, sse_vector_size));
        narrow.legalize(def!(c = fcmp_(ule, a, b)), vec![def!(c = fcmp(uge, b, a))]);
    }
759 
    // SIMD fneg: negate each lane by XOR-ing it with a sign-bit mask. The mask
    // is built in the same-width integer lane type: start from all-ones and
    // shift left by (lane_bits - 1), leaving only the MSB (the IEEE sign bit)
    // set in every lane.
    for ty in &[F32, F64] {
        let fneg = fneg.bind(vector(*ty, sse_vector_size));
        let lane_type_as_int = LaneType::int_from_bits(LaneType::from(*ty).lane_bits() as u16);
        let uimm8_shift = Literal::constant(&imm.uimm8, lane_type_as_int.lane_bits() as i64 - 1);
        let vconst = vconst.bind(vector(lane_type_as_int, sse_vector_size));
        let bitcast_to_float = raw_bitcast.bind(vector(*ty, sse_vector_size));
        narrow.legalize(
            def!(b = fneg(a)),
            vec![
                def!(c = vconst(u128_ones)),
                def!(d = ishl_imm(c, uimm8_shift)), // Create a mask of all 0s except the MSB.
                def!(e = bitcast_to_float(d)),      // Cast mask to the floating-point type.
                def!(b = bxor(a, e)),               // Flip the MSB.
            ],
        );
    }
776 
    // SIMD fabs: clear the sign bit of each lane by AND-ing with a mask of all
    // ones except the MSB, built by logically shifting an all-ones integer
    // vector right by one bit.
    for ty in &[F32, F64] {
        let fabs = fabs.bind(vector(*ty, sse_vector_size));
        let lane_type_as_int = LaneType::int_from_bits(LaneType::from(*ty).lane_bits() as u16);
        let vconst = vconst.bind(vector(lane_type_as_int, sse_vector_size));
        let bitcast_to_float = raw_bitcast.bind(vector(*ty, sse_vector_size));
        narrow.legalize(
            def!(b = fabs(a)),
            vec![
                def!(c = vconst(u128_ones)),
                def!(d = ushr_imm(c, uimm8_one)), // Create a mask of all 1s except the MSB.
                def!(e = bitcast_to_float(d)),    // Cast mask to the floating-point type.
                def!(b = band(a, e)),             // Unset the MSB.
            ],
        );
    }
793 
    // SIMD widen: express `*widen_high` in terms of `*widen_low` by first
    // moving the upper half of the vector into the lower half with `palignr`
    // (the `uimm8_eight` immediate is presumably an 8-byte shift — confirm at
    // its definition), then widening the (new) low lanes.
    for ty in &[I8, I16] {
        let swiden_high = swiden_high.bind(vector(*ty, sse_vector_size));
        narrow.legalize(
            def!(b = swiden_high(a)),
            vec![
                def!(c = x86_palignr(a, a, uimm8_eight)),
                def!(b = swiden_low(c)),
            ],
        );
        let uwiden_high = uwiden_high.bind(vector(*ty, sse_vector_size));
        narrow.legalize(
            def!(b = uwiden_high(a)),
            vec![
                def!(c = x86_palignr(a, a, uimm8_eight)),
                def!(b = uwiden_low(c)),
            ],
        );
    }
813 
    // Custom legalizations: these instructions cannot be expressed as simple
    // DSL rewrite patterns, so each is registered by the name of a hand-written
    // Rust legalization function (the string literals are looked up at code
    // generation time and must not be changed here).
    narrow.custom_legalize(shuffle, "convert_shuffle");
    narrow.custom_legalize(extractlane, "convert_extractlane");
    narrow.custom_legalize(insertlane, "convert_insertlane");
    narrow.custom_legalize(ineg, "convert_ineg");
    narrow.custom_legalize(ushr, "convert_ushr");
    narrow.custom_legalize(ishl, "convert_ishl");
    narrow.custom_legalize(fcvt_to_sint_sat, "expand_fcvt_to_sint_sat_vector");
    narrow.custom_legalize(fmin, "expand_minmax_vector");
    narrow.custom_legalize(fmax, "expand_minmax_vector");

    // These conversions are only custom-legalized in the AVX group; per the
    // note near the top of `define`, `x86_narrow_avx` is chained with
    // `x86_narrow` after both groups are built.
    narrow_avx.custom_legalize(imul, "convert_i64x2_imul");
    narrow_avx.custom_legalize(fcvt_from_uint, "expand_fcvt_from_uint_vector");
    narrow_avx.custom_legalize(fcvt_to_uint_sat, "expand_fcvt_to_uint_sat_vector");
827 }
828