use crate::cdsl::ast::{var, ExprBuilder, Literal};
use crate::cdsl::instructions::InstructionGroup;
use crate::cdsl::types::ValueType;
use crate::cdsl::xform::TransformGroupBuilder;
use crate::shared::types::Float::F64;
use crate::shared::types::Int::{I32, I64};
use crate::shared::Definitions as SharedDefinitions;

pub(crate) fn define(shared: &mut SharedDefinitions, x86_instructions: &InstructionGroup) {
    let mut group = TransformGroupBuilder::new(
        "x86_expand",
        r#"
    Legalize instructions by expansion.

    Use x86-specific instructions if needed."#,
    )
    .isa("x86")
    .chain_with(shared.transform_groups.by_name("expand_flags").id);

    // List of instructions.
    let insts = &shared.instructions;
    let band = insts.by_name("band");
    let bor = insts.by_name("bor");
    let clz = insts.by_name("clz");
    let ctz = insts.by_name("ctz");
    let extractlane = insts.by_name("extractlane");
    let f64const = insts.by_name("f64const");
    let fcmp = insts.by_name("fcmp");
    let fcvt_from_uint = insts.by_name("fcvt_from_uint");
    let fcvt_to_sint = insts.by_name("fcvt_to_sint");
    let fcvt_to_uint = insts.by_name("fcvt_to_uint");
    let fcvt_to_sint_sat = insts.by_name("fcvt_to_sint_sat");
    let fcvt_to_uint_sat = insts.by_name("fcvt_to_uint_sat");
    let fmax = insts.by_name("fmax");
    let fmin = insts.by_name("fmin");
    let iadd = insts.by_name("iadd");
    let iconst = insts.by_name("iconst");
    let imul = insts.by_name("imul");
    let insertlane = insts.by_name("insertlane");
    let isub = insts.by_name("isub");
    let popcnt = insts.by_name("popcnt");
    let raw_bitcast = insts.by_name("raw_bitcast");
    let scalar_to_vector = insts.by_name("scalar_to_vector");
    let sdiv = insts.by_name("sdiv");
    let selectif = insts.by_name("selectif");
    let smulhi = insts.by_name("smulhi");
    let splat = insts.by_name("splat");
    let shuffle = insts.by_name("shuffle");
    let srem = insts.by_name("srem");
    let udiv = insts.by_name("udiv");
    let umulhi = insts.by_name("umulhi");
    let ushr_imm = insts.by_name("ushr_imm");
    let urem = insts.by_name("urem");

    let x86_bsf = x86_instructions.by_name("x86_bsf");
    let x86_bsr = x86_instructions.by_name("x86_bsr");
    let x86_pshufb = x86_instructions.by_name("x86_pshufb");
    let x86_pshufd = x86_instructions.by_name("x86_pshufd");
    let x86_umulx = x86_instructions.by_name("x86_umulx");
    let x86_smulx = x86_instructions.by_name("x86_smulx");

    let imm = &shared.imm;

    // Division and remainder.
    //
    // The srem expansion requires custom code because srem INT_MIN, -1 is not
    // allowed to trap. The other ops need to check avoid_div_traps.
    group.custom_legalize(sdiv, "expand_sdivrem");
    group.custom_legalize(srem, "expand_sdivrem");
    group.custom_legalize(udiv, "expand_udivrem");
    group.custom_legalize(urem, "expand_udivrem");

    // Double length (widening) multiplication.
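    //
    // `umulhi`/`smulhi` only define the high half of the product, but x86's
    // widening multiplies produce both halves at once (conceptually RDX:RAX
    // for the one-operand forms), so they are rewritten to the x86-specific
    // `x86_umulx`/`x86_smulx`, which define both results; the unused low
    // half is expected to be removed as dead code.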
    let a = var("a");
    let x = var("x");
    let y = var("y");
    let a1 = var("a1");
    let a2 = var("a2");
    let res_lo = var("res_lo");
    let res_hi = var("res_hi");

    group.legalize(
        def!(res_hi = umulhi(x, y)),
        vec![def!((res_lo, res_hi) = x86_umulx(x, y))],
    );

    group.legalize(
        def!(res_hi = smulhi(x, y)),
        vec![def!((res_lo, res_hi) = x86_smulx(x, y))],
    );

    // Floating point condition codes.
    //
    // The 8 condition codes in `supported_floatccs` are directly supported by a
    // `ucomiss` or `ucomisd` instruction. The remaining codes need legalization
    // patterns.
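    //
    // `ucomiss`/`ucomisd` report the result through ZF/PF/CF, and an
    // unordered comparison (a NaN operand) sets the parity flag, so `eq` and
    // `ne` cannot be tested with a single condition code; below they are
    // split into an ordered/unordered pair combined with `band`/`bor`.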

    let floatcc_eq = Literal::enumerator_for(&imm.floatcc, "eq");
    let floatcc_ord = Literal::enumerator_for(&imm.floatcc, "ord");
    let floatcc_ueq = Literal::enumerator_for(&imm.floatcc, "ueq");
    let floatcc_ne = Literal::enumerator_for(&imm.floatcc, "ne");
    let floatcc_uno = Literal::enumerator_for(&imm.floatcc, "uno");
    let floatcc_one = Literal::enumerator_for(&imm.floatcc, "one");

    // Equality needs an explicit `ord` test which checks the parity bit.
    group.legalize(
        def!(a = fcmp(floatcc_eq, x, y)),
        vec![
            def!(a1 = fcmp(floatcc_ord, x, y)),
            def!(a2 = fcmp(floatcc_ueq, x, y)),
            def!(a = band(a1, a2)),
        ],
    );
    group.legalize(
        def!(a = fcmp(floatcc_ne, x, y)),
        vec![
            def!(a1 = fcmp(floatcc_uno, x, y)),
            def!(a2 = fcmp(floatcc_one, x, y)),
            def!(a = bor(a1, a2)),
        ],
    );

    let floatcc_lt = &Literal::enumerator_for(&imm.floatcc, "lt");
    let floatcc_gt = &Literal::enumerator_for(&imm.floatcc, "gt");
    let floatcc_le = &Literal::enumerator_for(&imm.floatcc, "le");
    let floatcc_ge = &Literal::enumerator_for(&imm.floatcc, "ge");
    let floatcc_ugt = &Literal::enumerator_for(&imm.floatcc, "ugt");
    let floatcc_ult = &Literal::enumerator_for(&imm.floatcc, "ult");
    let floatcc_uge = &Literal::enumerator_for(&imm.floatcc, "uge");
    let floatcc_ule = &Literal::enumerator_for(&imm.floatcc, "ule");

    // Inequalities that need to be reversed.
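    //
    // Swapping the operands turns each of these into one of the directly
    // supported codes, e.g. `fcmp lt, x, y` becomes `fcmp gt, y, x`.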
    for &(cc, rev_cc) in &[
        (floatcc_lt, floatcc_gt),
        (floatcc_le, floatcc_ge),
        (floatcc_ugt, floatcc_ult),
        (floatcc_uge, floatcc_ule),
    ] {
        group.legalize(def!(a = fcmp(cc, x, y)), vec![def!(a = fcmp(rev_cc, y, x))]);
    }

    // We need to modify the CFG for min/max legalization.
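    // (SSE `minss`/`maxss` return the second operand when either input is
    // NaN or when both inputs are zero, which is not what `fmin`/`fmax`
    // require, so the expansion introduces explicit compares and branches.)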
    group.custom_legalize(fmin, "expand_minmax");
    group.custom_legalize(fmax, "expand_minmax");

    // Conversions from unsigned need special handling.
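    // (The SSE conversions `cvtsi2ss`/`cvtsi2sd` are signed-only, so the
    // unsigned case must be built on top of the signed conversion.)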
    group.custom_legalize(fcvt_from_uint, "expand_fcvt_from_uint");
    // Conversions from float to int can trap and modify the control flow graph.
    group.custom_legalize(fcvt_to_sint, "expand_fcvt_to_sint");
    group.custom_legalize(fcvt_to_uint, "expand_fcvt_to_uint");
    group.custom_legalize(fcvt_to_sint_sat, "expand_fcvt_to_sint_sat");
    group.custom_legalize(fcvt_to_uint_sat, "expand_fcvt_to_uint_sat");

    // Count leading and trailing zeroes, for baseline x86_64
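    //
    // `x86_bsr` returns the index of the most significant set bit and sets
    // ZF when the input is zero (leaving the index undefined), so `clz`
    // becomes `63 - bsr(x)` (31 for i32), with `selectif` substituting -1
    // for the undefined index so that `clz` of zero yields 64 (or 32).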
    let c_minus_one = var("c_minus_one");
    let c_thirty_one = var("c_thirty_one");
    let c_thirty_two = var("c_thirty_two");
    let c_sixty_three = var("c_sixty_three");
    let c_sixty_four = var("c_sixty_four");
    let index1 = var("index1");
    let r2flags = var("r2flags");
    let index2 = var("index2");

    let intcc_eq = Literal::enumerator_for(&imm.intcc, "eq");
    let imm64_minus_one = Literal::constant(&imm.imm64, -1);
    let imm64_63 = Literal::constant(&imm.imm64, 63);
    group.legalize(
        def!(a = clz.I64(x)),
        vec![
            def!(c_minus_one = iconst(imm64_minus_one)),
            def!(c_sixty_three = iconst(imm64_63)),
            def!((index1, r2flags) = x86_bsr(x)),
            def!(index2 = selectif(intcc_eq, r2flags, c_minus_one, index1)),
            def!(a = isub(c_sixty_three, index2)),
        ],
    );

    let imm64_31 = Literal::constant(&imm.imm64, 31);
    group.legalize(
        def!(a = clz.I32(x)),
        vec![
            def!(c_minus_one = iconst(imm64_minus_one)),
            def!(c_thirty_one = iconst(imm64_31)),
            def!((index1, r2flags) = x86_bsr(x)),
            def!(index2 = selectif(intcc_eq, r2flags, c_minus_one, index1)),
            def!(a = isub(c_thirty_one, index2)),
        ],
    );

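    // `x86_bsf` is the trailing-bit analogue: it returns the index of the
    // least significant set bit and sets ZF for a zero input, so `ctz` can
    // select the type width (64 or 32) directly when the flags indicate a
    // zero input.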
    let imm64_64 = Literal::constant(&imm.imm64, 64);
    group.legalize(
        def!(a = ctz.I64(x)),
        vec![
            def!(c_sixty_four = iconst(imm64_64)),
            def!((index1, r2flags) = x86_bsf(x)),
            def!(a = selectif(intcc_eq, r2flags, c_sixty_four, index1)),
        ],
    );

    let imm64_32 = Literal::constant(&imm.imm64, 32);
    group.legalize(
        def!(a = ctz.I32(x)),
        vec![
            def!(c_thirty_two = iconst(imm64_32)),
            def!((index1, r2flags) = x86_bsf(x)),
            def!(a = selectif(intcc_eq, r2flags, c_thirty_two, index1)),
        ],
    );

    // Population count for baseline x86_64
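    //
    // This is the classic SWAR bit-twiddling expansion: three shift, mask,
    // and subtract steps leave a per-nibble population count, adjacent
    // nibbles are then summed and masked with 0x0F...0F, and a multiply by
    // 0x01...01 accumulates every byte's count into the top byte, which the
    // final shift extracts.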
    let qv1 = var("qv1");
    let qv3 = var("qv3");
    let qv4 = var("qv4");
    let qv5 = var("qv5");
    let qv6 = var("qv6");
    let qv7 = var("qv7");
    let qv8 = var("qv8");
    let qv9 = var("qv9");
    let qv10 = var("qv10");
    let qv11 = var("qv11");
    let qv12 = var("qv12");
    let qv13 = var("qv13");
    let qv14 = var("qv14");
    let qv15 = var("qv15");
    let qv16 = var("qv16");
    let qc77 = var("qc77");
    #[allow(non_snake_case)]
    let qc0F = var("qc0F");
    let qc01 = var("qc01");

    let imm64_1 = Literal::constant(&imm.imm64, 1);
    let imm64_4 = Literal::constant(&imm.imm64, 4);
    group.legalize(
        def!(qv16 = popcnt.I64(qv1)),
        vec![
            def!(qv3 = ushr_imm(qv1, imm64_1)),
            def!(qc77 = iconst(Literal::constant(&imm.imm64, 0x7777777777777777))),
            def!(qv4 = band(qv3, qc77)),
            def!(qv5 = isub(qv1, qv4)),
            def!(qv6 = ushr_imm(qv4, imm64_1)),
            def!(qv7 = band(qv6, qc77)),
            def!(qv8 = isub(qv5, qv7)),
            def!(qv9 = ushr_imm(qv7, imm64_1)),
            def!(qv10 = band(qv9, qc77)),
            def!(qv11 = isub(qv8, qv10)),
            def!(qv12 = ushr_imm(qv11, imm64_4)),
            def!(qv13 = iadd(qv11, qv12)),
            def!(qc0F = iconst(Literal::constant(&imm.imm64, 0x0F0F0F0F0F0F0F0F))),
            def!(qv14 = band(qv13, qc0F)),
            def!(qc01 = iconst(Literal::constant(&imm.imm64, 0x0101010101010101))),
            def!(qv15 = imul(qv14, qc01)),
            def!(qv16 = ushr_imm(qv15, Literal::constant(&imm.imm64, 56))),
        ],
    );

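    // The same expansion for 32-bit values, using 32-bit masks and a final
    // shift of 24 to pull the accumulated count out of the top byte.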
    let lv1 = var("lv1");
    let lv3 = var("lv3");
    let lv4 = var("lv4");
    let lv5 = var("lv5");
    let lv6 = var("lv6");
    let lv7 = var("lv7");
    let lv8 = var("lv8");
    let lv9 = var("lv9");
    let lv10 = var("lv10");
    let lv11 = var("lv11");
    let lv12 = var("lv12");
    let lv13 = var("lv13");
    let lv14 = var("lv14");
    let lv15 = var("lv15");
    let lv16 = var("lv16");
    let lc77 = var("lc77");
    #[allow(non_snake_case)]
    let lc0F = var("lc0F");
    let lc01 = var("lc01");

    group.legalize(
        def!(lv16 = popcnt.I32(lv1)),
        vec![
            def!(lv3 = ushr_imm(lv1, imm64_1)),
            def!(lc77 = iconst(Literal::constant(&imm.imm64, 0x77777777))),
            def!(lv4 = band(lv3, lc77)),
            def!(lv5 = isub(lv1, lv4)),
            def!(lv6 = ushr_imm(lv4, imm64_1)),
            def!(lv7 = band(lv6, lc77)),
            def!(lv8 = isub(lv5, lv7)),
            def!(lv9 = ushr_imm(lv7, imm64_1)),
            def!(lv10 = band(lv9, lc77)),
            def!(lv11 = isub(lv8, lv10)),
            def!(lv12 = ushr_imm(lv11, imm64_4)),
            def!(lv13 = iadd(lv11, lv12)),
            def!(lc0F = iconst(Literal::constant(&imm.imm64, 0x0F0F0F0F))),
            def!(lv14 = band(lv13, lc0F)),
            def!(lc01 = iconst(Literal::constant(&imm.imm64, 0x01010101))),
            def!(lv15 = imul(lv14, lc01)),
            def!(lv16 = ushr_imm(lv15, Literal::constant(&imm.imm64, 24))),
        ],
    );

    group.build_and_add_to(&mut shared.transform_groups);

    let mut narrow = TransformGroupBuilder::new(
        "x86_narrow",
        r#"
    Legalize instructions by narrowing.

    Use x86-specific instructions if needed."#,
    )
    .isa("x86")
    .chain_with(shared.transform_groups.by_name("narrow_flags").id);

    // SIMD
    let uimm8_zero = Literal::constant(&imm.uimm8, 0x00);
    let uimm8_one = Literal::constant(&imm.uimm8, 0x01);
    let ieee64_zero = Literal::constant(&imm.ieee64, 0x00);
    let b = var("b");
    let c = var("c");
    let d = var("d");

    // SIMD vector size: eventually multiple vector sizes may be supported but
    // for now only SSE-sized vectors are available.
    let sse_vector_size: u64 = 128;

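    // SIMD splat strategy: move the scalar into the low lane of an XMM
    // register, then broadcast it with whatever shuffle fits the lane width:
    // PSHUFB for 8-bit lanes, a lane insert plus PSHUFD for 16-bit lanes,
    // PSHUFD alone for 32-bit lanes, and a single lane insert for 64-bit
    // lanes.
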
    // SIMD splat: 8-bits
    for ty in ValueType::all_lane_types().filter(|t| t.lane_bits() == 8) {
        let splat_any8x16 = splat.bind_vector_from_lane(ty, sse_vector_size);
        let bitcast_f64_to_any8x16 = raw_bitcast
            .bind_vector_from_lane(ty, sse_vector_size)
            .bind(F64);
        narrow.legalize(
            def!(y = splat_any8x16(x)),
            vec![
                def!(a = scalar_to_vector(x)), // move into the lowest 8 bits of an XMM register
                // TODO replace the following two instructions with `vconst(0)` when this is possible; see https://github.com/CraneStation/cranelift/issues/1052
                def!(b = f64const(ieee64_zero)), // zero out a different XMM register; the shuffle mask for moving the lowest byte to all other byte lanes is 0x0
                def!(c = bitcast_f64_to_any8x16(b)), // no instruction emitted; informs the SSA that the 0 in b can be used as a vector of this type
                def!(y = x86_pshufb(a, c)), // PSHUFB takes two XMM operands, one of which is the shuffle mask (i.e. c, the zeroed register)
            ],
        );
    }

    // SIMD splat: 16-bits
    for ty in ValueType::all_lane_types().filter(|t| t.lane_bits() == 16) {
        let splat_x16x8 = splat.bind_vector_from_lane(ty, sse_vector_size);
        let raw_bitcast_any16x8_to_i32x4 = raw_bitcast
            .bind_vector_from_lane(I32, sse_vector_size)
            .bind_vector_from_lane(ty, sse_vector_size);
        let raw_bitcast_i32x4_to_any16x8 = raw_bitcast
            .bind_vector_from_lane(ty, sse_vector_size)
            .bind_vector_from_lane(I32, sse_vector_size);
        narrow.legalize(
            def!(y = splat_x16x8(x)),
            vec![
                def!(a = scalar_to_vector(x)), // move into the lowest 16 bits of an XMM register
                def!(b = insertlane(a, uimm8_one, x)), // insert the value again but in the next lowest 16 bits
                def!(c = raw_bitcast_any16x8_to_i32x4(b)), // no instruction emitted; pretend this is an I32x4 so we can use PSHUFD
                def!(d = x86_pshufd(c, uimm8_zero)), // broadcast the low doubleword (both copies of x) to all lanes with PSHUFD
                def!(y = raw_bitcast_i32x4_to_any16x8(d)), // no instruction emitted; pretend this is an any16x8 again
            ],
        );
    }

    // SIMD splat: 32-bits
    for ty in ValueType::all_lane_types().filter(|t| t.lane_bits() == 32) {
        let splat_any32x4 = splat.bind_vector_from_lane(ty, sse_vector_size);
        narrow.legalize(
            def!(y = splat_any32x4(x)),
            vec![
                def!(a = scalar_to_vector(x)), // translate to an x86 MOV to get the value in an XMM register
                def!(y = x86_pshufd(a, uimm8_zero)), // broadcast the lowest 32-bit lane to all lanes with PSHUFD
            ],
        );
    }

    // SIMD splat: 64-bits
    for ty in ValueType::all_lane_types().filter(|t| t.lane_bits() == 64) {
        let splat_any64x2 = splat.bind_vector_from_lane(ty, sse_vector_size);
        narrow.legalize(
            def!(y = splat_any64x2(x)),
            vec![
                def!(a = scalar_to_vector(x)), // move into the lowest 64 bits of an XMM register
                def!(y = insertlane(a, uimm8_one, x)), // move into the highest 64 bits of the same XMM register
            ],
        );
    }

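    // These lane operations depend on the operand types and immediates, so
    // they are handled by custom legalization code rather than by patterns.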
    narrow.custom_legalize(shuffle, "convert_shuffle");
    narrow.custom_legalize(extractlane, "convert_extractlane");
    narrow.custom_legalize(insertlane, "convert_insertlane");

    narrow.build_and_add_to(&mut shared.transform_groups);
}