#![allow(non_snake_case)]

use cranelift_codegen_shared::condcodes::IntCC;
use std::collections::HashMap;

use crate::cdsl::encodings::{Encoding, EncodingBuilder};
use crate::cdsl::instructions::{
    vector, Bindable, Immediate, InstSpec, Instruction, InstructionGroup, InstructionPredicate,
    InstructionPredicateNode, InstructionPredicateRegistry,
};
use crate::cdsl::recipes::{EncodingRecipe, EncodingRecipeNumber, Recipes};
use crate::cdsl::settings::{SettingGroup, SettingPredicateNumber};
use crate::cdsl::types::{LaneType, ValueType};
use crate::shared::types::Bool::{B1, B16, B32, B64, B8};
use crate::shared::types::Float::{F32, F64};
use crate::shared::types::Int::{I16, I32, I64, I8};
use crate::shared::types::Reference::{R32, R64};
use crate::shared::Definitions as SharedDefinitions;

use crate::isa::x86::opcodes::*;

use super::recipes::{RecipeGroup, Template};
use crate::cdsl::instructions::BindParameter::Any;

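/// Accumulates the encodings for the two x86 CPU modes (X86_32 and X86_64),
/// along with the recipes and instruction predicates they reference.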
pub(crate) struct PerCpuModeEncodings {
    pub enc32: Vec<Encoding>,
    pub enc64: Vec<Encoding>,
    pub recipes: Recipes,
    recipes_by_name: HashMap<String, EncodingRecipeNumber>,
    pub inst_pred_reg: InstructionPredicateRegistry,
}

impl PerCpuModeEncodings {
    fn new() -> Self {
        Self {
            enc32: Vec::new(),
            enc64: Vec::new(),
            recipes: Recipes::new(),
            recipes_by_name: HashMap::new(),
            inst_pred_reg: InstructionPredicateRegistry::new(),
        }
    }

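    /// Interns `recipe` by name: if a recipe with the same name was already
    /// added, asserts that the two definitions match and reuses its number;
    /// otherwise the recipe is pushed and a fresh number is returned.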
    fn add_recipe(&mut self, recipe: EncodingRecipe) -> EncodingRecipeNumber {
        if let Some(found_index) = self.recipes_by_name.get(&recipe.name) {
            assert!(
                self.recipes[*found_index] == recipe,
                "trying to insert different recipes with the same name ({})",
                recipe.name
            );
            *found_index
        } else {
            let recipe_name = recipe.name.clone();
            let index = self.recipes.push(recipe);
            self.recipes_by_name.insert(recipe_name, index);
            index
        }
    }

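    /// Builds an `Encoding` from an instruction/template pair, letting
    /// `builder_closure` customize the `EncodingBuilder` (e.g. attach
    /// predicates) before it is finalized against the recipe and predicate
    /// registries.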
    fn make_encoding<T>(
        &mut self,
        inst: InstSpec,
        template: Template,
        builder_closure: T,
    ) -> Encoding
    where
        T: FnOnce(EncodingBuilder) -> EncodingBuilder,
    {
        let (recipe, bits) = template.build();
        let recipe_number = self.add_recipe(recipe);
        let builder = EncodingBuilder::new(inst, recipe_number, bits);
        builder_closure(builder).build(&self.recipes, &mut self.inst_pred_reg)
    }

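    /// Adds an encoding to the X86_32 table, after letting `builder_closure`
    /// customize the `EncodingBuilder` (e.g. attach an ISA or instruction
    /// predicate).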
    fn enc32_func<T>(&mut self, inst: impl Into<InstSpec>, template: Template, builder_closure: T)
    where
        T: FnOnce(EncodingBuilder) -> EncodingBuilder,
    {
        let encoding = self.make_encoding(inst.into(), template, builder_closure);
        self.enc32.push(encoding);
    }
    fn enc32(&mut self, inst: impl Into<InstSpec>, template: Template) {
        self.enc32_func(inst, template, |x| x);
    }
    fn enc32_isap(
        &mut self,
        inst: impl Into<InstSpec>,
        template: Template,
        isap: SettingPredicateNumber,
    ) {
        self.enc32_func(inst, template, |encoding| encoding.isa_predicate(isap));
    }
    fn enc32_instp(
        &mut self,
        inst: impl Into<InstSpec>,
        template: Template,
        instp: InstructionPredicateNode,
    ) {
        self.enc32_func(inst, template, |encoding| encoding.inst_predicate(instp));
    }
    fn enc32_rec(&mut self, inst: impl Into<InstSpec>, recipe: &EncodingRecipe, bits: u16) {
        let recipe_number = self.add_recipe(recipe.clone());
        let builder = EncodingBuilder::new(inst.into(), recipe_number, bits);
        let encoding = builder.build(&self.recipes, &mut self.inst_pred_reg);
        self.enc32.push(encoding);
    }

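    // The enc64_* methods below mirror the enc32_* methods above, but add the
    // encodings to the X86_64 table instead.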
    fn enc64_func<T>(&mut self, inst: impl Into<InstSpec>, template: Template, builder_closure: T)
    where
        T: FnOnce(EncodingBuilder) -> EncodingBuilder,
    {
        let encoding = self.make_encoding(inst.into(), template, builder_closure);
        self.enc64.push(encoding);
    }
    fn enc64(&mut self, inst: impl Into<InstSpec>, template: Template) {
        self.enc64_func(inst, template, |x| x);
    }
    fn enc64_isap(
        &mut self,
        inst: impl Into<InstSpec>,
        template: Template,
        isap: SettingPredicateNumber,
    ) {
        self.enc64_func(inst, template, |encoding| encoding.isa_predicate(isap));
    }
    fn enc64_instp(
        &mut self,
        inst: impl Into<InstSpec>,
        template: Template,
        instp: InstructionPredicateNode,
    ) {
        self.enc64_func(inst, template, |encoding| encoding.inst_predicate(instp));
    }
    fn enc64_rec(&mut self, inst: impl Into<InstSpec>, recipe: &EncodingRecipe, bits: u16) {
        let recipe_number = self.add_recipe(recipe.clone());
        let builder = EncodingBuilder::new(inst.into(), recipe_number, bits);
        let encoding = builder.build(&self.recipes, &mut self.inst_pred_reg);
        self.enc64.push(encoding);
    }

    /// Adds I32/I64 encodings as appropriate for a typed instruction.
    /// The REX prefix is always inferred at runtime.
    ///
    /// Add encodings for `inst.i32` to X86_32.
    /// Add encodings for `inst.i32` to X86_64 with optional, inferred REX.
    /// Add encodings for `inst.i64` to X86_64 with a REX.W prefix.
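    ///
    /// For example, `e.enc_i32_i64(copy, rec_umr.opcodes(&MOV_STORE))` in
    /// `define_moves` below adds all three of these encodings for `copy`.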
    fn enc_i32_i64(&mut self, inst: impl Into<InstSpec>, template: Template) {
        let inst: InstSpec = inst.into();

        // I32 on x86: no REX prefix.
        self.enc32(inst.bind(I32), template.infer_rex());

        // I32 on x86_64: REX.W unset; REX.RXB determined at runtime from registers.
        self.enc64(inst.bind(I32), template.infer_rex());

        // I64 on x86_64: REX.W set; REX.RXB determined at runtime from registers.
        self.enc64(inst.bind(I64), template.rex().w());
    }

    /// Adds I32/I64 encodings as appropriate for a typed instruction.
    /// All variants of REX prefix are explicitly emitted, not inferred.
    ///
    /// Add encodings for `inst.i32` to X86_32.
    /// Add encodings for `inst.i32` to X86_64 with and without REX.
    /// Add encodings for `inst.i64` to X86_64 with a REX.W prefix.
    fn enc_i32_i64_explicit_rex(&mut self, inst: impl Into<InstSpec>, template: Template) {
        let inst: InstSpec = inst.into();
        self.enc32(inst.bind(I32), template.nonrex());

        // REX-less encoding must come after REX encoding so we don't use it by default.
        // Otherwise reg-alloc would never use r8 and up.
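        // (The first matching encoding is preferred, so listing the REX form
        // first keeps r8 and up available by default.)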
        self.enc64(inst.bind(I32), template.rex());
        self.enc64(inst.bind(I32), template.nonrex());
        self.enc64(inst.bind(I64), template.rex().w());
    }

    /// Adds B32/B64 encodings as appropriate for a typed instruction.
    /// The REX prefix is always inferred at runtime.
    ///
    /// Adds encoding for `inst.b32` to X86_32.
    /// Adds encoding for `inst.b32` to X86_64 with optional, inferred REX.
    /// Adds encoding for `inst.b64` to X86_64 with a REX.W prefix.
    fn enc_b32_b64(&mut self, inst: impl Into<InstSpec>, template: Template) {
        let inst: InstSpec = inst.into();

        // B32 on x86: no REX prefix.
        self.enc32(inst.bind(B32), template.infer_rex());

        // B32 on x86_64: REX.W unset; REX.RXB determined at runtime from registers.
        self.enc64(inst.bind(B32), template.infer_rex());

        // B64 on x86_64: REX.W set; REX.RXB determined at runtime from registers.
        self.enc64(inst.bind(B64), template.rex().w());
    }

    /// Add encodings for `inst.i32` to X86_32.
    /// Add encodings for `inst.i32` to X86_64 with a REX prefix.
    /// Add encodings for `inst.i64` to X86_64 with a REX.W prefix.
    fn enc_i32_i64_rex_only(&mut self, inst: impl Into<InstSpec>, template: Template) {
        let inst: InstSpec = inst.into();
        self.enc32(inst.bind(I32), template.nonrex());
        self.enc64(inst.bind(I32), template.rex());
        self.enc64(inst.bind(I64), template.rex().w());
    }

    /// Add encodings for `inst.i32` to X86_32.
    /// Add encodings for `inst.i32` to X86_64 with and without REX.
    /// Add encodings for `inst.i64` to X86_64 with a REX.W prefix.
    fn enc_i32_i64_instp(
        &mut self,
        inst: &Instruction,
        template: Template,
        instp: InstructionPredicateNode,
    ) {
        self.enc32_func(inst.bind(I32), template.nonrex(), |builder| {
            builder.inst_predicate(instp.clone())
        });

        // REX-less encoding must come after REX encoding so we don't use it by default. Otherwise
        // reg-alloc would never use r8 and up.
        self.enc64_func(inst.bind(I32), template.rex(), |builder| {
            builder.inst_predicate(instp.clone())
        });
        self.enc64_func(inst.bind(I32), template.nonrex(), |builder| {
            builder.inst_predicate(instp.clone())
        });
        self.enc64_func(inst.bind(I64), template.rex().w(), |builder| {
            builder.inst_predicate(instp)
        });
    }

    /// Add encodings for `inst.r32` to X86_32.
    /// Add encodings for `inst.r32` to X86_64 with and without REX.
    /// Add encodings for `inst.r64` to X86_64 with a REX.W prefix.
    fn enc_r32_r64_instp(
        &mut self,
        inst: &Instruction,
        template: Template,
        instp: InstructionPredicateNode,
    ) {
        self.enc32_func(inst.bind(R32), template.nonrex(), |builder| {
            builder.inst_predicate(instp.clone())
        });

        // REX-less encoding must come after REX encoding so we don't use it by default. Otherwise
        // reg-alloc would never use r8 and up.
        self.enc64_func(inst.bind(R32), template.rex(), |builder| {
            builder.inst_predicate(instp.clone())
        });
        self.enc64_func(inst.bind(R32), template.nonrex(), |builder| {
            builder.inst_predicate(instp.clone())
        });
        self.enc64_func(inst.bind(R64), template.rex().w(), |builder| {
            builder.inst_predicate(instp)
        });
    }

    /// Add encodings for `inst.r32` to X86_32.
    /// Add encodings for `inst.r64` to X86_64 with a REX.W prefix.
    fn enc_r32_r64_rex_only(&mut self, inst: impl Into<InstSpec>, template: Template) {
        let inst: InstSpec = inst.into();
        self.enc32(inst.bind(R32), template.nonrex());
        self.enc64(inst.bind(R64), template.rex().w());
    }

    fn enc_r32_r64_ld_st(&mut self, inst: &Instruction, w_bit: bool, template: Template) {
        self.enc32(inst.clone().bind(R32).bind(Any), template.clone());

        // REX-less encoding must come after REX encoding so we don't use it by
        // default. Otherwise reg-alloc would never use r8 and up.
        self.enc64(inst.clone().bind(R32).bind(Any), template.clone().rex());
        self.enc64(inst.clone().bind(R32).bind(Any), template.clone());

        if w_bit {
            self.enc64(inst.clone().bind(R64).bind(Any), template.rex().w());
        } else {
            self.enc64(inst.clone().bind(R64).bind(Any), template.clone().rex());
            self.enc64(inst.clone().bind(R64).bind(Any), template);
        }
    }

    /// Add encodings for `inst` to X86_64 with and without a REX prefix.
    fn enc_x86_64(&mut self, inst: impl Into<InstSpec> + Clone, template: Template) {
        // See above comment about the ordering of rex vs non-rex encodings.
        self.enc64(inst.clone(), template.rex());
        self.enc64(inst, template);
    }

    /// Add encodings for `inst` to X86_64 with and without a REX prefix.
    fn enc_x86_64_instp(
        &mut self,
        inst: impl Clone + Into<InstSpec>,
        template: Template,
        instp: InstructionPredicateNode,
    ) {
        // See above comment about the ordering of rex vs non-rex encodings.
        self.enc64_func(inst.clone(), template.rex(), |builder| {
            builder.inst_predicate(instp.clone())
        });
        self.enc64_func(inst, template, |builder| builder.inst_predicate(instp));
    }
    fn enc_x86_64_isap(
        &mut self,
        inst: impl Clone + Into<InstSpec>,
        template: Template,
        isap: SettingPredicateNumber,
    ) {
        // See above comment about the ordering of rex vs non-rex encodings.
        self.enc64_isap(inst.clone(), template.rex(), isap);
        self.enc64_isap(inst, template, isap);
    }

    /// Add all three encodings for `inst`:
    /// - X86_32
    /// - X86_64 with and without the REX prefix.
    fn enc_both(&mut self, inst: impl Clone + Into<InstSpec>, template: Template) {
        self.enc32(inst.clone(), template.clone());
        self.enc_x86_64(inst, template);
    }
    fn enc_both_isap(
        &mut self,
        inst: impl Clone + Into<InstSpec>,
        template: Template,
        isap: SettingPredicateNumber,
    ) {
        self.enc32_isap(inst.clone(), template.clone(), isap);
        self.enc_x86_64_isap(inst, template, isap);
    }
    fn enc_both_instp(
        &mut self,
        inst: impl Clone + Into<InstSpec>,
        template: Template,
        instp: InstructionPredicateNode,
    ) {
        self.enc32_instp(inst.clone(), template.clone(), instp.clone());
        self.enc_x86_64_instp(inst, template, instp);
    }

    /// Add two encodings for `inst`:
    /// - X86_32, no REX prefix, since this is not valid in 32-bit mode.
    /// - X86_64, dynamically infer the REX prefix.
    fn enc_both_inferred(&mut self, inst: impl Clone + Into<InstSpec>, template: Template) {
        self.enc32(inst.clone(), template.clone());
        self.enc64(inst, template.infer_rex());
    }
    fn enc_both_inferred_maybe_isap(
        &mut self,
        inst: impl Clone + Into<InstSpec>,
        template: Template,
        isap: Option<SettingPredicateNumber>,
    ) {
        self.enc32_maybe_isap(inst.clone(), template.clone(), isap);
        self.enc64_maybe_isap(inst, template.infer_rex(), isap);
    }

    /// Add two encodings for `inst`:
    /// - X86_32
    /// - X86_64 with the REX prefix.
    fn enc_both_rex_only(&mut self, inst: impl Clone + Into<InstSpec>, template: Template) {
        self.enc32(inst.clone(), template.clone());
        self.enc64(inst, template.rex());
    }

    /// Add encodings for `inst.i32` to X86_32.
    /// Add encodings for `inst.i32` to X86_64 with and without REX.
    /// Add encodings for `inst.i64` to X86_64, using the `w_bit` argument to
    /// choose between a single REX.W encoding (true) and both REX and REX-less
    /// encodings (false).
    fn enc_i32_i64_ld_st(&mut self, inst: &Instruction, w_bit: bool, template: Template) {
        self.enc32(inst.clone().bind(I32).bind(Any), template.clone());

        // REX-less encoding must come after REX encoding so we don't use it by
        // default. Otherwise reg-alloc would never use r8 and up.
        self.enc64(inst.clone().bind(I32).bind(Any), template.clone().rex());
        self.enc64(inst.clone().bind(I32).bind(Any), template.clone());

        if w_bit {
            self.enc64(inst.clone().bind(I64).bind(Any), template.rex().w());
        } else {
            self.enc64(inst.clone().bind(I64).bind(Any), template.clone().rex());
            self.enc64(inst.clone().bind(I64).bind(Any), template);
        }
    }

    /// Add the same encoding/recipe pairing to both X86_32 and X86_64
    fn enc_32_64_rec(
        &mut self,
        inst: impl Clone + Into<InstSpec>,
        recipe: &EncodingRecipe,
        bits: u16,
    ) {
        self.enc32_rec(inst.clone(), recipe, bits);
        self.enc64_rec(inst, recipe, bits);
    }

    /// Add the same encoding to both X86_32 and X86_64; assumes configuration (e.g. REX, operand
    /// binding) has already happened.
    fn enc_32_64_func<T>(
        &mut self,
        inst: impl Clone + Into<InstSpec>,
        template: Template,
        builder_closure: T,
    ) where
        T: FnOnce(EncodingBuilder) -> EncodingBuilder,
    {
        let encoding = self.make_encoding(inst.into(), template, builder_closure);
        self.enc32.push(encoding.clone());
        self.enc64.push(encoding);
    }

    /// Add the same encoding to both X86_32 and X86_64; assumes configuration (e.g. REX, operand
    /// binding) has already happened.
    fn enc_32_64_maybe_isap(
        &mut self,
        inst: impl Clone + Into<InstSpec>,
        template: Template,
        isap: Option<SettingPredicateNumber>,
    ) {
        self.enc32_maybe_isap(inst.clone(), template.clone(), isap);
        self.enc64_maybe_isap(inst, template, isap);
    }

    fn enc32_maybe_isap(
        &mut self,
        inst: impl Into<InstSpec>,
        template: Template,
        isap: Option<SettingPredicateNumber>,
    ) {
        match isap {
            None => self.enc32(inst, template),
            Some(isap) => self.enc32_isap(inst, template, isap),
        }
    }

    fn enc64_maybe_isap(
        &mut self,
        inst: impl Into<InstSpec>,
        template: Template,
        isap: Option<SettingPredicateNumber>,
    ) {
        match isap {
            None => self.enc64(inst, template),
            Some(isap) => self.enc64_isap(inst, template, isap),
        }
    }
}

// Definitions.

#[inline(never)]
fn define_moves(e: &mut PerCpuModeEncodings, shared_defs: &SharedDefinitions, r: &RecipeGroup) {
    let shared = &shared_defs.instructions;
    let formats = &shared_defs.formats;

    // Shorthands for instructions.
    let bconst = shared.by_name("bconst");
    let bint = shared.by_name("bint");
    let copy = shared.by_name("copy");
    let copy_special = shared.by_name("copy_special");
    let copy_to_ssa = shared.by_name("copy_to_ssa");
    let get_pinned_reg = shared.by_name("get_pinned_reg");
    let iconst = shared.by_name("iconst");
    let ireduce = shared.by_name("ireduce");
    let regmove = shared.by_name("regmove");
    let sextend = shared.by_name("sextend");
    let set_pinned_reg = shared.by_name("set_pinned_reg");
    let uextend = shared.by_name("uextend");
    let dummy_sarg_t = shared.by_name("dummy_sarg_t");

    // Shorthands for recipes.
    let rec_copysp = r.template("copysp");
    let rec_furm_reg_to_ssa = r.template("furm_reg_to_ssa");
    let rec_get_pinned_reg = r.recipe("get_pinned_reg");
    let rec_null = r.recipe("null");
    let rec_pu_id = r.template("pu_id");
    let rec_pu_id_bool = r.template("pu_id_bool");
    let rec_pu_iq = r.template("pu_iq");
    let rec_rmov = r.template("rmov");
    let rec_set_pinned_reg = r.template("set_pinned_reg");
    let rec_u_id = r.template("u_id");
    let rec_u_id_z = r.template("u_id_z");
    let rec_umr = r.template("umr");
    let rec_umr_reg_to_ssa = r.template("umr_reg_to_ssa");
    let rec_urm_noflags = r.template("urm_noflags");
    let rec_urm_noflags_abcd = r.template("urm_noflags_abcd");
    let rec_dummy_sarg_t = r.recipe("dummy_sarg_t");

    // The pinned reg is fixed to a value that is entirely user-controlled, so reading it with
    // `get_pinned_reg` generates no code!
    e.enc64_rec(get_pinned_reg.bind(I64), rec_get_pinned_reg, 0);
    e.enc_x86_64(
        set_pinned_reg.bind(I64),
        rec_set_pinned_reg.opcodes(&MOV_STORE).rex().w(),
    );

    e.enc_i32_i64(copy, rec_umr.opcodes(&MOV_STORE));
    e.enc_r32_r64_rex_only(copy, rec_umr.opcodes(&MOV_STORE));
    e.enc_both(copy.bind(B1), rec_umr.opcodes(&MOV_STORE));
    e.enc_both(copy.bind(I8), rec_umr.opcodes(&MOV_STORE));
    e.enc_both(copy.bind(I16), rec_umr.opcodes(&MOV_STORE));

    // TODO For x86-64, only define REX forms for now, since we can't describe the
    // special regunit immediate operands with the current constraint language.
    for &ty in &[I8, I16, I32] {
        e.enc32(regmove.bind(ty), rec_rmov.opcodes(&MOV_STORE));
        e.enc64(regmove.bind(ty), rec_rmov.opcodes(&MOV_STORE).rex());
    }
    for &ty in &[B8, B16, B32] {
        e.enc32(regmove.bind(ty), rec_rmov.opcodes(&MOV_STORE));
        e.enc64(regmove.bind(ty), rec_rmov.opcodes(&MOV_STORE).rex());
    }
    e.enc64(regmove.bind(I64), rec_rmov.opcodes(&MOV_STORE).rex().w());
    e.enc_both(regmove.bind(B1), rec_rmov.opcodes(&MOV_STORE));
    e.enc_both(regmove.bind(I8), rec_rmov.opcodes(&MOV_STORE));
    e.enc32(regmove.bind(R32), rec_rmov.opcodes(&MOV_STORE));
    e.enc64(regmove.bind(R32), rec_rmov.opcodes(&MOV_STORE).rex());
    e.enc64(regmove.bind(R64), rec_rmov.opcodes(&MOV_STORE).rex().w());

    // Immediate constants.
    e.enc32(iconst.bind(I32), rec_pu_id.opcodes(&MOV_IMM));

    e.enc64(iconst.bind(I32), rec_pu_id.rex().opcodes(&MOV_IMM));
    e.enc64(iconst.bind(I32), rec_pu_id.opcodes(&MOV_IMM));

    // The 32-bit immediate movl also zero-extends to 64 bits.
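    // (On x86-64, any write to a 32-bit register zeroes bits 63:32, so a plain
    // `movl` materializes a full 64-bit value whenever the immediate fits in 32
    // unsigned bits.)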
    let is_unsigned_int32 =
        InstructionPredicate::new_is_unsigned_int(&*formats.unary_imm, "imm", 32, 0);

    e.enc64_func(
        iconst.bind(I64),
        rec_pu_id.opcodes(&MOV_IMM).rex(),
        |encoding| encoding.inst_predicate(is_unsigned_int32.clone()),
    );
    e.enc64_func(iconst.bind(I64), rec_pu_id.opcodes(&MOV_IMM), |encoding| {
        encoding.inst_predicate(is_unsigned_int32)
    });

    // Sign-extended 32-bit immediate.
    e.enc64(
        iconst.bind(I64),
        rec_u_id.rex().opcodes(&MOV_IMM_SIGNEXTEND).rrr(0).w(),
    );

    // Finally, the MOV_IMM opcode takes an 8-byte immediate with a REX.W prefix.
    e.enc64(iconst.bind(I64), rec_pu_iq.opcodes(&MOV_IMM).rex().w());

    // Bool constants (uses MOV)
    for &ty in &[B1, B8, B16, B32] {
        e.enc_both(bconst.bind(ty), rec_pu_id_bool.opcodes(&MOV_IMM));
    }
    e.enc64(bconst.bind(B64), rec_pu_id_bool.opcodes(&MOV_IMM).rex());

    // You may expect that i8 encodings would use 0x30 (XORB) so that the operation is performed
    // on 8-bit operands (f.ex "xor %al, %al"). Cranelift currently does not know when it can
    // safely use the narrower 8-bit form, so we explicitly select a wider but permissible opcode.
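    // (This is still correct: `xor %eax, %eax` zeroes the full register, including
    // the low 8 bits read back as %al.)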
    let is_zero_int = InstructionPredicate::new_is_zero_int(&formats.unary_imm, "imm");
    e.enc_both_instp(
        iconst.bind(I8),
        rec_u_id_z.opcodes(&XOR),
        is_zero_int.clone(),
    );

    // You may expect that i16 encodings would have an 0x66 prefix on the opcode to indicate that
    // encodings should be on 16-bit operands (f.ex, "xor %ax, %ax"). Cranelift currently does not
    // know that it can drop the 0x66 prefix and clear the upper half of a 32-bit register in these
    // scenarios, so we explicitly select a wider but permissible opcode.
    //
    // This effectively formalizes the i16->i32 widening that Cranelift performs when there isn't
    // an appropriate i16 encoding available.
    e.enc_both_instp(
        iconst.bind(I16),
        rec_u_id_z.opcodes(&XOR),
        is_zero_int.clone(),
    );
    e.enc_both_instp(
        iconst.bind(I32),
        rec_u_id_z.opcodes(&XOR),
        is_zero_int.clone(),
    );
    e.enc_x86_64_instp(iconst.bind(I64), rec_u_id_z.opcodes(&XOR), is_zero_int);

    // Numerical conversions.

    // Reducing an integer is a no-op.
    e.enc32_rec(ireduce.bind(I8).bind(I16), rec_null, 0);
    e.enc32_rec(ireduce.bind(I8).bind(I32), rec_null, 0);
    e.enc32_rec(ireduce.bind(I16).bind(I32), rec_null, 0);

    e.enc64_rec(ireduce.bind(I8).bind(I16), rec_null, 0);
    e.enc64_rec(ireduce.bind(I8).bind(I32), rec_null, 0);
    e.enc64_rec(ireduce.bind(I16).bind(I32), rec_null, 0);
    e.enc64_rec(ireduce.bind(I8).bind(I64), rec_null, 0);
    e.enc64_rec(ireduce.bind(I16).bind(I64), rec_null, 0);
    e.enc64_rec(ireduce.bind(I32).bind(I64), rec_null, 0);

    // TODO: Add encodings for cbw, cwde, cdqe, which are sign-extending
    // instructions for %al/%ax/%eax to %ax/%eax/%rax.

    // movsbl
    e.enc32(
        sextend.bind(I32).bind(I8),
        rec_urm_noflags_abcd.opcodes(&MOVSX_BYTE),
    );
    e.enc64(
        sextend.bind(I32).bind(I8),
        rec_urm_noflags.opcodes(&MOVSX_BYTE).rex(),
    );
    e.enc64(
        sextend.bind(I32).bind(I8),
        rec_urm_noflags_abcd.opcodes(&MOVSX_BYTE),
    );

    // movswl
    e.enc32(
        sextend.bind(I32).bind(I16),
        rec_urm_noflags.opcodes(&MOVSX_WORD),
    );
    e.enc64(
        sextend.bind(I32).bind(I16),
        rec_urm_noflags.opcodes(&MOVSX_WORD).rex(),
    );
    e.enc64(
        sextend.bind(I32).bind(I16),
        rec_urm_noflags.opcodes(&MOVSX_WORD),
    );

    // movsbq
    e.enc64(
        sextend.bind(I64).bind(I8),
        rec_urm_noflags.opcodes(&MOVSX_BYTE).rex().w(),
    );

    // movswq
    e.enc64(
        sextend.bind(I64).bind(I16),
        rec_urm_noflags.opcodes(&MOVSX_WORD).rex().w(),
    );

    // movslq
    e.enc64(
        sextend.bind(I64).bind(I32),
        rec_urm_noflags.opcodes(&MOVSXD).rex().w(),
    );

    // movzbl
    e.enc32(
        uextend.bind(I32).bind(I8),
        rec_urm_noflags_abcd.opcodes(&MOVZX_BYTE),
    );
    e.enc64(
        uextend.bind(I32).bind(I8),
        rec_urm_noflags.opcodes(&MOVZX_BYTE).rex(),
    );
    e.enc64(
        uextend.bind(I32).bind(I8),
        rec_urm_noflags_abcd.opcodes(&MOVZX_BYTE),
    );

    // movzwl
    e.enc32(
        uextend.bind(I32).bind(I16),
        rec_urm_noflags.opcodes(&MOVZX_WORD),
    );
    e.enc64(
        uextend.bind(I32).bind(I16),
        rec_urm_noflags.opcodes(&MOVZX_WORD).rex(),
    );
    e.enc64(
        uextend.bind(I32).bind(I16),
        rec_urm_noflags.opcodes(&MOVZX_WORD),
    );

    // movzbq, encoded as movzbl because it's equivalent and shorter.
    e.enc64(
        uextend.bind(I64).bind(I8),
        rec_urm_noflags.opcodes(&MOVZX_BYTE).rex(),
    );
    e.enc64(
        uextend.bind(I64).bind(I8),
        rec_urm_noflags_abcd.opcodes(&MOVZX_BYTE),
    );

    // movzwq, encoded as movzwl because it's equivalent and shorter
    e.enc64(
        uextend.bind(I64).bind(I16),
        rec_urm_noflags.opcodes(&MOVZX_WORD).rex(),
    );
    e.enc64(
        uextend.bind(I64).bind(I16),
        rec_urm_noflags.opcodes(&MOVZX_WORD),
    );

    // A 32-bit register copy clears the high 32 bits.
    e.enc64(
        uextend.bind(I64).bind(I32),
        rec_umr.opcodes(&MOV_STORE).rex(),
    );
    e.enc64(uextend.bind(I64).bind(I32), rec_umr.opcodes(&MOV_STORE));

    // Convert bool to int.
    //
    // This assumes that b1 is represented as an 8-bit low register with the value 0
    // or 1.
    //
    // Encode movzbq as movzbl, because it's equivalent and shorter.
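    // For instance, `bint.i32.b1` zero-extends the 8-bit register holding the
    // b1 value, leaving exactly 0 or 1 in the 32-bit destination.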
    for &to in &[I8, I16, I32, I64] {
        for &from in &[B1, B8] {
            e.enc64(
                bint.bind(to).bind(from),
                rec_urm_noflags.opcodes(&MOVZX_BYTE).rex(),
            );
            e.enc64(
                bint.bind(to).bind(from),
                rec_urm_noflags_abcd.opcodes(&MOVZX_BYTE),
            );
            if to != I64 {
                e.enc32(
                    bint.bind(to).bind(from),
                    rec_urm_noflags_abcd.opcodes(&MOVZX_BYTE),
                );
            }
        }
    }
    for (to, from) in &[(I16, B16), (I32, B32), (I64, B64)] {
        e.enc_both(
            bint.bind(*to).bind(*from),
            rec_urm_noflags_abcd.opcodes(&MOVZX_BYTE),
        );
    }

    // Copy Special
    // For x86-64, only define REX forms for now, since we can't describe the
    // special regunit immediate operands with the current constraint language.
    e.enc64(copy_special, rec_copysp.opcodes(&MOV_STORE).rex().w());
    e.enc32(copy_special, rec_copysp.opcodes(&MOV_STORE));

    // Copy to SSA.  These have to be done with special _rex_only encoders, because the standard
    // machinery for deciding whether a REX.{RXB} prefix is needed doesn't take into account
    // the source register, which is specified directly in the instruction.
    e.enc_i32_i64_rex_only(copy_to_ssa, rec_umr_reg_to_ssa.opcodes(&MOV_STORE));
    e.enc_r32_r64_rex_only(copy_to_ssa, rec_umr_reg_to_ssa.opcodes(&MOV_STORE));
    e.enc_both_rex_only(copy_to_ssa.bind(B1), rec_umr_reg_to_ssa.opcodes(&MOV_STORE));
    e.enc_both_rex_only(copy_to_ssa.bind(I8), rec_umr_reg_to_ssa.opcodes(&MOV_STORE));
    e.enc_both_rex_only(
        copy_to_ssa.bind(I16),
        rec_umr_reg_to_ssa.opcodes(&MOV_STORE),
    );
    e.enc_both_rex_only(
        copy_to_ssa.bind(F64),
        rec_furm_reg_to_ssa.opcodes(&MOVSD_LOAD),
    );
    e.enc_both_rex_only(
        copy_to_ssa.bind(F32),
        rec_furm_reg_to_ssa.opcodes(&MOVSS_LOAD),
    );

    e.enc_32_64_rec(dummy_sarg_t, rec_dummy_sarg_t, 0);
}

#[inline(never)]
fn define_memory(
    e: &mut PerCpuModeEncodings,
    shared_defs: &SharedDefinitions,
    x86: &InstructionGroup,
    r: &RecipeGroup,
) {
    let shared = &shared_defs.instructions;
    let formats = &shared_defs.formats;

    // Shorthands for instructions.
    let adjust_sp_down = shared.by_name("adjust_sp_down");
    let adjust_sp_down_imm = shared.by_name("adjust_sp_down_imm");
    let adjust_sp_up_imm = shared.by_name("adjust_sp_up_imm");
    let copy_nop = shared.by_name("copy_nop");
    let fill = shared.by_name("fill");
    let fill_nop = shared.by_name("fill_nop");
    let istore16 = shared.by_name("istore16");
    let istore16_complex = shared.by_name("istore16_complex");
    let istore32 = shared.by_name("istore32");
    let istore32_complex = shared.by_name("istore32_complex");
    let istore8 = shared.by_name("istore8");
    let istore8_complex = shared.by_name("istore8_complex");
    let load = shared.by_name("load");
    let load_complex = shared.by_name("load_complex");
    let regfill = shared.by_name("regfill");
    let regspill = shared.by_name("regspill");
    let sload16 = shared.by_name("sload16");
    let sload16_complex = shared.by_name("sload16_complex");
    let sload32 = shared.by_name("sload32");
    let sload32_complex = shared.by_name("sload32_complex");
    let sload8 = shared.by_name("sload8");
    let sload8_complex = shared.by_name("sload8_complex");
    let spill = shared.by_name("spill");
    let store = shared.by_name("store");
    let store_complex = shared.by_name("store_complex");
    let uload16 = shared.by_name("uload16");
    let uload16_complex = shared.by_name("uload16_complex");
    let uload32 = shared.by_name("uload32");
    let uload32_complex = shared.by_name("uload32_complex");
    let uload8 = shared.by_name("uload8");
    let uload8_complex = shared.by_name("uload8_complex");
    let x86_pop = x86.by_name("x86_pop");
    let x86_push = x86.by_name("x86_push");

    // Shorthands for recipes.
    let rec_adjustsp = r.template("adjustsp");
    let rec_adjustsp_ib = r.template("adjustsp_ib");
    let rec_adjustsp_id = r.template("adjustsp_id");
    let rec_ffillnull = r.recipe("ffillnull");
    let rec_fillnull = r.recipe("fillnull");
    let rec_fillSib32 = r.template("fillSib32");
    let rec_ld = r.template("ld");
    let rec_ldDisp32 = r.template("ldDisp32");
    let rec_ldDisp8 = r.template("ldDisp8");
    let rec_ldWithIndex = r.template("ldWithIndex");
    let rec_ldWithIndexDisp32 = r.template("ldWithIndexDisp32");
    let rec_ldWithIndexDisp8 = r.template("ldWithIndexDisp8");
    let rec_popq = r.template("popq");
    let rec_pushq = r.template("pushq");
    let rec_regfill32 = r.template("regfill32");
    let rec_regspill32 = r.template("regspill32");
    let rec_spillSib32 = r.template("spillSib32");
    let rec_st = r.template("st");
    let rec_stacknull = r.recipe("stacknull");
    let rec_stDisp32 = r.template("stDisp32");
    let rec_stDisp32_abcd = r.template("stDisp32_abcd");
    let rec_stDisp8 = r.template("stDisp8");
    let rec_stDisp8_abcd = r.template("stDisp8_abcd");
    let rec_stWithIndex = r.template("stWithIndex");
    let rec_stWithIndexDisp32 = r.template("stWithIndexDisp32");
    let rec_stWithIndexDisp32_abcd = r.template("stWithIndexDisp32_abcd");
    let rec_stWithIndexDisp8 = r.template("stWithIndexDisp8");
    let rec_stWithIndexDisp8_abcd = r.template("stWithIndexDisp8_abcd");
    let rec_stWithIndex_abcd = r.template("stWithIndex_abcd");
    let rec_st_abcd = r.template("st_abcd");

    // Loads and stores.
    let is_load_complex_length_two =
        InstructionPredicate::new_length_equals(&*formats.load_complex, 2);

    for recipe in &[rec_ldWithIndex, rec_ldWithIndexDisp8, rec_ldWithIndexDisp32] {
        e.enc_i32_i64_instp(
            load_complex,
            recipe.opcodes(&MOV_LOAD),
            is_load_complex_length_two.clone(),
        );
        e.enc_r32_r64_instp(
            load_complex,
            recipe.opcodes(&MOV_LOAD),
            is_load_complex_length_two.clone(),
        );
        e.enc_x86_64_instp(
            uload32_complex,
            recipe.opcodes(&MOV_LOAD),
            is_load_complex_length_two.clone(),
        );

        e.enc64_instp(
            sload32_complex,
            recipe.opcodes(&MOVSXD).rex().w(),
            is_load_complex_length_two.clone(),
        );

        e.enc_i32_i64_instp(
            uload16_complex,
            recipe.opcodes(&MOVZX_WORD),
            is_load_complex_length_two.clone(),
        );
        e.enc_i32_i64_instp(
            sload16_complex,
            recipe.opcodes(&MOVSX_WORD),
            is_load_complex_length_two.clone(),
        );

        e.enc_i32_i64_instp(
            uload8_complex,
            recipe.opcodes(&MOVZX_BYTE),
            is_load_complex_length_two.clone(),
        );

        e.enc_i32_i64_instp(
            sload8_complex,
            recipe.opcodes(&MOVSX_BYTE),
            is_load_complex_length_two.clone(),
        );
    }

    let is_store_complex_length_three =
        InstructionPredicate::new_length_equals(&*formats.store_complex, 3);

    for recipe in &[rec_stWithIndex, rec_stWithIndexDisp8, rec_stWithIndexDisp32] {
        e.enc_i32_i64_instp(
            store_complex,
            recipe.opcodes(&MOV_STORE),
            is_store_complex_length_three.clone(),
        );
        e.enc_r32_r64_instp(
            store_complex,
            recipe.opcodes(&MOV_STORE),
            is_store_complex_length_three.clone(),
        );
        e.enc_x86_64_instp(
            istore32_complex,
            recipe.opcodes(&MOV_STORE),
            is_store_complex_length_three.clone(),
        );
        e.enc_both_instp(
            istore16_complex.bind(I32),
            recipe.opcodes(&MOV_STORE_16),
            is_store_complex_length_three.clone(),
        );
        e.enc_x86_64_instp(
            istore16_complex.bind(I64),
            recipe.opcodes(&MOV_STORE_16),
            is_store_complex_length_three.clone(),
        );
    }

    for recipe in &[
        rec_stWithIndex_abcd,
        rec_stWithIndexDisp8_abcd,
        rec_stWithIndexDisp32_abcd,
    ] {
        e.enc_both_instp(
            istore8_complex.bind(I32),
            recipe.opcodes(&MOV_BYTE_STORE),
            is_store_complex_length_three.clone(),
        );
        e.enc_x86_64_instp(
            istore8_complex.bind(I64),
            recipe.opcodes(&MOV_BYTE_STORE),
            is_store_complex_length_three.clone(),
        );
    }

    for recipe in &[rec_st, rec_stDisp8, rec_stDisp32] {
        e.enc_i32_i64_ld_st(store, true, recipe.opcodes(&MOV_STORE));
        e.enc_r32_r64_ld_st(store, true, recipe.opcodes(&MOV_STORE));
        e.enc_x86_64(istore32.bind(I64).bind(Any), recipe.opcodes(&MOV_STORE));
        e.enc_i32_i64_ld_st(istore16, false, recipe.opcodes(&MOV_STORE_16));
    }

    // Byte stores are more complicated because the set of registers they can address
    // depends on the presence of a REX prefix. The st*_abcd recipes fall back to
    // the corresponding st* recipes when a REX prefix is applied.
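    // (Without a REX prefix, the byte forms of %sil, %dil, %spl and %bpl cannot be
    // encoded; those ModRM values select %ah, %ch, %dh and %bh instead. With a REX
    // prefix, all 16 byte registers are addressable.)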

    for recipe in &[rec_st_abcd, rec_stDisp8_abcd, rec_stDisp32_abcd] {
        e.enc_both(istore8.bind(I32).bind(Any), recipe.opcodes(&MOV_BYTE_STORE));
        e.enc_x86_64(istore8.bind(I64).bind(Any), recipe.opcodes(&MOV_BYTE_STORE));
    }

    e.enc_i32_i64_explicit_rex(spill, rec_spillSib32.opcodes(&MOV_STORE));
    e.enc_i32_i64_explicit_rex(regspill, rec_regspill32.opcodes(&MOV_STORE));
    e.enc_r32_r64_rex_only(spill, rec_spillSib32.opcodes(&MOV_STORE));
    e.enc_r32_r64_rex_only(regspill, rec_regspill32.opcodes(&MOV_STORE));

    // Use a 32-bit write for spilling `b1`, `i8` and `i16` to avoid
    // constraining the permitted registers.
    // See MIN_SPILL_SLOT_SIZE which makes this safe.

    e.enc_both(spill.bind(B1), rec_spillSib32.opcodes(&MOV_STORE));
    e.enc_both(regspill.bind(B1), rec_regspill32.opcodes(&MOV_STORE));
    for &ty in &[I8, I16] {
        e.enc_both(spill.bind(ty), rec_spillSib32.opcodes(&MOV_STORE));
        e.enc_both(regspill.bind(ty), rec_regspill32.opcodes(&MOV_STORE));
    }

    for recipe in &[rec_ld, rec_ldDisp8, rec_ldDisp32] {
        e.enc_i32_i64_ld_st(load, true, recipe.opcodes(&MOV_LOAD));
        e.enc_r32_r64_ld_st(load, true, recipe.opcodes(&MOV_LOAD));
        e.enc_x86_64(uload32.bind(I64), recipe.opcodes(&MOV_LOAD));
        e.enc64(sload32.bind(I64), recipe.opcodes(&MOVSXD).rex().w());
        e.enc_i32_i64_ld_st(uload16, true, recipe.opcodes(&MOVZX_WORD));
        e.enc_i32_i64_ld_st(sload16, true, recipe.opcodes(&MOVSX_WORD));
        e.enc_i32_i64_ld_st(uload8, true, recipe.opcodes(&MOVZX_BYTE));
        e.enc_i32_i64_ld_st(sload8, true, recipe.opcodes(&MOVSX_BYTE));
    }

    e.enc_i32_i64_explicit_rex(fill, rec_fillSib32.opcodes(&MOV_LOAD));
    e.enc_i32_i64_explicit_rex(regfill, rec_regfill32.opcodes(&MOV_LOAD));
    e.enc_r32_r64_rex_only(fill, rec_fillSib32.opcodes(&MOV_LOAD));
    e.enc_r32_r64_rex_only(regfill, rec_regfill32.opcodes(&MOV_LOAD));

    // No-op fills, created by late-stage redundant-fill removal.
    for &ty in &[I64, I32, I16, I8] {
        e.enc64_rec(fill_nop.bind(ty), rec_fillnull, 0);
        e.enc32_rec(fill_nop.bind(ty), rec_fillnull, 0);
    }
    e.enc64_rec(fill_nop.bind(B1), rec_fillnull, 0);
    e.enc32_rec(fill_nop.bind(B1), rec_fillnull, 0);
    for &ty in &[F64, F32] {
        e.enc64_rec(fill_nop.bind(ty), rec_ffillnull, 0);
        e.enc32_rec(fill_nop.bind(ty), rec_ffillnull, 0);
    }
    for &ty in &[R64, R32] {
        e.enc64_rec(fill_nop.bind(ty), rec_fillnull, 0);
        e.enc32_rec(fill_nop.bind(ty), rec_fillnull, 0);
    }

    // Load 32 bits from `b1`, `i8` and `i16` spill slots. See `spill.b1` above.

    e.enc_both(fill.bind(B1), rec_fillSib32.opcodes(&MOV_LOAD));
    e.enc_both(regfill.bind(B1), rec_regfill32.opcodes(&MOV_LOAD));
    for &ty in &[I8, I16] {
        e.enc_both(fill.bind(ty), rec_fillSib32.opcodes(&MOV_LOAD));
        e.enc_both(regfill.bind(ty), rec_regfill32.opcodes(&MOV_LOAD));
    }

    // Push and Pop.
    e.enc32(x86_push.bind(I32), rec_pushq.opcodes(&PUSH_REG));
    e.enc_x86_64(x86_push.bind(I64), rec_pushq.opcodes(&PUSH_REG));

    e.enc32(x86_pop.bind(I32), rec_popq.opcodes(&POP_REG));
    e.enc_x86_64(x86_pop.bind(I64), rec_popq.opcodes(&POP_REG));

    // Stack-slot-to-the-same-stack-slot copy, which is guaranteed to turn
    // into a no-op.
    // The same encoding is generated for both the 64- and 32-bit architectures.
    for &ty in &[I64, I32, I16, I8] {
        e.enc64_rec(copy_nop.bind(ty), rec_stacknull, 0);
        e.enc32_rec(copy_nop.bind(ty), rec_stacknull, 0);
    }
    for &ty in &[F64, F32] {
        e.enc64_rec(copy_nop.bind(ty), rec_stacknull, 0);
        e.enc32_rec(copy_nop.bind(ty), rec_stacknull, 0);
    }

    // Adjust SP down by a dynamic value (or up, with a negative operand).
    e.enc32(adjust_sp_down.bind(I32), rec_adjustsp.opcodes(&SUB));
    e.enc64(
        adjust_sp_down.bind(I64),
        rec_adjustsp.opcodes(&SUB).rex().w(),
    );

    // Adjust SP up by an immediate (or down, with a negative immediate).
    e.enc32(adjust_sp_up_imm, rec_adjustsp_ib.opcodes(&CMP_IMM8));
    e.enc32(adjust_sp_up_imm, rec_adjustsp_id.opcodes(&CMP_IMM));
    e.enc64(
        adjust_sp_up_imm,
        rec_adjustsp_ib.opcodes(&CMP_IMM8).rex().w(),
    );
    e.enc64(
        adjust_sp_up_imm,
        rec_adjustsp_id.opcodes(&CMP_IMM).rex().w(),
    );

    // Adjust SP down by an immediate (or up, with a negative immediate).
    e.enc32(
        adjust_sp_down_imm,
        rec_adjustsp_ib.opcodes(&CMP_IMM8).rrr(5),
    );
    e.enc32(adjust_sp_down_imm, rec_adjustsp_id.opcodes(&CMP_IMM).rrr(5));
    e.enc64(
        adjust_sp_down_imm,
        rec_adjustsp_ib.opcodes(&CMP_IMM8).rrr(5).rex().w(),
    );
    e.enc64(
        adjust_sp_down_imm,
        rec_adjustsp_id.opcodes(&CMP_IMM).rrr(5).rex().w(),
    );
}

#[inline(never)]
fn define_fpu_moves(e: &mut PerCpuModeEncodings, shared_defs: &SharedDefinitions, r: &RecipeGroup) {
    let shared = &shared_defs.instructions;

    // Shorthands for instructions.
    let bitcast = shared.by_name("bitcast");
    let copy = shared.by_name("copy");
    let regmove = shared.by_name("regmove");

    // Shorthands for recipes.
    let rec_frmov = r.template("frmov");
    let rec_frurm = r.template("frurm");
    let rec_furm = r.template("furm");
    let rec_rfumr = r.template("rfumr");

    // Floating-point moves.
    // movd
    e.enc_both(
        bitcast.bind(F32).bind(I32),
        rec_frurm.opcodes(&MOVD_LOAD_XMM),
    );
    e.enc_both(
        bitcast.bind(I32).bind(F32),
        rec_rfumr.opcodes(&MOVD_STORE_XMM),
    );

    // movq
    e.enc64(
        bitcast.bind(F64).bind(I64),
        rec_frurm.opcodes(&MOVD_LOAD_XMM).rex().w(),
    );
    e.enc64(
        bitcast.bind(I64).bind(F64),
        rec_rfumr.opcodes(&MOVD_STORE_XMM).rex().w(),
    );

    // movaps
    e.enc_both(copy.bind(F32), rec_furm.opcodes(&MOVAPS_LOAD));
    e.enc_both(copy.bind(F64), rec_furm.opcodes(&MOVAPS_LOAD));

    // TODO For x86-64, only define REX forms for now, since we can't describe the special regunit
    // immediate operands with the current constraint language.
    e.enc32(regmove.bind(F32), rec_frmov.opcodes(&MOVAPS_LOAD));
    e.enc64(regmove.bind(F32), rec_frmov.opcodes(&MOVAPS_LOAD).rex());

    // TODO For x86-64, only define REX forms for now, since we can't describe the special regunit
    // immediate operands with the current constraint language.
    e.enc32(regmove.bind(F64), rec_frmov.opcodes(&MOVAPS_LOAD));
    e.enc64(regmove.bind(F64), rec_frmov.opcodes(&MOVAPS_LOAD).rex());
}

#[inline(never)]
fn define_fpu_memory(
    e: &mut PerCpuModeEncodings,
    shared_defs: &SharedDefinitions,
    r: &RecipeGroup,
) {
    let shared = &shared_defs.instructions;

    // Shorthands for instructions.
    let fill = shared.by_name("fill");
    let load = shared.by_name("load");
    let load_complex = shared.by_name("load_complex");
    let regfill = shared.by_name("regfill");
    let regspill = shared.by_name("regspill");
    let spill = shared.by_name("spill");
    let store = shared.by_name("store");
    let store_complex = shared.by_name("store_complex");

    // Shorthands for recipes.
    let rec_ffillSib32 = r.template("ffillSib32");
    let rec_fld = r.template("fld");
    let rec_fldDisp32 = r.template("fldDisp32");
    let rec_fldDisp8 = r.template("fldDisp8");
    let rec_fldWithIndex = r.template("fldWithIndex");
    let rec_fldWithIndexDisp32 = r.template("fldWithIndexDisp32");
    let rec_fldWithIndexDisp8 = r.template("fldWithIndexDisp8");
    let rec_fregfill32 = r.template("fregfill32");
    let rec_fregspill32 = r.template("fregspill32");
    let rec_fspillSib32 = r.template("fspillSib32");
    let rec_fst = r.template("fst");
    let rec_fstDisp32 = r.template("fstDisp32");
    let rec_fstDisp8 = r.template("fstDisp8");
    let rec_fstWithIndex = r.template("fstWithIndex");
    let rec_fstWithIndexDisp32 = r.template("fstWithIndexDisp32");
    let rec_fstWithIndexDisp8 = r.template("fstWithIndexDisp8");

    // Float loads and stores.
    e.enc_both(load.bind(F32).bind(Any), rec_fld.opcodes(&MOVSS_LOAD));
    e.enc_both(load.bind(F32).bind(Any), rec_fldDisp8.opcodes(&MOVSS_LOAD));
    e.enc_both(load.bind(F32).bind(Any), rec_fldDisp32.opcodes(&MOVSS_LOAD));

    e.enc_both(
        load_complex.bind(F32),
        rec_fldWithIndex.opcodes(&MOVSS_LOAD),
    );
    e.enc_both(
        load_complex.bind(F32),
        rec_fldWithIndexDisp8.opcodes(&MOVSS_LOAD),
    );
    e.enc_both(
        load_complex.bind(F32),
        rec_fldWithIndexDisp32.opcodes(&MOVSS_LOAD),
    );

    e.enc_both(load.bind(F64).bind(Any), rec_fld.opcodes(&MOVSD_LOAD));
    e.enc_both(load.bind(F64).bind(Any), rec_fldDisp8.opcodes(&MOVSD_LOAD));
    e.enc_both(load.bind(F64).bind(Any), rec_fldDisp32.opcodes(&MOVSD_LOAD));

    e.enc_both(
        load_complex.bind(F64),
        rec_fldWithIndex.opcodes(&MOVSD_LOAD),
    );
    e.enc_both(
        load_complex.bind(F64),
        rec_fldWithIndexDisp8.opcodes(&MOVSD_LOAD),
    );
    e.enc_both(
        load_complex.bind(F64),
        rec_fldWithIndexDisp32.opcodes(&MOVSD_LOAD),
    );

    e.enc_both(store.bind(F32).bind(Any), rec_fst.opcodes(&MOVSS_STORE));
    e.enc_both(
        store.bind(F32).bind(Any),
        rec_fstDisp8.opcodes(&MOVSS_STORE),
    );
    e.enc_both(
        store.bind(F32).bind(Any),
        rec_fstDisp32.opcodes(&MOVSS_STORE),
    );

    e.enc_both(
        store_complex.bind(F32),
        rec_fstWithIndex.opcodes(&MOVSS_STORE),
    );
    e.enc_both(
        store_complex.bind(F32),
        rec_fstWithIndexDisp8.opcodes(&MOVSS_STORE),
    );
    e.enc_both(
        store_complex.bind(F32),
        rec_fstWithIndexDisp32.opcodes(&MOVSS_STORE),
    );

    e.enc_both(store.bind(F64).bind(Any), rec_fst.opcodes(&MOVSD_STORE));
    e.enc_both(
        store.bind(F64).bind(Any),
        rec_fstDisp8.opcodes(&MOVSD_STORE),
    );
    e.enc_both(
        store.bind(F64).bind(Any),
        rec_fstDisp32.opcodes(&MOVSD_STORE),
    );

    e.enc_both(
        store_complex.bind(F64),
        rec_fstWithIndex.opcodes(&MOVSD_STORE),
    );
    e.enc_both(
        store_complex.bind(F64),
        rec_fstWithIndexDisp8.opcodes(&MOVSD_STORE),
    );
    e.enc_both(
        store_complex.bind(F64),
        rec_fstWithIndexDisp32.opcodes(&MOVSD_STORE),
    );

    e.enc_both(fill.bind(F32), rec_ffillSib32.opcodes(&MOVSS_LOAD));
    e.enc_both(regfill.bind(F32), rec_fregfill32.opcodes(&MOVSS_LOAD));
    e.enc_both(fill.bind(F64), rec_ffillSib32.opcodes(&MOVSD_LOAD));
    e.enc_both(regfill.bind(F64), rec_fregfill32.opcodes(&MOVSD_LOAD));

    e.enc_both(spill.bind(F32), rec_fspillSib32.opcodes(&MOVSS_STORE));
    e.enc_both(regspill.bind(F32), rec_fregspill32.opcodes(&MOVSS_STORE));
    e.enc_both(spill.bind(F64), rec_fspillSib32.opcodes(&MOVSD_STORE));
    e.enc_both(regspill.bind(F64), rec_fregspill32.opcodes(&MOVSD_STORE));
}

#[inline(never)]
fn define_fpu_ops(
    e: &mut PerCpuModeEncodings,
    shared_defs: &SharedDefinitions,
    settings: &SettingGroup,
    x86: &InstructionGroup,
    r: &RecipeGroup,
) {
    let shared = &shared_defs.instructions;
    let formats = &shared_defs.formats;

    // Shorthands for instructions.
    let ceil = shared.by_name("ceil");
    let f32const = shared.by_name("f32const");
    let f64const = shared.by_name("f64const");
    let fadd = shared.by_name("fadd");
    let fcmp = shared.by_name("fcmp");
    let fcvt_from_sint = shared.by_name("fcvt_from_sint");
    let fdemote = shared.by_name("fdemote");
    let fdiv = shared.by_name("fdiv");
    let ffcmp = shared.by_name("ffcmp");
    let floor = shared.by_name("floor");
    let fmul = shared.by_name("fmul");
    let fpromote = shared.by_name("fpromote");
    let fsub = shared.by_name("fsub");
    let nearest = shared.by_name("nearest");
    let sqrt = shared.by_name("sqrt");
    let trunc = shared.by_name("trunc");
    let x86_cvtt2si = x86.by_name("x86_cvtt2si");
    let x86_fmax = x86.by_name("x86_fmax");
    let x86_fmin = x86.by_name("x86_fmin");

    // Shorthands for recipes.
    let rec_f32imm_z = r.template("f32imm_z");
    let rec_f64imm_z = r.template("f64imm_z");
    let rec_fa = r.template("fa");
    let rec_fcmp = r.template("fcmp");
    let rec_fcscc = r.template("fcscc");
    let rec_frurm = r.template("frurm");
    let rec_furm = r.template("furm");
    let rec_furmi_rnd = r.template("furmi_rnd");
    let rec_rfurm = r.template("rfurm");

1282     // Predicates shorthands.
1283     let use_sse41 = settings.predicate_by_name("use_sse41");
1284 
1285     // Floating-point constants equal to 0.0 can be encoded using either `xorps` or `xorpd`, for
1286     // 32-bit and 64-bit floats respectively.
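    // For example, `f32const 0.0` can become `xorps %xmm0, %xmm0`, which avoids a
    // constant-pool load entirely; `xorpd` plays the same role for `f64const 0.0`.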
    let is_zero_32_bit_float =
        InstructionPredicate::new_is_zero_32bit_float(&*formats.unary_ieee32, "imm");
    e.enc32_instp(
        f32const,
        rec_f32imm_z.opcodes(&XORPS),
        is_zero_32_bit_float.clone(),
    );

    let is_zero_64_bit_float =
        InstructionPredicate::new_is_zero_64bit_float(&*formats.unary_ieee64, "imm");
    e.enc32_instp(
        f64const,
        rec_f64imm_z.opcodes(&XORPD),
        is_zero_64_bit_float.clone(),
    );

    e.enc_x86_64_instp(f32const, rec_f32imm_z.opcodes(&XORPS), is_zero_32_bit_float);
    e.enc_x86_64_instp(f64const, rec_f64imm_z.opcodes(&XORPD), is_zero_64_bit_float);

    // cvtsi2ss
    e.enc_i32_i64(fcvt_from_sint.bind(F32), rec_frurm.opcodes(&CVTSI2SS));

    // cvtsi2sd
    e.enc_i32_i64(fcvt_from_sint.bind(F64), rec_frurm.opcodes(&CVTSI2SD));

    // cvtss2sd
    e.enc_both(fpromote.bind(F64).bind(F32), rec_furm.opcodes(&CVTSS2SD));

    // cvtsd2ss
    e.enc_both(fdemote.bind(F32).bind(F64), rec_furm.opcodes(&CVTSD2SS));

    // cvttss2si
    e.enc_both(
        x86_cvtt2si.bind(I32).bind(F32),
        rec_rfurm.opcodes(&CVTTSS2SI),
    );
    e.enc64(
        x86_cvtt2si.bind(I64).bind(F32),
        rec_rfurm.opcodes(&CVTTSS2SI).rex().w(),
    );

    // cvttsd2si
    e.enc_both(
        x86_cvtt2si.bind(I32).bind(F64),
        rec_rfurm.opcodes(&CVTTSD2SI),
    );
    e.enc64(
        x86_cvtt2si.bind(I64).bind(F64),
        rec_rfurm.opcodes(&CVTTSD2SI).rex().w(),
    );

    // Exact square roots.
    e.enc_both(sqrt.bind(F32), rec_furm.opcodes(&SQRTSS));
    e.enc_both(sqrt.bind(F64), rec_furm.opcodes(&SQRTSD));

    // Rounding. The recipe looks at the opcode to pick an immediate.
    for inst in &[nearest, floor, ceil, trunc] {
        e.enc_both_isap(inst.bind(F32), rec_furmi_rnd.opcodes(&ROUNDSS), use_sse41);
        e.enc_both_isap(inst.bind(F64), rec_furmi_rnd.opcodes(&ROUNDSD), use_sse41);
    }
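    // (In the SSE4.1 ROUNDSS/ROUNDSD immediate, bits 1:0 select the mode: 00 nearest,
    // 01 floor, 10 ceil, 11 trunc. That is presumably what the recipe derives from the
    // instruction opcode.)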

    // Binary arithmetic ops.
    e.enc_both(fadd.bind(F32), rec_fa.opcodes(&ADDSS));
    e.enc_both(fadd.bind(F64), rec_fa.opcodes(&ADDSD));

    e.enc_both(fsub.bind(F32), rec_fa.opcodes(&SUBSS));
    e.enc_both(fsub.bind(F64), rec_fa.opcodes(&SUBSD));

    e.enc_both(fmul.bind(F32), rec_fa.opcodes(&MULSS));
    e.enc_both(fmul.bind(F64), rec_fa.opcodes(&MULSD));

    e.enc_both(fdiv.bind(F32), rec_fa.opcodes(&DIVSS));
    e.enc_both(fdiv.bind(F64), rec_fa.opcodes(&DIVSD));

    e.enc_both(x86_fmin.bind(F32), rec_fa.opcodes(&MINSS));
    e.enc_both(x86_fmin.bind(F64), rec_fa.opcodes(&MINSD));

    e.enc_both(x86_fmax.bind(F32), rec_fa.opcodes(&MAXSS));
    e.enc_both(x86_fmax.bind(F64), rec_fa.opcodes(&MAXSD));

    // Comparisons.
    //
    // This only covers the condition codes in `supported_floatccs`; the rest are
    // handled by legalization patterns.
    e.enc_both(fcmp.bind(F32), rec_fcscc.opcodes(&UCOMISS));
    e.enc_both(fcmp.bind(F64), rec_fcscc.opcodes(&UCOMISD));
    e.enc_both(ffcmp.bind(F32), rec_fcmp.opcodes(&UCOMISS));
    e.enc_both(ffcmp.bind(F64), rec_fcmp.opcodes(&UCOMISD));
}

#[inline(never)]
fn define_alu(
    e: &mut PerCpuModeEncodings,
    shared_defs: &SharedDefinitions,
    settings: &SettingGroup,
    x86: &InstructionGroup,
    r: &RecipeGroup,
) {
    let shared = &shared_defs.instructions;

    // Shorthands for instructions.
    let clz = shared.by_name("clz");
    let ctz = shared.by_name("ctz");
    let icmp = shared.by_name("icmp");
    let icmp_imm = shared.by_name("icmp_imm");
    let ifcmp = shared.by_name("ifcmp");
    let ifcmp_imm = shared.by_name("ifcmp_imm");
    let ifcmp_sp = shared.by_name("ifcmp_sp");
    let ishl = shared.by_name("ishl");
    let ishl_imm = shared.by_name("ishl_imm");
    let popcnt = shared.by_name("popcnt");
    let rotl = shared.by_name("rotl");
    let rotl_imm = shared.by_name("rotl_imm");
    let rotr = shared.by_name("rotr");
    let rotr_imm = shared.by_name("rotr_imm");
    let selectif = shared.by_name("selectif");
    let selectif_spectre_guard = shared.by_name("selectif_spectre_guard");
    let sshr = shared.by_name("sshr");
    let sshr_imm = shared.by_name("sshr_imm");
    let trueff = shared.by_name("trueff");
    let trueif = shared.by_name("trueif");
    let ushr = shared.by_name("ushr");
    let ushr_imm = shared.by_name("ushr_imm");
    let x86_bsf = x86.by_name("x86_bsf");
    let x86_bsr = x86.by_name("x86_bsr");

    // Shorthands for recipes.
    let rec_bsf_and_bsr = r.template("bsf_and_bsr");
    let rec_cmov = r.template("cmov");
    let rec_icscc = r.template("icscc");
    let rec_icscc_ib = r.template("icscc_ib");
    let rec_icscc_id = r.template("icscc_id");
    let rec_rcmp = r.template("rcmp");
    let rec_rcmp_ib = r.template("rcmp_ib");
    let rec_rcmp_id = r.template("rcmp_id");
    let rec_rcmp_sp = r.template("rcmp_sp");
    let rec_rc = r.template("rc");
    let rec_setf_abcd = r.template("setf_abcd");
    let rec_seti_abcd = r.template("seti_abcd");
    let rec_urm = r.template("urm");

    // Predicates shorthands.
    let use_popcnt = settings.predicate_by_name("use_popcnt");
    let use_lzcnt = settings.predicate_by_name("use_lzcnt");
    let use_bmi1 = settings.predicate_by_name("use_bmi1");

    let band = shared.by_name("band");
    let band_imm = shared.by_name("band_imm");
    let band_not = shared.by_name("band_not");
    let bnot = shared.by_name("bnot");
    let bor = shared.by_name("bor");
    let bor_imm = shared.by_name("bor_imm");
    let bxor = shared.by_name("bxor");
    let bxor_imm = shared.by_name("bxor_imm");
    let iadd = shared.by_name("iadd");
    let iadd_ifcarry = shared.by_name("iadd_ifcarry");
    let iadd_ifcin = shared.by_name("iadd_ifcin");
    let iadd_ifcout = shared.by_name("iadd_ifcout");
    let iadd_imm = shared.by_name("iadd_imm");
    let imul = shared.by_name("imul");
    let isub = shared.by_name("isub");
    let isub_ifbin = shared.by_name("isub_ifbin");
    let isub_ifborrow = shared.by_name("isub_ifborrow");
    let isub_ifbout = shared.by_name("isub_ifbout");
    let x86_sdivmodx = x86.by_name("x86_sdivmodx");
    let x86_smulx = x86.by_name("x86_smulx");
    let x86_udivmodx = x86.by_name("x86_udivmodx");
    let x86_umulx = x86.by_name("x86_umulx");

    let rec_div = r.template("div");
    let rec_fa = r.template("fa");
    let rec_fax = r.template("fax");
    let rec_mulx = r.template("mulx");
    let rec_r_ib = r.template("r_ib");
    let rec_r_id = r.template("r_id");
    let rec_rin = r.template("rin");
    let rec_rio = r.template("rio");
    let rec_rout = r.template("rout");
    let rec_rr = r.template("rr");
    let rec_rrx = r.template("rrx");
    let rec_ur = r.template("ur");

    e.enc_i32_i64(iadd, rec_rr.opcodes(&ADD));
    e.enc_i32_i64(iadd_ifcout, rec_rout.opcodes(&ADD));
    e.enc_i32_i64(iadd_ifcin, rec_rin.opcodes(&ADC));
    e.enc_i32_i64(iadd_ifcarry, rec_rio.opcodes(&ADC));
    e.enc_i32_i64(iadd_imm, rec_r_ib.opcodes(&ADD_IMM8_SIGN_EXTEND).rrr(0));
    e.enc_i32_i64(iadd_imm, rec_r_id.opcodes(&ADD_IMM).rrr(0));

    e.enc_i32_i64(isub, rec_rr.opcodes(&SUB));
    e.enc_i32_i64(isub_ifbout, rec_rout.opcodes(&SUB));
    e.enc_i32_i64(isub_ifbin, rec_rin.opcodes(&SBB));
    e.enc_i32_i64(isub_ifborrow, rec_rio.opcodes(&SBB));

    e.enc_i32_i64(band, rec_rr.opcodes(&AND));
    e.enc_b32_b64(band, rec_rr.opcodes(&AND));

    // TODO: band_imm.i64 with an unsigned 32-bit immediate can be encoded as band_imm.i32. Can
    // even use the single-byte immediate for 0xffff_ffXX masks.

    e.enc_i32_i64(band_imm, rec_r_ib.opcodes(&AND_IMM8_SIGN_EXTEND).rrr(4));
    e.enc_i32_i64(band_imm, rec_r_id.opcodes(&AND_IMM).rrr(4));

    e.enc_i32_i64(bor, rec_rr.opcodes(&OR));
    e.enc_b32_b64(bor, rec_rr.opcodes(&OR));
    e.enc_i32_i64(bor_imm, rec_r_ib.opcodes(&OR_IMM8_SIGN_EXTEND).rrr(1));
    e.enc_i32_i64(bor_imm, rec_r_id.opcodes(&OR_IMM).rrr(1));

    e.enc_i32_i64(bxor, rec_rr.opcodes(&XOR));
    e.enc_b32_b64(bxor, rec_rr.opcodes(&XOR));
    e.enc_i32_i64(bxor_imm, rec_r_ib.opcodes(&XOR_IMM8_SIGN_EXTEND).rrr(6));
    e.enc_i32_i64(bxor_imm, rec_r_id.opcodes(&XOR_IMM).rrr(6));

    // x86 has a dedicated bitwise-not instruction (NOT).
    e.enc_i32_i64(bnot, rec_ur.opcodes(&NOT).rrr(2));
    e.enc_b32_b64(bnot, rec_ur.opcodes(&NOT).rrr(2));
    e.enc_both(bnot.bind(B1), rec_ur.opcodes(&NOT).rrr(2));

    // Also add `b1` encodings for the logic instructions.
    // TODO: Should this be done with 8-bit instructions? It would improve partial register
    // dependencies.
    e.enc_both(band.bind(B1), rec_rr.opcodes(&AND));
    e.enc_both(bor.bind(B1), rec_rr.opcodes(&OR));
    e.enc_both(bxor.bind(B1), rec_rr.opcodes(&XOR));

    e.enc_i32_i64(imul, rec_rrx.opcodes(&IMUL));
    e.enc_i32_i64(x86_sdivmodx, rec_div.opcodes(&IDIV).rrr(7));
    e.enc_i32_i64(x86_udivmodx, rec_div.opcodes(&DIV).rrr(6));

    e.enc_i32_i64(x86_smulx, rec_mulx.opcodes(&IMUL_RDX_RAX).rrr(5));
    e.enc_i32_i64(x86_umulx, rec_mulx.opcodes(&MUL).rrr(4));

    // Binary bitwise ops.
    //
    // The F64 version is intentionally encoded using the single-precision opcode:
    // the operation is identical and the encoding is one byte shorter.
    e.enc_both(band.bind(F32), rec_fa.opcodes(&ANDPS));
    e.enc_both(band.bind(F64), rec_fa.opcodes(&ANDPS));

    e.enc_both(bor.bind(F32), rec_fa.opcodes(&ORPS));
    e.enc_both(bor.bind(F64), rec_fa.opcodes(&ORPS));

    e.enc_both(bxor.bind(F32), rec_fa.opcodes(&XORPS));
    e.enc_both(bxor.bind(F64), rec_fa.opcodes(&XORPS));

    // The `andnps(x,y)` instruction computes `~x&y`, while `band_not(x,y)` is `x&~y`.
    e.enc_both(band_not.bind(F32), rec_fax.opcodes(&ANDNPS));
    e.enc_both(band_not.bind(F64), rec_fax.opcodes(&ANDNPS));

    // Shifts and rotates.
    // Note that the dynamic shift amount is only masked to 5 or 6 bits; the 8-bit
    // and 16-bit shifts would need explicit masking.
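    // For example, a 32-bit `shl %cl` only uses `cl & 31` and the 64-bit form only
    // `cl & 63`, which matches CLIF semantics for i32/i64; an i8 or i16 shift amount
    // would have to be masked to 3 or 4 bits explicitly.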

    for &(inst, rrr) in &[(rotl, 0), (rotr, 1), (ishl, 4), (ushr, 5), (sshr, 7)] {
        // Cannot use enc_i32_i64 for this pattern because these instructions also
        // need to bind the shift-amount type (`Any`).
        e.enc32(inst.bind(I32).bind(I8), rec_rc.opcodes(&ROTATE_CL).rrr(rrr));
        e.enc32(
            inst.bind(I32).bind(I16),
            rec_rc.opcodes(&ROTATE_CL).rrr(rrr),
        );
        e.enc32(
            inst.bind(I32).bind(I32),
            rec_rc.opcodes(&ROTATE_CL).rrr(rrr),
        );
        e.enc64(
            inst.bind(I64).bind(Any),
            rec_rc.opcodes(&ROTATE_CL).rrr(rrr).rex().w(),
        );
        e.enc64(
            inst.bind(I32).bind(Any),
            rec_rc.opcodes(&ROTATE_CL).rrr(rrr).rex(),
        );
        e.enc64(
            inst.bind(I32).bind(Any),
            rec_rc.opcodes(&ROTATE_CL).rrr(rrr),
        );
    }

    e.enc_i32_i64(rotl_imm, rec_r_ib.opcodes(&ROTATE_IMM8).rrr(0));
    e.enc_i32_i64(rotr_imm, rec_r_ib.opcodes(&ROTATE_IMM8).rrr(1));
    e.enc_i32_i64(ishl_imm, rec_r_ib.opcodes(&ROTATE_IMM8).rrr(4));
    e.enc_i32_i64(ushr_imm, rec_r_ib.opcodes(&ROTATE_IMM8).rrr(5));
    e.enc_i32_i64(sshr_imm, rec_r_ib.opcodes(&ROTATE_IMM8).rrr(7));
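    // All of these share the group-2 opcode byte (hence the shared ROTATE_IMM8 table);
    // the `rrr` field selects the operation: /0 rol, /1 ror, /4 shl, /5 shr, /7 sar.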

    // Population count.
    e.enc32_isap(popcnt.bind(I32), rec_urm.opcodes(&POPCNT), use_popcnt);
    e.enc64_isap(
        popcnt.bind(I64),
        rec_urm.opcodes(&POPCNT).rex().w(),
        use_popcnt,
    );
    e.enc64_isap(popcnt.bind(I32), rec_urm.opcodes(&POPCNT).rex(), use_popcnt);
    e.enc64_isap(popcnt.bind(I32), rec_urm.opcodes(&POPCNT), use_popcnt);

    // Count leading zero bits.
    e.enc32_isap(clz.bind(I32), rec_urm.opcodes(&LZCNT), use_lzcnt);
    e.enc64_isap(clz.bind(I64), rec_urm.opcodes(&LZCNT).rex().w(), use_lzcnt);
    e.enc64_isap(clz.bind(I32), rec_urm.opcodes(&LZCNT).rex(), use_lzcnt);
    e.enc64_isap(clz.bind(I32), rec_urm.opcodes(&LZCNT), use_lzcnt);

    // Count trailing zero bits.
    e.enc32_isap(ctz.bind(I32), rec_urm.opcodes(&TZCNT), use_bmi1);
    e.enc64_isap(ctz.bind(I64), rec_urm.opcodes(&TZCNT).rex().w(), use_bmi1);
    e.enc64_isap(ctz.bind(I32), rec_urm.opcodes(&TZCNT).rex(), use_bmi1);
    e.enc64_isap(ctz.bind(I32), rec_urm.opcodes(&TZCNT), use_bmi1);

    // Bit scan forward and reverse.
    e.enc_i32_i64(x86_bsf, rec_bsf_and_bsr.opcodes(&BIT_SCAN_FORWARD));
    e.enc_i32_i64(x86_bsr, rec_bsf_and_bsr.opcodes(&BIT_SCAN_REVERSE));

    // Comparisons
    e.enc_i32_i64(icmp, rec_icscc.opcodes(&CMP_REG));
    e.enc_i32_i64(icmp_imm, rec_icscc_ib.opcodes(&CMP_IMM8).rrr(7));
    e.enc_i32_i64(icmp_imm, rec_icscc_id.opcodes(&CMP_IMM).rrr(7));
    e.enc_i32_i64(ifcmp, rec_rcmp.opcodes(&CMP_REG));
    e.enc_i32_i64(ifcmp_imm, rec_rcmp_ib.opcodes(&CMP_IMM8).rrr(7));
    e.enc_i32_i64(ifcmp_imm, rec_rcmp_id.opcodes(&CMP_IMM).rrr(7));
    // TODO: We could special-case ifcmp_imm(x, 0) to TEST(x, x).

    e.enc32(ifcmp_sp.bind(I32), rec_rcmp_sp.opcodes(&CMP_REG));
    e.enc64(ifcmp_sp.bind(I64), rec_rcmp_sp.opcodes(&CMP_REG).rex().w());

    // Convert flags to bool.
    // This encodes `b1` as an 8-bit low register with the value 0 or 1.
    e.enc_both(trueif, rec_seti_abcd.opcodes(&SET_BYTE_IF_OVERFLOW));
    e.enc_both(trueff, rec_setf_abcd.opcodes(&SET_BYTE_IF_OVERFLOW));
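    // SET_BYTE_IF_OVERFLOW (SETO) is presumably just the base of the SETcc family here,
    // with the recipe folding the actual condition code into the opcode at emission time,
    // the same way CMOV_OVERFLOW below serves as the CMOVcc base.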

    // Conditional move (a.k.a. integer select).
    e.enc_i32_i64(selectif, rec_cmov.opcodes(&CMOV_OVERFLOW));
    // A Spectre-guard integer select is exactly the same as a selectif, but
    // is not associated with any other legalization rules and is not
    // recognized by any optimizations, so it must arrive here unmodified
    // and in its original place.
    e.enc_i32_i64(selectif_spectre_guard, rec_cmov.opcodes(&CMOV_OVERFLOW));
}

#[inline(never)]
#[allow(clippy::cognitive_complexity)]
fn define_simd(
    e: &mut PerCpuModeEncodings,
    shared_defs: &SharedDefinitions,
    settings: &SettingGroup,
    x86: &InstructionGroup,
    r: &RecipeGroup,
) {
    let shared = &shared_defs.instructions;
    let formats = &shared_defs.formats;

    // Shorthands for instructions.
    let avg_round = shared.by_name("avg_round");
    let bitcast = shared.by_name("bitcast");
    let bor = shared.by_name("bor");
    let bxor = shared.by_name("bxor");
    let copy = shared.by_name("copy");
    let copy_nop = shared.by_name("copy_nop");
    let copy_to_ssa = shared.by_name("copy_to_ssa");
    let fadd = shared.by_name("fadd");
    let fcmp = shared.by_name("fcmp");
    let fcvt_from_sint = shared.by_name("fcvt_from_sint");
    let fdiv = shared.by_name("fdiv");
    let fill = shared.by_name("fill");
    let fill_nop = shared.by_name("fill_nop");
    let fmul = shared.by_name("fmul");
    let fsub = shared.by_name("fsub");
    let iabs = shared.by_name("iabs");
    let iadd = shared.by_name("iadd");
    let icmp = shared.by_name("icmp");
    let imul = shared.by_name("imul");
    let ishl_imm = shared.by_name("ishl_imm");
    let load = shared.by_name("load");
    let load_complex = shared.by_name("load_complex");
    let raw_bitcast = shared.by_name("raw_bitcast");
    let regfill = shared.by_name("regfill");
    let regmove = shared.by_name("regmove");
    let regspill = shared.by_name("regspill");
    let sadd_sat = shared.by_name("sadd_sat");
    let scalar_to_vector = shared.by_name("scalar_to_vector");
    let sload8x8 = shared.by_name("sload8x8");
    let sload8x8_complex = shared.by_name("sload8x8_complex");
    let sload16x4 = shared.by_name("sload16x4");
    let sload16x4_complex = shared.by_name("sload16x4_complex");
    let sload32x2 = shared.by_name("sload32x2");
    let sload32x2_complex = shared.by_name("sload32x2_complex");
    let spill = shared.by_name("spill");
    let sqrt = shared.by_name("sqrt");
    let sshr_imm = shared.by_name("sshr_imm");
    let ssub_sat = shared.by_name("ssub_sat");
    let store = shared.by_name("store");
    let store_complex = shared.by_name("store_complex");
    let swiden_low = shared.by_name("swiden_low");
    let uadd_sat = shared.by_name("uadd_sat");
    let uload8x8 = shared.by_name("uload8x8");
    let uload8x8_complex = shared.by_name("uload8x8_complex");
    let uload16x4 = shared.by_name("uload16x4");
    let uload16x4_complex = shared.by_name("uload16x4_complex");
    let uload32x2 = shared.by_name("uload32x2");
    let uload32x2_complex = shared.by_name("uload32x2_complex");
    let snarrow = shared.by_name("snarrow");
    let unarrow = shared.by_name("unarrow");
    let uwiden_low = shared.by_name("uwiden_low");
    let ushr_imm = shared.by_name("ushr_imm");
    let usub_sat = shared.by_name("usub_sat");
    let vconst = shared.by_name("vconst");
    let vselect = shared.by_name("vselect");
    let widening_pairwise_dot_product_s = shared.by_name("widening_pairwise_dot_product_s");
    let x86_cvtt2si = x86.by_name("x86_cvtt2si");
    let x86_insertps = x86.by_name("x86_insertps");
    let x86_fmax = x86.by_name("x86_fmax");
    let x86_fmin = x86.by_name("x86_fmin");
    let x86_movlhps = x86.by_name("x86_movlhps");
    let x86_movsd = x86.by_name("x86_movsd");
    let x86_pblendw = x86.by_name("x86_pblendw");
    let x86_pextr = x86.by_name("x86_pextr");
    let x86_pinsr = x86.by_name("x86_pinsr");
    let x86_pmaxs = x86.by_name("x86_pmaxs");
    let x86_pmaxu = x86.by_name("x86_pmaxu");
    let x86_pmins = x86.by_name("x86_pmins");
    let x86_pminu = x86.by_name("x86_pminu");
    let x86_pmullq = x86.by_name("x86_pmullq");
    let x86_pmuludq = x86.by_name("x86_pmuludq");
    let x86_palignr = x86.by_name("x86_palignr");
    let x86_pshufb = x86.by_name("x86_pshufb");
    let x86_pshufd = x86.by_name("x86_pshufd");
    let x86_psll = x86.by_name("x86_psll");
    let x86_psra = x86.by_name("x86_psra");
    let x86_psrl = x86.by_name("x86_psrl");
    let x86_ptest = x86.by_name("x86_ptest");
    let x86_punpckh = x86.by_name("x86_punpckh");
    let x86_punpckl = x86.by_name("x86_punpckl");
    let x86_vcvtudq2ps = x86.by_name("x86_vcvtudq2ps");

    // Shorthands for recipes.
    let rec_blend = r.template("blend");
    let rec_evex_reg_vvvv_rm_128 = r.template("evex_reg_vvvv_rm_128");
    let rec_evex_reg_rm_128 = r.template("evex_reg_rm_128");
    let rec_f_ib = r.template("f_ib");
    let rec_fa = r.template("fa");
    let rec_fa_ib = r.template("fa_ib");
    let rec_fax = r.template("fax");
    let rec_fcmp = r.template("fcmp");
    let rec_ffillSib32 = r.template("ffillSib32");
    let rec_ffillnull = r.recipe("ffillnull");
    let rec_fld = r.template("fld");
    let rec_fldDisp32 = r.template("fldDisp32");
    let rec_fldDisp8 = r.template("fldDisp8");
    let rec_fldWithIndex = r.template("fldWithIndex");
    let rec_fldWithIndexDisp32 = r.template("fldWithIndexDisp32");
    let rec_fldWithIndexDisp8 = r.template("fldWithIndexDisp8");
    let rec_fregfill32 = r.template("fregfill32");
    let rec_fregspill32 = r.template("fregspill32");
    let rec_frmov = r.template("frmov");
    let rec_frurm = r.template("frurm");
    let rec_fspillSib32 = r.template("fspillSib32");
    let rec_fst = r.template("fst");
    let rec_fstDisp32 = r.template("fstDisp32");
    let rec_fstDisp8 = r.template("fstDisp8");
    let rec_fstWithIndex = r.template("fstWithIndex");
    let rec_fstWithIndexDisp32 = r.template("fstWithIndexDisp32");
    let rec_fstWithIndexDisp8 = r.template("fstWithIndexDisp8");
    let rec_furm = r.template("furm");
    let rec_furm_reg_to_ssa = r.template("furm_reg_to_ssa");
    let rec_icscc_fpr = r.template("icscc_fpr");
    let rec_null_fpr = r.recipe("null_fpr");
    let rec_pfcmp = r.template("pfcmp");
    let rec_r_ib_unsigned_fpr = r.template("r_ib_unsigned_fpr");
    let rec_r_ib_unsigned_gpr = r.template("r_ib_unsigned_gpr");
    let rec_r_ib_unsigned_r = r.template("r_ib_unsigned_r");
    let rec_stacknull = r.recipe("stacknull");
    let rec_vconst = r.template("vconst");
    let rec_vconst_optimized = r.template("vconst_optimized");

    // Predicates shorthands.
    settings.predicate_by_name("all_ones_funcaddrs_and_not_is_pic");
    settings.predicate_by_name("not_all_ones_funcaddrs_and_not_is_pic");
    let use_ssse3_simd = settings.predicate_by_name("use_ssse3_simd");
    let use_sse41_simd = settings.predicate_by_name("use_sse41_simd");
    let use_sse42_simd = settings.predicate_by_name("use_sse42_simd");
    let use_avx512dq_simd = settings.predicate_by_name("use_avx512dq_simd");
    let use_avx512vl_simd = settings.predicate_by_name("use_avx512vl_simd");

    // SIMD vector size: eventually multiple vector sizes may be supported but for now only
    // SSE-sized vectors are available.
    let sse_vector_size: u64 = 128;

    // SIMD splat: before x86 can use vector data, it must be moved to XMM registers; see
    // legalize.rs for how this is done; once there, x86_pshuf* (below) is used for broadcasting the
    // value across the register.

    let allowed_simd_type = |t: &LaneType| t.lane_bits() >= 8 && t.lane_bits() < 128;
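    // That is, lanes of 8 to 64 bits are allowed; e.g. `vector(I32, sse_vector_size)`
    // below denotes an I32x4 vector.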

    // PSHUFB, 8-bit shuffle using two XMM registers.
    for ty in ValueType::all_lane_types().filter(allowed_simd_type) {
        let instruction = x86_pshufb.bind(vector(ty, sse_vector_size));
        let template = rec_fa.opcodes(&PSHUFB);
        e.enc_both_inferred_maybe_isap(instruction.clone(), template.clone(), Some(use_ssse3_simd));
    }

    // PSHUFD, 32-bit shuffle using one XMM register and a u8 immediate.
    for ty in ValueType::all_lane_types().filter(|t| t.lane_bits() == 32) {
        let instruction = x86_pshufd.bind(vector(ty, sse_vector_size));
        let template = rec_r_ib_unsigned_fpr.opcodes(&PSHUFD);
        e.enc_both_inferred(instruction, template);
    }

    // SIMD vselect: the controlling value is a boolean vector, so each lane is either all ones
    // or all zeroes. That makes the byte-wise PBLENDVB (which selects on the top bit of each
    // mask byte) always correct; for 32/64-bit lanes we can also use BLENDVPS and BLENDVPD.
    for ty in ValueType::all_lane_types().filter(allowed_simd_type) {
        let opcode = match ty.lane_bits() {
            32 => &BLENDVPS,
            64 => &BLENDVPD,
            _ => &PBLENDVB,
        };
        let instruction = vselect.bind(vector(ty, sse_vector_size));
        let template = rec_blend.opcodes(opcode);
        e.enc_both_inferred_maybe_isap(instruction, template, Some(use_sse41_simd));
    }

    // PBLENDW, select lanes using a u8 immediate.
    for ty in ValueType::all_lane_types().filter(|t| t.lane_bits() == 16) {
        let instruction = x86_pblendw.bind(vector(ty, sse_vector_size));
        let template = rec_fa_ib.opcodes(&PBLENDW);
        e.enc_both_inferred_maybe_isap(instruction, template, Some(use_sse41_simd));
    }

    // SIMD scalar_to_vector; this uses MOV to copy the scalar value to an XMM register; according
    // to the Intel manual: "When the destination operand is an XMM register, the source operand is
    // written to the low doubleword of the register and the register is zero-extended to 128 bits."
    for ty in ValueType::all_lane_types().filter(allowed_simd_type) {
        let instruction = scalar_to_vector.bind(vector(ty, sse_vector_size));
        if ty.is_float() {
            // No need to move floats--they already live in XMM registers.
            e.enc_32_64_rec(instruction, rec_null_fpr, 0);
        } else {
            let template = rec_frurm.opcodes(&MOVD_LOAD_XMM);
            if ty.lane_bits() < 64 {
                e.enc_both_inferred(instruction, template);
            } else {
                // No 32-bit encodings for 64-bit widths.
                assert_eq!(ty.lane_bits(), 64);
                e.enc64(instruction, template.rex().w());
            }
        }
    }
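    // With REX.W, the MOVD above is emitted as MOVQ, moving the full 64-bit GPR into the
    // low quadword of the XMM register (the upper half is likewise zeroed).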

    // SIMD insertlane
    for ty in ValueType::all_lane_types().filter(allowed_simd_type) {
        let (opcode, isap): (&[_], _) = match ty.lane_bits() {
            8 => (&PINSRB, Some(use_sse41_simd)),
            16 => (&PINSRW, None),
            32 | 64 => (&PINSR, Some(use_sse41_simd)),
            _ => panic!("invalid size for SIMD insertlane"),
        };

        let instruction = x86_pinsr.bind(vector(ty, sse_vector_size));
        let template = rec_r_ib_unsigned_r.opcodes(opcode);
        if ty.lane_bits() < 64 {
            e.enc_both_inferred_maybe_isap(instruction, template, isap);
        } else {
            // It turns out the 64-bit widths have REX/W encodings and are only available on
            // x86_64.
            e.enc64_maybe_isap(instruction, template.rex().w(), isap);
        }
    }
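    // Note the asymmetry above: PINSRW dates back to SSE2, while PINSRB and PINSRD/PINSRQ
    // were added in SSE4.1, hence the predicate on only those forms.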

    // For legalizing insertlane with floats, INSERTPS from SSE4.1.
    {
        let instruction = x86_insertps.bind(vector(F32, sse_vector_size));
        let template = rec_fa_ib.opcodes(&INSERTPS);
        e.enc_both_inferred_maybe_isap(instruction, template, Some(use_sse41_simd));
    }

    // For legalizing insertlane with floats, MOVSD from SSE2.
    {
        let instruction = x86_movsd.bind(vector(F64, sse_vector_size));
        let template = rec_fa.opcodes(&MOVSD_LOAD);
        e.enc_both_inferred(instruction, template); // from SSE2
    }

    // For legalizing insertlane with floats, MOVLHPS from SSE.
    {
        let instruction = x86_movlhps.bind(vector(F64, sse_vector_size));
        let template = rec_fa.opcodes(&MOVLHPS);
        e.enc_both_inferred(instruction, template); // from SSE
    }

    // SIMD extractlane
    for ty in ValueType::all_lane_types().filter(allowed_simd_type) {
        let opcode = match ty.lane_bits() {
            8 => &PEXTRB,
            16 => &PEXTRW,
            32 | 64 => &PEXTR,
            _ => panic!("invalid size for SIMD extractlane"),
        };

        let instruction = x86_pextr.bind(vector(ty, sse_vector_size));
        let template = rec_r_ib_unsigned_gpr.opcodes(opcode);
        if ty.lane_bits() < 64 {
            e.enc_both_inferred_maybe_isap(instruction, template, Some(use_sse41_simd));
        } else {
            // It turns out the 64-bit widths have REX/W encodings and are only available on
            // x86_64.
            e.enc64_maybe_isap(instruction, template.rex().w(), Some(use_sse41_simd));
        }
    }

    // SIMD packing/unpacking
    for ty in ValueType::all_lane_types().filter(allowed_simd_type) {
        let (high, low) = match ty.lane_bits() {
            8 => (&PUNPCKHBW, &PUNPCKLBW),
            16 => (&PUNPCKHWD, &PUNPCKLWD),
            32 => (&PUNPCKHDQ, &PUNPCKLDQ),
            64 => (&PUNPCKHQDQ, &PUNPCKLQDQ),
            _ => panic!("invalid size for SIMD packing/unpacking"),
        };

        e.enc_both_inferred(
            x86_punpckh.bind(vector(ty, sse_vector_size)),
            rec_fa.opcodes(high),
        );
        e.enc_both_inferred(
            x86_punpckl.bind(vector(ty, sse_vector_size)),
            rec_fa.opcodes(low),
        );
    }

    // SIMD narrow/widen
    for (ty, opcodes) in &[(I16, &PACKSSWB), (I32, &PACKSSDW)] {
        let snarrow = snarrow.bind(vector(*ty, sse_vector_size));
        e.enc_both_inferred(snarrow, rec_fa.opcodes(*opcodes));
    }
    for (ty, opcodes, isap) in &[
        (I16, &PACKUSWB[..], None),
        (I32, &PACKUSDW[..], Some(use_sse41_simd)),
    ] {
        let unarrow = unarrow.bind(vector(*ty, sse_vector_size));
        e.enc_both_inferred_maybe_isap(unarrow, rec_fa.opcodes(*opcodes), *isap);
    }
    for (ty, swiden_opcode, uwiden_opcode) in &[
        (I8, &PMOVSXBW[..], &PMOVZXBW[..]),
        (I16, &PMOVSXWD[..], &PMOVZXWD[..]),
    ] {
        let isap = Some(use_sse41_simd);
        let swiden_low = swiden_low.bind(vector(*ty, sse_vector_size));
        e.enc_both_inferred_maybe_isap(swiden_low, rec_furm.opcodes(*swiden_opcode), isap);
        let uwiden_low = uwiden_low.bind(vector(*ty, sse_vector_size));
        e.enc_both_inferred_maybe_isap(uwiden_low, rec_furm.opcodes(*uwiden_opcode), isap);
    }
    for ty in &[I8, I16, I32, I64] {
        e.enc_both_inferred_maybe_isap(
            x86_palignr.bind(vector(*ty, sse_vector_size)),
            rec_fa_ib.opcodes(&PALIGNR[..]),
            Some(use_ssse3_simd),
        );
    }

    // SIMD bitcast all 128-bit vectors to each other (for legalizing splat.x16x8).
    for from_type in ValueType::all_lane_types().filter(allowed_simd_type) {
        for to_type in
            ValueType::all_lane_types().filter(|t| allowed_simd_type(t) && *t != from_type)
        {
            let instruction = raw_bitcast
                .bind(vector(to_type, sse_vector_size))
                .bind(vector(from_type, sse_vector_size));
            e.enc_32_64_rec(instruction, rec_null_fpr, 0);
        }
    }

    // SIMD raw bitcast floats to vector (and back); assumes that floats are already stored in an
    // XMM register.
    for float_type in &[F32, F64] {
        for lane_type in ValueType::all_lane_types().filter(allowed_simd_type) {
            e.enc_32_64_rec(
                raw_bitcast
                    .bind(vector(lane_type, sse_vector_size))
                    .bind(*float_type),
                rec_null_fpr,
                0,
            );
            e.enc_32_64_rec(
                raw_bitcast
                    .bind(*float_type)
                    .bind(vector(lane_type, sse_vector_size)),
                rec_null_fpr,
                0,
            );
        }
    }

    // SIMD conversions
    {
        let fcvt_from_sint_32 = fcvt_from_sint
            .bind(vector(F32, sse_vector_size))
            .bind(vector(I32, sse_vector_size));
        e.enc_both(fcvt_from_sint_32, rec_furm.opcodes(&CVTDQ2PS));

        e.enc_32_64_maybe_isap(
            x86_vcvtudq2ps,
            rec_evex_reg_rm_128.opcodes(&VCVTUDQ2PS),
            Some(use_avx512vl_simd), // TODO need an OR predicate to join with AVX512F
        );

        e.enc_both_inferred(
            x86_cvtt2si
                .bind(vector(I32, sse_vector_size))
                .bind(vector(F32, sse_vector_size)),
            rec_furm.opcodes(&CVTTPS2DQ),
        );
    }

    // SIMD vconst for special cases (all zeroes, all ones)
    // this must be encoded prior to the MOVUPS implementation (below) so the compiler sees this
    // encoding first
    for ty in ValueType::all_lane_types().filter(allowed_simd_type) {
        let instruction = vconst.bind(vector(ty, sse_vector_size));

        let is_zero_128bit =
            InstructionPredicate::new_is_all_zeroes(&*formats.unary_const, "constant_handle");
        let template = rec_vconst_optimized.opcodes(&PXOR).infer_rex();
        e.enc_32_64_func(instruction.clone(), template, |builder| {
            builder.inst_predicate(is_zero_128bit)
        });

        let is_ones_128bit =
            InstructionPredicate::new_is_all_ones(&*formats.unary_const, "constant_handle");
        let template = rec_vconst_optimized.opcodes(&PCMPEQB).infer_rex();
        e.enc_32_64_func(instruction, template, |builder| {
            builder.inst_predicate(is_ones_128bit)
        });
    }
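    // In other words, an all-zeros vconst becomes `pxor xmm, xmm` and an all-ones vconst
    // becomes `pcmpeqb xmm, xmm` (comparing a register with itself sets every bit).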

    // SIMD vconst using MOVUPS
    // TODO it would be ideal if eventually this became the more efficient MOVAPS but we would have
    // to guarantee that the constants are aligned when emitted and there is currently no mechanism
    // for that; alternately, constants could be loaded into XMM registers using a sequence like:
    // MOVQ + MOVHPD + MOVQ + MOVLPD (this allows the constants to be immediates instead of stored
    // in memory) but some performance measurements are needed.
    for ty in ValueType::all_lane_types().filter(allowed_simd_type) {
        let instruction = vconst.bind(vector(ty, sse_vector_size));
        let template = rec_vconst.opcodes(&MOVUPS_LOAD);
        e.enc_both_inferred(instruction, template); // from SSE
    }

    // SIMD register movement: store, load, spill, fill, regmove, etc. All of these use encodings of
    // MOVUPS and MOVAPS from SSE (TODO ideally all of these would either use MOVAPS when we have
    // alignment or type-specific encodings, see https://github.com/bytecodealliance/wasmtime/issues/1124).
    // Also, it would be ideal to infer REX prefixes for all of these instructions but for the
    // time being only instructions with common recipes have `infer_rex()` support.
    for ty in ValueType::all_lane_types().filter(allowed_simd_type) {
        // Store
        let bound_store = store.bind(vector(ty, sse_vector_size)).bind(Any);
        e.enc_both_inferred(bound_store.clone(), rec_fst.opcodes(&MOVUPS_STORE));
        e.enc_both_inferred(bound_store.clone(), rec_fstDisp8.opcodes(&MOVUPS_STORE));
        e.enc_both_inferred(bound_store, rec_fstDisp32.opcodes(&MOVUPS_STORE));

        // Store complex
        let bound_store_complex = store_complex.bind(vector(ty, sse_vector_size));
        e.enc_both(
            bound_store_complex.clone(),
            rec_fstWithIndex.opcodes(&MOVUPS_STORE),
        );
        e.enc_both(
            bound_store_complex.clone(),
            rec_fstWithIndexDisp8.opcodes(&MOVUPS_STORE),
        );
        e.enc_both(
            bound_store_complex,
            rec_fstWithIndexDisp32.opcodes(&MOVUPS_STORE),
        );

        // Load
        let bound_load = load.bind(vector(ty, sse_vector_size)).bind(Any);
        e.enc_both_inferred(bound_load.clone(), rec_fld.opcodes(&MOVUPS_LOAD));
        e.enc_both_inferred(bound_load.clone(), rec_fldDisp8.opcodes(&MOVUPS_LOAD));
        e.enc_both_inferred(bound_load, rec_fldDisp32.opcodes(&MOVUPS_LOAD));

        // Load complex
        let bound_load_complex = load_complex.bind(vector(ty, sse_vector_size));
        e.enc_both(
            bound_load_complex.clone(),
            rec_fldWithIndex.opcodes(&MOVUPS_LOAD),
        );
        e.enc_both(
            bound_load_complex.clone(),
            rec_fldWithIndexDisp8.opcodes(&MOVUPS_LOAD),
        );
        e.enc_both(
            bound_load_complex,
            rec_fldWithIndexDisp32.opcodes(&MOVUPS_LOAD),
        );

        // Spill
        let bound_spill = spill.bind(vector(ty, sse_vector_size));
        e.enc_both(bound_spill, rec_fspillSib32.opcodes(&MOVUPS_STORE));
        let bound_regspill = regspill.bind(vector(ty, sse_vector_size));
        e.enc_both(bound_regspill, rec_fregspill32.opcodes(&MOVUPS_STORE));

        // Fill
        let bound_fill = fill.bind(vector(ty, sse_vector_size));
        e.enc_both(bound_fill, rec_ffillSib32.opcodes(&MOVUPS_LOAD));
        let bound_regfill = regfill.bind(vector(ty, sse_vector_size));
        e.enc_both(bound_regfill, rec_fregfill32.opcodes(&MOVUPS_LOAD));
        let bound_fill_nop = fill_nop.bind(vector(ty, sse_vector_size));
        e.enc_32_64_rec(bound_fill_nop, rec_ffillnull, 0);

        // Regmove
        let bound_regmove = regmove.bind(vector(ty, sse_vector_size));
        e.enc_both(bound_regmove, rec_frmov.opcodes(&MOVAPS_LOAD));

        // Copy
        let bound_copy = copy.bind(vector(ty, sse_vector_size));
        e.enc_both(bound_copy, rec_furm.opcodes(&MOVAPS_LOAD));
        let bound_copy_to_ssa = copy_to_ssa.bind(vector(ty, sse_vector_size));
        e.enc_both(bound_copy_to_ssa, rec_furm_reg_to_ssa.opcodes(&MOVAPS_LOAD));
        let bound_copy_nop = copy_nop.bind(vector(ty, sse_vector_size));
        e.enc_32_64_rec(bound_copy_nop, rec_stacknull, 0);
    }

    // SIMD load extend
    for (inst, opcodes) in &[
        (uload8x8, &PMOVZXBW),
        (uload16x4, &PMOVZXWD),
        (uload32x2, &PMOVZXDQ),
        (sload8x8, &PMOVSXBW),
        (sload16x4, &PMOVSXWD),
        (sload32x2, &PMOVSXDQ),
    ] {
        let isap = Some(use_sse41_simd);
        for recipe in &[rec_fld, rec_fldDisp8, rec_fldDisp32] {
            let inst = *inst;
            let template = recipe.opcodes(*opcodes);
            e.enc_both_inferred_maybe_isap(inst.clone().bind(I32), template.clone(), isap);
            e.enc64_maybe_isap(inst.bind(I64), template.infer_rex(), isap);
        }
    }

    // SIMD load extend (complex addressing)
    let is_load_complex_length_two =
        InstructionPredicate::new_length_equals(&*formats.load_complex, 2);
    for (inst, opcodes) in &[
        (uload8x8_complex, &PMOVZXBW),
        (uload16x4_complex, &PMOVZXWD),
        (uload32x2_complex, &PMOVZXDQ),
        (sload8x8_complex, &PMOVSXBW),
        (sload16x4_complex, &PMOVSXWD),
        (sload32x2_complex, &PMOVSXDQ),
    ] {
        for recipe in &[
            rec_fldWithIndex,
            rec_fldWithIndexDisp8,
            rec_fldWithIndexDisp32,
        ] {
            let template = recipe.opcodes(*opcodes);
            let predicate = |encoding: EncodingBuilder| {
                encoding
                    .isa_predicate(use_sse41_simd)
                    .inst_predicate(is_load_complex_length_two.clone())
            };
            e.enc32_func(inst.clone(), template.clone(), predicate);
            // No infer_rex calculator for these recipes; place REX version first as in enc_x86_64.
            e.enc64_func(inst.clone(), template.rex(), predicate);
            e.enc64_func(inst.clone(), template, predicate);
        }
    }

    // SIMD integer addition
    for (ty, opcodes) in &[(I8, &PADDB), (I16, &PADDW), (I32, &PADDD), (I64, &PADDQ)] {
        let iadd = iadd.bind(vector(*ty, sse_vector_size));
        e.enc_both_inferred(iadd, rec_fa.opcodes(*opcodes));
    }

    // SIMD integer saturating addition
    e.enc_both_inferred(
        sadd_sat.bind(vector(I8, sse_vector_size)),
        rec_fa.opcodes(&PADDSB),
    );
    e.enc_both_inferred(
        sadd_sat.bind(vector(I16, sse_vector_size)),
        rec_fa.opcodes(&PADDSW),
    );
    e.enc_both_inferred(
        uadd_sat.bind(vector(I8, sse_vector_size)),
        rec_fa.opcodes(&PADDUSB),
    );
    e.enc_both_inferred(
        uadd_sat.bind(vector(I16, sse_vector_size)),
        rec_fa.opcodes(&PADDUSW),
    );

    // SIMD integer subtraction
    let isub = shared.by_name("isub");
    for (ty, opcodes) in &[(I8, &PSUBB), (I16, &PSUBW), (I32, &PSUBD), (I64, &PSUBQ)] {
        let isub = isub.bind(vector(*ty, sse_vector_size));
        e.enc_both_inferred(isub, rec_fa.opcodes(*opcodes));
    }

    // SIMD integer saturating subtraction
    e.enc_both_inferred(
        ssub_sat.bind(vector(I8, sse_vector_size)),
        rec_fa.opcodes(&PSUBSB),
    );
    e.enc_both_inferred(
        ssub_sat.bind(vector(I16, sse_vector_size)),
        rec_fa.opcodes(&PSUBSW),
    );
    e.enc_both_inferred(
        usub_sat.bind(vector(I8, sse_vector_size)),
        rec_fa.opcodes(&PSUBUSB),
    );
    e.enc_both_inferred(
        usub_sat.bind(vector(I16, sse_vector_size)),
        rec_fa.opcodes(&PSUBUSW),
    );

    // SIMD integer multiplication: the x86 ISA does not have instructions for multiplying I8x16
    // or I64x2, and these are (at the time of writing) not necessary for WASM SIMD.
    for (ty, opcodes, isap) in &[
        (I16, &PMULLW[..], None),
        (I32, &PMULLD[..], Some(use_sse41_simd)),
    ] {
        let imul = imul.bind(vector(*ty, sse_vector_size));
        e.enc_both_inferred_maybe_isap(imul, rec_fa.opcodes(opcodes), *isap);
    }

    // SIMD multiplication with lane expansion.
    e.enc_both_inferred(x86_pmuludq, rec_fa.opcodes(&PMULUDQ));
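    // PMULUDQ multiplies the even-numbered unsigned 32-bit lanes into full 64-bit products,
    // which is the lane expansion that `x86_pmuludq` models.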

    // SIMD multiplication and add adjacent pairs, from SSE2.
    e.enc_both_inferred(widening_pairwise_dot_product_s, rec_fa.opcodes(&PMADDWD));

    // SIMD integer multiplication for I64x2 using AVX512.
    {
        e.enc_32_64_maybe_isap(
            x86_pmullq,
            rec_evex_reg_vvvv_rm_128.opcodes(&VPMULLQ).w(),
            Some(use_avx512dq_simd), // TODO need an OR predicate to join with AVX512VL
        );
    }

    // SIMD integer average with rounding.
    for (ty, opcodes) in &[(I8, &PAVGB[..]), (I16, &PAVGW[..])] {
        let avgr = avg_round.bind(vector(*ty, sse_vector_size));
        e.enc_both_inferred(avgr, rec_fa.opcodes(opcodes));
    }

    // SIMD integer absolute value.
    for (ty, opcodes) in &[(I8, &PABSB[..]), (I16, &PABSW[..]), (I32, &PABSD)] {
        let iabs = iabs.bind(vector(*ty, sse_vector_size));
        e.enc_both_inferred_maybe_isap(iabs, rec_furm.opcodes(opcodes), Some(use_ssse3_simd));
    }

    // SIMD logical operations
    let band = shared.by_name("band");
    let band_not = shared.by_name("band_not");
    for ty in ValueType::all_lane_types().filter(allowed_simd_type) {
        // and
        let band = band.bind(vector(ty, sse_vector_size));
        e.enc_both_inferred(band, rec_fa.opcodes(&PAND));

        // and not (note flipped recipe operands to match band_not order)
        let band_not = band_not.bind(vector(ty, sse_vector_size));
        e.enc_both_inferred(band_not, rec_fax.opcodes(&PANDN));

        // or
        let bor = bor.bind(vector(ty, sse_vector_size));
        e.enc_both_inferred(bor, rec_fa.opcodes(&POR));

        // xor
        let bxor = bxor.bind(vector(ty, sse_vector_size));
        e.enc_both_inferred(bxor, rec_fa.opcodes(&PXOR));

        // ptest
        let x86_ptest = x86_ptest.bind(vector(ty, sse_vector_size));
        e.enc_both_inferred_maybe_isap(x86_ptest, rec_fcmp.opcodes(&PTEST), Some(use_sse41_simd));
    }

    // SIMD bitcast from I32/I64 to the low bits of a vector (e.g. I64x2); this register movement
    // allows SIMD shifts to be legalized more easily. TODO ideally this would be typed as an
    // I128x1 but restrictions on the type builder prevent this; the general idea here is that
    // the upper bits are all zeroed and do not form parts of any separate lane. See
    // https://github.com/bytecodealliance/wasmtime/issues/1140.
    e.enc_both_inferred(
        bitcast.bind(vector(I64, sse_vector_size)).bind(I32),
        rec_frurm.opcodes(&MOVD_LOAD_XMM),
    );
    e.enc64(
        bitcast.bind(vector(I64, sse_vector_size)).bind(I64),
        rec_frurm.opcodes(&MOVD_LOAD_XMM).rex().w(),
    );

    // SIMD shift left
    for (ty, opcodes) in &[(I16, &PSLLW), (I32, &PSLLD), (I64, &PSLLQ)] {
        let x86_psll = x86_psll.bind(vector(*ty, sse_vector_size));
        e.enc_both_inferred(x86_psll, rec_fa.opcodes(*opcodes));
    }

    // SIMD shift right (logical)
    for (ty, opcodes) in &[(I16, &PSRLW), (I32, &PSRLD), (I64, &PSRLQ)] {
        let x86_psrl = x86_psrl.bind(vector(*ty, sse_vector_size));
        e.enc_both_inferred(x86_psrl, rec_fa.opcodes(*opcodes));
    }

    // SIMD shift right (arithmetic)
    for (ty, opcodes) in &[(I16, &PSRAW), (I32, &PSRAD)] {
        let x86_psra = x86_psra.bind(vector(*ty, sse_vector_size));
        e.enc_both_inferred(x86_psra, rec_fa.opcodes(*opcodes));
    }

    // SIMD immediate shift
    for (ty, opcodes) in &[(I16, &PS_W_IMM), (I32, &PS_D_IMM), (I64, &PS_Q_IMM)] {
        let ishl_imm = ishl_imm.bind(vector(*ty, sse_vector_size));
        e.enc_both_inferred(ishl_imm, rec_f_ib.opcodes(*opcodes).rrr(6));

        let ushr_imm = ushr_imm.bind(vector(*ty, sse_vector_size));
        e.enc_both_inferred(ushr_imm, rec_f_ib.opcodes(*opcodes).rrr(2));

        // One exception: PSRAQ does not exist for I64x2 in SSE2; it requires a higher CPU
        // feature set (AVX-512's VPSRAQ).
        if *ty != I64 {
            let sshr_imm = sshr_imm.bind(vector(*ty, sse_vector_size));
            e.enc_both_inferred(sshr_imm, rec_f_ib.opcodes(*opcodes).rrr(4));
        }
    }

    // SIMD integer comparisons
    {
        use IntCC::*;
        for (ty, cc, opcodes, isa_predicate) in &[
            (I8, Equal, &PCMPEQB[..], None),
            (I16, Equal, &PCMPEQW[..], None),
            (I32, Equal, &PCMPEQD[..], None),
            (I64, Equal, &PCMPEQQ[..], Some(use_sse41_simd)),
            (I8, SignedGreaterThan, &PCMPGTB[..], None),
            (I16, SignedGreaterThan, &PCMPGTW[..], None),
            (I32, SignedGreaterThan, &PCMPGTD[..], None),
            (I64, SignedGreaterThan, &PCMPGTQ, Some(use_sse42_simd)),
        ] {
            let instruction = icmp
                .bind(Immediate::IntCC(*cc))
                .bind(vector(*ty, sse_vector_size));
            let template = rec_icscc_fpr.opcodes(opcodes);
            e.enc_both_inferred_maybe_isap(instruction, template, *isa_predicate);
        }
    }
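    // Only Equal and SignedGreaterThan have direct SSE encodings above; the other condition
    // codes are presumably reached by legalization, e.g. by swapping operands or combining
    // comparisons with PMIN/PMAX for the unsigned orderings.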

    // SIMD min/max
    for (ty, inst, opcodes, isa_predicate) in &[
        (I8, x86_pmaxs, &PMAXSB[..], Some(use_sse41_simd)),
        (I16, x86_pmaxs, &PMAXSW[..], None),
        (I32, x86_pmaxs, &PMAXSD[..], Some(use_sse41_simd)),
        (I8, x86_pmaxu, &PMAXUB[..], None),
        (I16, x86_pmaxu, &PMAXUW[..], Some(use_sse41_simd)),
        (I32, x86_pmaxu, &PMAXUD[..], Some(use_sse41_simd)),
        (I8, x86_pmins, &PMINSB[..], Some(use_sse41_simd)),
        (I16, x86_pmins, &PMINSW[..], None),
        (I32, x86_pmins, &PMINSD[..], Some(use_sse41_simd)),
        (I8, x86_pminu, &PMINUB[..], None),
        (I16, x86_pminu, &PMINUW[..], Some(use_sse41_simd)),
        (I32, x86_pminu, &PMINUD[..], Some(use_sse41_simd)),
    ] {
        let inst = inst.bind(vector(*ty, sse_vector_size));
        e.enc_both_inferred_maybe_isap(inst, rec_fa.opcodes(opcodes), *isa_predicate);
    }

    // SIMD float comparisons
    e.enc_both_inferred(
        fcmp.bind(vector(F32, sse_vector_size)),
        rec_pfcmp.opcodes(&CMPPS),
    );
    e.enc_both_inferred(
        fcmp.bind(vector(F64, sse_vector_size)),
        rec_pfcmp.opcodes(&CMPPD),
    );
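    // CMPPS/CMPPD take an 8-bit predicate immediate (0 = eq, 1 = lt, 2 = le, 3 = unord, ...);
    // the `pfcmp` recipe is assumed to derive that immediate from the FloatCC condition.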

    // SIMD float arithmetic
    for (ty, inst, opcodes) in &[
        (F32, fadd, &ADDPS[..]),
        (F64, fadd, &ADDPD[..]),
        (F32, fsub, &SUBPS[..]),
        (F64, fsub, &SUBPD[..]),
        (F32, fmul, &MULPS[..]),
        (F64, fmul, &MULPD[..]),
        (F32, fdiv, &DIVPS[..]),
        (F64, fdiv, &DIVPD[..]),
        (F32, x86_fmin, &MINPS[..]),
        (F64, x86_fmin, &MINPD[..]),
        (F32, x86_fmax, &MAXPS[..]),
        (F64, x86_fmax, &MAXPD[..]),
    ] {
        let inst = inst.bind(vector(*ty, sse_vector_size));
        e.enc_both_inferred(inst, rec_fa.opcodes(opcodes));
    }
    for (ty, inst, opcodes) in &[(F32, sqrt, &SQRTPS[..]), (F64, sqrt, &SQRTPD[..])] {
        let inst = inst.bind(vector(*ty, sse_vector_size));
        e.enc_both_inferred(inst, rec_furm.opcodes(opcodes));
    }
}

#[inline(never)]
fn define_entity_ref(
    e: &mut PerCpuModeEncodings,
    shared_defs: &SharedDefinitions,
    settings: &SettingGroup,
    r: &RecipeGroup,
) {
    let shared = &shared_defs.instructions;
    let formats = &shared_defs.formats;

    // Shorthands for instructions.
    let const_addr = shared.by_name("const_addr");
    let func_addr = shared.by_name("func_addr");
    let stack_addr = shared.by_name("stack_addr");
    let symbol_value = shared.by_name("symbol_value");

    // Shorthands for recipes.
    let rec_allones_fnaddr4 = r.template("allones_fnaddr4");
    let rec_allones_fnaddr8 = r.template("allones_fnaddr8");
    let rec_fnaddr4 = r.template("fnaddr4");
    let rec_fnaddr8 = r.template("fnaddr8");
    let rec_const_addr = r.template("const_addr");
    let rec_got_fnaddr8 = r.template("got_fnaddr8");
    let rec_got_gvaddr8 = r.template("got_gvaddr8");
    let rec_gvaddr4 = r.template("gvaddr4");
    let rec_gvaddr8 = r.template("gvaddr8");
    let rec_pcrel_fnaddr8 = r.template("pcrel_fnaddr8");
    let rec_pcrel_gvaddr8 = r.template("pcrel_gvaddr8");
    let rec_spaddr_id = r.template("spaddr_id");

    // Predicates shorthands.
    let all_ones_funcaddrs_and_not_is_pic =
        settings.predicate_by_name("all_ones_funcaddrs_and_not_is_pic");
    let is_pic = settings.predicate_by_name("is_pic");
    let not_all_ones_funcaddrs_and_not_is_pic =
        settings.predicate_by_name("not_all_ones_funcaddrs_and_not_is_pic");
    let not_is_pic = settings.predicate_by_name("not_is_pic");

    // Function addresses.

    // Non-PIC, not-all-ones funcaddresses.
    e.enc32_isap(
        func_addr.bind(I32),
        rec_fnaddr4.opcodes(&MOV_IMM),
        not_all_ones_funcaddrs_and_not_is_pic,
    );
    e.enc64_isap(
        func_addr.bind(I64),
        rec_fnaddr8.opcodes(&MOV_IMM).rex().w(),
        not_all_ones_funcaddrs_and_not_is_pic,
    );

    // Non-PIC, all-ones funcaddresses.
    e.enc32_isap(
        func_addr.bind(I32),
        rec_allones_fnaddr4.opcodes(&MOV_IMM),
        all_ones_funcaddrs_and_not_is_pic,
    );
    e.enc64_isap(
        func_addr.bind(I64),
        rec_allones_fnaddr8.opcodes(&MOV_IMM).rex().w(),
        all_ones_funcaddrs_and_not_is_pic,
    );

    // 64-bit, colocated, both PIC and non-PIC. Use the lea instruction's pc-relative field.
    let is_colocated_func =
        InstructionPredicate::new_is_colocated_func(&*formats.func_addr, "func_ref");
    e.enc64_instp(
        func_addr.bind(I64),
        rec_pcrel_fnaddr8.opcodes(&LEA).rex().w(),
        is_colocated_func,
    );
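    // This is effectively `lea rax, [rip + offset]`: a colocated function is reachable with
    // a plain pc-relative relocation rather than a GOT load.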
2457 
2458     // 64-bit, non-colocated, PIC.
2459     e.enc64_isap(
2460         func_addr.bind(I64),
2461         rec_got_fnaddr8.opcodes(&MOV_LOAD).rex().w(),
2462         is_pic,
2463     );
2464 
2465     // Global addresses.
2466 
2467     // Non-PIC.
2468     e.enc32_isap(
2469         symbol_value.bind(I32),
2470         rec_gvaddr4.opcodes(&MOV_IMM),
2471         not_is_pic,
2472     );
2473     e.enc64_isap(
2474         symbol_value.bind(I64),
2475         rec_gvaddr8.opcodes(&MOV_IMM).rex().w(),
2476         not_is_pic,
2477     );
2478 
2479     // PIC, colocated.
2480     e.enc64_func(
2481         symbol_value.bind(I64),
2482         rec_pcrel_gvaddr8.opcodes(&LEA).rex().w(),
2483         |encoding| {
2484             encoding
2485                 .isa_predicate(is_pic)
2486                 .inst_predicate(InstructionPredicate::new_is_colocated_data(formats))
2487         },
2488     );
2489 
2490     // PIC, non-colocated.
2491     e.enc64_isap(
2492         symbol_value.bind(I64),
2493         rec_got_gvaddr8.opcodes(&MOV_LOAD).rex().w(),
2494         is_pic,
2495     );
2496 
2497     // Stack addresses.
2498     //
2499     // TODO: Add encoding rules for stack_load and stack_store, so that they
2500     // don't get legalized to stack_addr + load/store.
2501     e.enc64(stack_addr.bind(I64), rec_spaddr_id.opcodes(&LEA).rex().w());
2502     e.enc32(stack_addr.bind(I32), rec_spaddr_id.opcodes(&LEA));

    // Constant addresses (PIC).
    e.enc64(const_addr.bind(I64), rec_const_addr.opcodes(&LEA).rex().w());
    e.enc32(const_addr.bind(I32), rec_const_addr.opcodes(&LEA));
}

/// Control flow opcodes.
#[inline(never)]
fn define_control_flow(
    e: &mut PerCpuModeEncodings,
    shared_defs: &SharedDefinitions,
    settings: &SettingGroup,
    r: &RecipeGroup,
) {
    let shared = &shared_defs.instructions;
    let formats = &shared_defs.formats;

    // Shorthands for instructions.
    let brff = shared.by_name("brff");
    let brif = shared.by_name("brif");
    let brnz = shared.by_name("brnz");
    let brz = shared.by_name("brz");
    let call = shared.by_name("call");
    let call_indirect = shared.by_name("call_indirect");
    let debugtrap = shared.by_name("debugtrap");
    let indirect_jump_table_br = shared.by_name("indirect_jump_table_br");
    let jump = shared.by_name("jump");
    let jump_table_base = shared.by_name("jump_table_base");
    let jump_table_entry = shared.by_name("jump_table_entry");
    let return_ = shared.by_name("return");
    let trap = shared.by_name("trap");
    let trapff = shared.by_name("trapff");
    let trapif = shared.by_name("trapif");
    let resumable_trap = shared.by_name("resumable_trap");

    // Shorthands for recipes.
    let rec_brfb = r.template("brfb");
    let rec_brfd = r.template("brfd");
    let rec_brib = r.template("brib");
    let rec_brid = r.template("brid");
    let rec_call_id = r.template("call_id");
    let rec_call_plt_id = r.template("call_plt_id");
    let rec_call_r = r.template("call_r");
    let rec_debugtrap = r.recipe("debugtrap");
    let rec_indirect_jmp = r.template("indirect_jmp");
    let rec_jmpb = r.template("jmpb");
    let rec_jmpd = r.template("jmpd");
    let rec_jt_base = r.template("jt_base");
    let rec_jt_entry = r.template("jt_entry");
    let rec_ret = r.template("ret");
    let rec_t8jccb_abcd = r.template("t8jccb_abcd");
    let rec_t8jccd_abcd = r.template("t8jccd_abcd");
    let rec_t8jccd_long = r.template("t8jccd_long");
    let rec_tjccb = r.template("tjccb");
    let rec_tjccd = r.template("tjccd");
    let rec_trap = r.template("trap");
    let rec_trapif = r.recipe("trapif");
    let rec_trapff = r.recipe("trapff");

    // Predicate shorthands.
    let is_pic = settings.predicate_by_name("is_pic");

    // Call/return

    // 32-bit, both PIC and non-PIC.
    e.enc32(call, rec_call_id.opcodes(&CALL_RELATIVE));

    // 64-bit, colocated, both PIC and non-PIC. Use the call instruction's pc-relative field.
    let is_colocated_func = InstructionPredicate::new_is_colocated_func(&*formats.call, "func_ref");
    e.enc64_instp(call, rec_call_id.opcodes(&CALL_RELATIVE), is_colocated_func);

    // 64-bit, non-colocated, PIC. There is no 64-bit non-colocated non-PIC version, since
    // non-PIC currently uses the large code model, which requires that calls be lowered to
    // func_addr + call_indirect.
    e.enc64_isap(call, rec_call_plt_id.opcodes(&CALL_RELATIVE), is_pic);
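
    // Both call recipes emit a near relative call (E8 <rel4>); the plt variant
    // aims its relocation at the callee's PLT entry instead of the function
    // symbol itself.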

    e.enc32(
        call_indirect.bind(I32),
        rec_call_r.opcodes(&JUMP_ABSOLUTE).rrr(2),
    );
    e.enc64(
        call_indirect.bind(I64),
        rec_call_r.opcodes(&JUMP_ABSOLUTE).rrr(2).rex(),
    );
    e.enc64(
        call_indirect.bind(I64),
        rec_call_r.opcodes(&JUMP_ABSOLUTE).rrr(2),
    );
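
    // JUMP_ABSOLUTE is the FF group opcode; rrr(2) fills the ModR/M reg field
    // with /2 (call r/m), so e.g. `call rax` encodes as `ff d0`. Listing both
    // REX and non-REX 64-bit forms lets the encoder pick whichever the
    // allocated register requires.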

    e.enc32(return_, rec_ret.opcodes(&RET_NEAR));
    e.enc64(return_, rec_ret.opcodes(&RET_NEAR));

    // Branches.
    e.enc32(jump, rec_jmpb.opcodes(&JUMP_SHORT));
    e.enc64(jump, rec_jmpb.opcodes(&JUMP_SHORT));
    e.enc32(jump, rec_jmpd.opcodes(&JUMP_NEAR_RELATIVE));
    e.enc64(jump, rec_jmpd.opcodes(&JUMP_NEAR_RELATIVE));
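
    // The short form (EB <rel1>) is listed before the near form (E9 <rel4>);
    // branch relaxation keeps the short encoding when the target stays within
    // rel8 range and widens it otherwise.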

    e.enc_both(brif, rec_brib.opcodes(&JUMP_SHORT_IF_OVERFLOW));
    e.enc_both(brif, rec_brid.opcodes(&JUMP_NEAR_IF_OVERFLOW));

    // Not all float condition codes are legal, see `supported_floatccs`.
    e.enc_both(brff, rec_brfb.opcodes(&JUMP_SHORT_IF_OVERFLOW));
    e.enc_both(brff, rec_brfd.opcodes(&JUMP_NEAR_IF_OVERFLOW));
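
    // The *_IF_OVERFLOW opcodes are the base Jcc encodings (70 short, 0F 80
    // near); the brib/brid and brfb/brfd recipes fold the actual condition
    // code into the low bits of the opcode at emission time.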

    // Note that the tjccd opcode will be prefixed with 0x0f by the recipe; that
    // is why TEST_BYTE_REG (0x84) and TEST_REG (0x85) double below as the
    // near-form JE/JNE opcodes (0F 84 / 0F 85).
    e.enc_i32_i64_explicit_rex(brz, rec_tjccb.opcodes(&JUMP_SHORT_IF_EQUAL));
    e.enc_i32_i64_explicit_rex(brz, rec_tjccd.opcodes(&TEST_BYTE_REG));
    e.enc_i32_i64_explicit_rex(brnz, rec_tjccb.opcodes(&JUMP_SHORT_IF_NOT_EQUAL));
    e.enc_i32_i64_explicit_rex(brnz, rec_tjccd.opcodes(&TEST_REG));
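
    // For example, `brz` on an i64 lowers to roughly `48 85 c0 0f 84 <rel4>`
    // (`test rax, rax; jz target`; register illustrative).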

    // Branch on a b1 value in a register only looks at the low 8 bits. See also
    // bint encodings below.
    //
    // Start with the worst-case encoding for X86_32 only. The register allocator
    // can't handle a branch with an ABCD-constrained operand.
    e.enc32(brz.bind(B1), rec_t8jccd_long.opcodes(&TEST_BYTE_REG));
    e.enc32(brnz.bind(B1), rec_t8jccd_long.opcodes(&TEST_REG));

    e.enc_both(brz.bind(B1), rec_t8jccb_abcd.opcodes(&JUMP_SHORT_IF_EQUAL));
    e.enc_both(brz.bind(B1), rec_t8jccd_abcd.opcodes(&TEST_BYTE_REG));
    e.enc_both(
        brnz.bind(B1),
        rec_t8jccb_abcd.opcodes(&JUMP_SHORT_IF_NOT_EQUAL),
    );
    e.enc_both(brnz.bind(B1), rec_t8jccd_abcd.opcodes(&TEST_REG));
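
    // The abcd constraint exists because, without a REX prefix, only the low
    // bytes of A, B, C, and D (al/bl/cl/dl) are addressable; with REX on
    // x86-64, any GPR's low byte can be tested.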

    // Jump tables.
    e.enc64(
        jump_table_entry.bind(I64),
        rec_jt_entry.opcodes(&MOVSXD).rex().w(),
    );
    e.enc32(jump_table_entry.bind(I32), rec_jt_entry.opcodes(&MOV_LOAD));

    e.enc64(
        jump_table_base.bind(I64),
        rec_jt_base.opcodes(&LEA).rex().w(),
    );
    e.enc32(jump_table_base.bind(I32), rec_jt_base.opcodes(&LEA));

    e.enc_x86_64(
        indirect_jump_table_br.bind(I64),
        rec_indirect_jmp.opcodes(&JUMP_ABSOLUTE).rrr(4),
    );
    e.enc32(
        indirect_jump_table_br.bind(I32),
        rec_indirect_jmp.opcodes(&JUMP_ABSOLUTE).rrr(4),
    );
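
    // Schematically, a 64-bit table dispatch comes out as something like:
    //
    //     movsxd rax, dword [rbx + rcx*4]   ; jt_entry (63 /r, sign-extends)
    //     lea    rbx, [rip + jt]            ; jt_base
    //     add    rax, rbx                   ; ordinary iadd lowering
    //     jmp    rax                        ; indirect_jmp (FF /4)
    //
    // Registers and ordering here are illustrative; the register allocator and
    // legalizer decide the real shape.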

    // Trap as ud2
    e.enc32(trap, rec_trap.opcodes(&UNDEFINED2));
    e.enc64(trap, rec_trap.opcodes(&UNDEFINED2));
    e.enc32(resumable_trap, rec_trap.opcodes(&UNDEFINED2));
    e.enc64(resumable_trap, rec_trap.opcodes(&UNDEFINED2));
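
    // UNDEFINED2 is the two-byte `ud2` opcode (0F 0B), which raises an invalid
    // opcode exception; `trap` and `resumable_trap` emit identical bytes and
    // differ only in the trap metadata recorded alongside.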

    // Debug trap as int3
    e.enc32_rec(debugtrap, rec_debugtrap, 0);
    e.enc64_rec(debugtrap, rec_debugtrap, 0);
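
    // `int3` is the single-byte breakpoint instruction (CC) that debuggers
    // conventionally trap on.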

    e.enc32_rec(trapif, rec_trapif, 0);
    e.enc64_rec(trapif, rec_trapif, 0);
    e.enc32_rec(trapff, rec_trapff, 0);
    e.enc64_rec(trapff, rec_trapff, 0);
}

/// Reference type instructions.
#[inline(never)]
fn define_reftypes(e: &mut PerCpuModeEncodings, shared_defs: &SharedDefinitions, r: &RecipeGroup) {
    let shared = &shared_defs.instructions;

    let is_null = shared.by_name("is_null");
    let is_invalid = shared.by_name("is_invalid");
    let null = shared.by_name("null");
    let safepoint = shared.by_name("safepoint");

    let rec_is_zero = r.template("is_zero");
    let rec_is_invalid = r.template("is_invalid");
    let rec_pu_id_ref = r.template("pu_id_ref");
    let rec_safepoint = r.recipe("safepoint");

    // Null references implemented as iconst 0.
    e.enc32(null.bind(R32), rec_pu_id_ref.opcodes(&MOV_IMM));

    e.enc64(null.bind(R64), rec_pu_id_ref.rex().opcodes(&MOV_IMM));
    e.enc64(null.bind(R64), rec_pu_id_ref.opcodes(&MOV_IMM));

    // is_null, implemented by testing whether the value is 0.
    e.enc_r32_r64_rex_only(is_null, rec_is_zero.opcodes(&TEST_REG));

    // is_invalid, implemented by testing whether the value is -1.
    e.enc_r32_r64_rex_only(is_invalid, rec_is_invalid.opcodes(&CMP_IMM8).rrr(7));
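
    // CMP_IMM8 with rrr(7) is the 83 /7 ib group encoding (`cmp r/m, imm8`),
    // comparing the reference against the all-ones pattern (-1) used here to
    // represent an invalid reference.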

    // The safepoint instruction emits no machine code; its recipe only records
    // a stack map of live references with the code sink.
    e.enc32_rec(safepoint, rec_safepoint, 0);
    e.enc64_rec(safepoint, rec_safepoint, 0);
}

#[allow(clippy::cognitive_complexity)]
pub(crate) fn define(
    shared_defs: &SharedDefinitions,
    settings: &SettingGroup,
    x86: &InstructionGroup,
    r: &RecipeGroup,
) -> PerCpuModeEncodings {
    // Definitions.
    let mut e = PerCpuModeEncodings::new();

    define_moves(&mut e, shared_defs, r);
    define_memory(&mut e, shared_defs, x86, r);
    define_fpu_moves(&mut e, shared_defs, r);
    define_fpu_memory(&mut e, shared_defs, r);
    define_fpu_ops(&mut e, shared_defs, settings, x86, r);
    define_alu(&mut e, shared_defs, settings, x86, r);
    define_simd(&mut e, shared_defs, settings, x86, r);
    define_entity_ref(&mut e, shared_defs, settings, r);
    define_control_flow(&mut e, shared_defs, settings, r);
    define_reftypes(&mut e, shared_defs, r);

    let x86_elf_tls_get_addr = x86.by_name("x86_elf_tls_get_addr");
    let x86_macho_tls_get_addr = x86.by_name("x86_macho_tls_get_addr");

    let rec_elf_tls_get_addr = r.recipe("elf_tls_get_addr");
    let rec_macho_tls_get_addr = r.recipe("macho_tls_get_addr");

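    // The TLS accessors are 64-bit only: their recipes expand to the
    // platform's TLS access sequence (an `__tls_get_addr`-style call on ELF, a
    // `tlv_get_addr`-style sequence on Mach-O), so no 32-bit encodings are
    // defined here.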
    e.enc64_rec(x86_elf_tls_get_addr, rec_elf_tls_get_addr, 0);
    e.enc64_rec(x86_macho_tls_get_addr, rec_macho_tls_get_addr, 0);

    e
}