#![allow(non_snake_case)]

use cranelift_codegen_shared::condcodes::IntCC;
use std::collections::HashMap;

use crate::cdsl::encodings::{Encoding, EncodingBuilder};
use crate::cdsl::instructions::{
    vector, Bindable, Immediate, InstSpec, Instruction, InstructionGroup, InstructionPredicate,
    InstructionPredicateNode, InstructionPredicateRegistry,
};
use crate::cdsl::recipes::{EncodingRecipe, EncodingRecipeNumber, Recipes};
use crate::cdsl::settings::{SettingGroup, SettingPredicateNumber};
use crate::cdsl::types::{LaneType, ValueType};
use crate::shared::types::Bool::{B1, B16, B32, B64, B8};
use crate::shared::types::Float::{F32, F64};
use crate::shared::types::Int::{I16, I32, I64, I8};
use crate::shared::types::Reference::{R32, R64};
use crate::shared::Definitions as SharedDefinitions;

use crate::isa::x86::opcodes::*;

use super::recipes::{RecipeGroup, Template};
use crate::cdsl::instructions::BindParameter::Any;

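/// Collects the encodings being defined for the two x86 CPU modes: `enc32`
/// holds the 32-bit-mode encodings and `enc64` the 64-bit-mode ones, together
/// with the recipe table and instruction-predicate registry they refer to.
/// The `define_*` functions below fill in one instance of this builder.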
pub(crate) struct PerCpuModeEncodings {
    pub enc32: Vec<Encoding>,
    pub enc64: Vec<Encoding>,
    pub recipes: Recipes,
    recipes_by_name: HashMap<String, EncodingRecipeNumber>,
    pub inst_pred_reg: InstructionPredicateRegistry,
}

impl PerCpuModeEncodings {
    fn new() -> Self {
        Self {
            enc32: Vec::new(),
            enc64: Vec::new(),
            recipes: Recipes::new(),
            recipes_by_name: HashMap::new(),
            inst_pred_reg: InstructionPredicateRegistry::new(),
        }
    }

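    /// Interns `recipe` and returns its index, reusing the index of a
    /// previously added recipe with the same name. Distinct recipes must not
    /// share a name.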
    fn add_recipe(&mut self, recipe: EncodingRecipe) -> EncodingRecipeNumber {
        if let Some(found_index) = self.recipes_by_name.get(&recipe.name) {
            assert!(
                self.recipes[*found_index] == recipe,
                "trying to insert different recipes with the same name ({})",
                recipe.name
            );
            *found_index
        } else {
            let recipe_name = recipe.name.clone();
            let index = self.recipes.push(recipe);
            self.recipes_by_name.insert(recipe_name, index);
            index
        }
    }

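    /// Builds an `Encoding` from an instruction spec and a recipe template:
    /// the template is split into its recipe and encoding bits, the recipe is
    /// interned, and `builder_closure` may attach ISA or instruction
    /// predicates before the encoding is finalized.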
    fn make_encoding<T>(
        &mut self,
        inst: InstSpec,
        template: Template,
        builder_closure: T,
    ) -> Encoding
    where
        T: FnOnce(EncodingBuilder) -> EncodingBuilder,
    {
        let (recipe, bits) = template.build();
        let recipe_number = self.add_recipe(recipe);
        let builder = EncodingBuilder::new(inst, recipe_number, bits);
        builder_closure(builder).build(&self.recipes, &mut self.inst_pred_reg)
    }

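    /// The `enc32*` helpers add an encoding to the 32-bit mode (`enc32`)
    /// table. The `_isap` and `_instp` variants additionally guard the
    /// encoding with an ISA settings predicate or an instruction predicate,
    /// and `enc32_rec` takes a bare recipe plus encoding bits instead of a
    /// template. For illustration, a 32-bit `iconst` encoding (as defined
    /// further below) is added with:
    ///
    /// ```ignore
    /// e.enc32(iconst.bind(I32), rec_pu_id.opcodes(&MOV_IMM));
    /// ```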
    fn enc32_func<T>(&mut self, inst: impl Into<InstSpec>, template: Template, builder_closure: T)
    where
        T: FnOnce(EncodingBuilder) -> EncodingBuilder,
    {
        let encoding = self.make_encoding(inst.into(), template, builder_closure);
        self.enc32.push(encoding);
    }
    fn enc32(&mut self, inst: impl Into<InstSpec>, template: Template) {
        self.enc32_func(inst, template, |x| x);
    }
    fn enc32_isap(
        &mut self,
        inst: impl Into<InstSpec>,
        template: Template,
        isap: SettingPredicateNumber,
    ) {
        self.enc32_func(inst, template, |encoding| encoding.isa_predicate(isap));
    }
    fn enc32_instp(
        &mut self,
        inst: impl Into<InstSpec>,
        template: Template,
        instp: InstructionPredicateNode,
    ) {
        self.enc32_func(inst, template, |encoding| encoding.inst_predicate(instp));
    }
    fn enc32_rec(&mut self, inst: impl Into<InstSpec>, recipe: &EncodingRecipe, bits: u16) {
        let recipe_number = self.add_recipe(recipe.clone());
        let builder = EncodingBuilder::new(inst.into(), recipe_number, bits);
        let encoding = builder.build(&self.recipes, &mut self.inst_pred_reg);
        self.enc32.push(encoding);
    }

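    /// The `enc64*` helpers mirror the `enc32*` family above, adding the
    /// encoding to the 64-bit mode (`enc64`) table instead.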
    fn enc64_func<T>(&mut self, inst: impl Into<InstSpec>, template: Template, builder_closure: T)
    where
        T: FnOnce(EncodingBuilder) -> EncodingBuilder,
    {
        let encoding = self.make_encoding(inst.into(), template, builder_closure);
        self.enc64.push(encoding);
    }
    fn enc64(&mut self, inst: impl Into<InstSpec>, template: Template) {
        self.enc64_func(inst, template, |x| x);
    }
    fn enc64_isap(
        &mut self,
        inst: impl Into<InstSpec>,
        template: Template,
        isap: SettingPredicateNumber,
    ) {
        self.enc64_func(inst, template, |encoding| encoding.isa_predicate(isap));
    }
    fn enc64_instp(
        &mut self,
        inst: impl Into<InstSpec>,
        template: Template,
        instp: InstructionPredicateNode,
    ) {
        self.enc64_func(inst, template, |encoding| encoding.inst_predicate(instp));
    }
    fn enc64_rec(&mut self, inst: impl Into<InstSpec>, recipe: &EncodingRecipe, bits: u16) {
        let recipe_number = self.add_recipe(recipe.clone());
        let builder = EncodingBuilder::new(inst.into(), recipe_number, bits);
        let encoding = builder.build(&self.recipes, &mut self.inst_pred_reg);
        self.enc64.push(encoding);
    }

    /// Adds I32/I64 encodings as appropriate for a typed instruction.
    /// The REX prefix is always inferred at runtime.
    ///
    /// Add encodings for `inst.i32` to X86_32.
    /// Add encodings for `inst.i32` to X86_64 with optional, inferred REX.
    /// Add encodings for `inst.i64` to X86_64 with a REX.W prefix.
    fn enc_i32_i64(&mut self, inst: impl Into<InstSpec>, template: Template) {
        let inst: InstSpec = inst.into();

        // I32 on x86: no REX prefix.
        self.enc32(inst.bind(I32), template.infer_rex());

        // I32 on x86_64: REX.W unset; REX.RXB determined at runtime from registers.
        self.enc64(inst.bind(I32), template.infer_rex());

        // I64 on x86_64: REX.W set; REX.RXB determined at runtime from registers.
        self.enc64(inst.bind(I64), template.rex().w());
    }

    /// Adds I32/I64 encodings as appropriate for a typed instruction.
    /// All variants of the REX prefix are explicitly emitted, not inferred.
    ///
    /// Add encodings for `inst.i32` to X86_32.
    /// Add encodings for `inst.i32` to X86_64 with and without REX.
    /// Add encodings for `inst.i64` to X86_64 with a REX.W prefix.
    fn enc_i32_i64_explicit_rex(&mut self, inst: impl Into<InstSpec>, template: Template) {
        let inst: InstSpec = inst.into();
        self.enc32(inst.bind(I32), template.nonrex());

        // REX-less encoding must come after REX encoding so we don't use it by default.
        // Otherwise reg-alloc would never use r8 and up.
        self.enc64(inst.bind(I32), template.rex());
        self.enc64(inst.bind(I32), template.nonrex());
        self.enc64(inst.bind(I64), template.rex().w());
    }

    /// Adds B32/B64 encodings as appropriate for a typed instruction.
    /// The REX prefix is always inferred at runtime.
    ///
    /// Adds encoding for `inst.b32` to X86_32.
    /// Adds encoding for `inst.b32` to X86_64 with optional, inferred REX.
    /// Adds encoding for `inst.b64` to X86_64 with a REX.W prefix.
    fn enc_b32_b64(&mut self, inst: impl Into<InstSpec>, template: Template) {
        let inst: InstSpec = inst.into();

        // B32 on x86: no REX prefix.
        self.enc32(inst.bind(B32), template.infer_rex());

        // B32 on x86_64: REX.W unset; REX.RXB determined at runtime from registers.
        self.enc64(inst.bind(B32), template.infer_rex());

        // B64 on x86_64: REX.W set; REX.RXB determined at runtime from registers.
        self.enc64(inst.bind(B64), template.rex().w());
    }

    /// Add encodings for `inst.i32` to X86_32.
    /// Add encodings for `inst.i32` to X86_64 with a REX prefix.
    /// Add encodings for `inst.i64` to X86_64 with a REX.W prefix.
    fn enc_i32_i64_rex_only(&mut self, inst: impl Into<InstSpec>, template: Template) {
        let inst: InstSpec = inst.into();
        self.enc32(inst.bind(I32), template.nonrex());
        self.enc64(inst.bind(I32), template.rex());
        self.enc64(inst.bind(I64), template.rex().w());
    }

    /// Add encodings for `inst.i32` to X86_32.
    /// Add encodings for `inst.i32` to X86_64 with and without REX.
    /// Add encodings for `inst.i64` to X86_64 with a REX.W prefix.
    fn enc_i32_i64_instp(
        &mut self,
        inst: &Instruction,
        template: Template,
        instp: InstructionPredicateNode,
    ) {
        self.enc32_func(inst.bind(I32), template.nonrex(), |builder| {
            builder.inst_predicate(instp.clone())
        });

        // REX-less encoding must come after REX encoding so we don't use it by default. Otherwise
        // reg-alloc would never use r8 and up.
        self.enc64_func(inst.bind(I32), template.rex(), |builder| {
            builder.inst_predicate(instp.clone())
        });
        self.enc64_func(inst.bind(I32), template.nonrex(), |builder| {
            builder.inst_predicate(instp.clone())
        });
        self.enc64_func(inst.bind(I64), template.rex().w(), |builder| {
            builder.inst_predicate(instp)
        });
    }

    /// Add encodings for `inst.r32` to X86_32.
    /// Add encodings for `inst.r32` to X86_64 with and without REX.
    /// Add encodings for `inst.r64` to X86_64 with a REX.W prefix.
    fn enc_r32_r64_instp(
        &mut self,
        inst: &Instruction,
        template: Template,
        instp: InstructionPredicateNode,
    ) {
        self.enc32_func(inst.bind(R32), template.nonrex(), |builder| {
            builder.inst_predicate(instp.clone())
        });

        // REX-less encoding must come after REX encoding so we don't use it by default. Otherwise
        // reg-alloc would never use r8 and up.
        self.enc64_func(inst.bind(R32), template.rex(), |builder| {
            builder.inst_predicate(instp.clone())
        });
        self.enc64_func(inst.bind(R32), template.nonrex(), |builder| {
            builder.inst_predicate(instp.clone())
        });
        self.enc64_func(inst.bind(R64), template.rex().w(), |builder| {
            builder.inst_predicate(instp)
        });
    }

    /// Add encodings for `inst.r32` to X86_32.
    /// Add encodings for `inst.r64` to X86_64 with a REX.W prefix.
    fn enc_r32_r64_rex_only(&mut self, inst: impl Into<InstSpec>, template: Template) {
        let inst: InstSpec = inst.into();
        self.enc32(inst.bind(R32), template.nonrex());
        self.enc64(inst.bind(R64), template.rex().w());
    }

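    /// Adds load/store encodings for `inst.r32` to X86_32, and for `inst.r32`
    /// and `inst.r64` to X86_64, mirroring `enc_i32_i64_ld_st` below; `w_bit`
    /// selects whether the R64 form requires REX.W or only an optional REX
    /// prefix.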
    fn enc_r32_r64_ld_st(&mut self, inst: &Instruction, w_bit: bool, template: Template) {
        self.enc32(inst.clone().bind(R32).bind(Any), template.clone());

        // REX-less encoding must come after REX encoding so we don't use it by
        // default. Otherwise reg-alloc would never use r8 and up.
        self.enc64(inst.clone().bind(R32).bind(Any), template.clone().rex());
        self.enc64(inst.clone().bind(R32).bind(Any), template.clone());

        if w_bit {
            self.enc64(inst.clone().bind(R64).bind(Any), template.rex().w());
        } else {
            self.enc64(inst.clone().bind(R64).bind(Any), template.clone().rex());
            self.enc64(inst.clone().bind(R64).bind(Any), template);
        }
    }

    /// Add encodings for `inst` to X86_64 with and without a REX prefix.
    fn enc_x86_64(&mut self, inst: impl Into<InstSpec> + Clone, template: Template) {
        // See above comment about the ordering of rex vs non-rex encodings.
        self.enc64(inst.clone(), template.rex());
        self.enc64(inst, template);
    }

    /// Add encodings for `inst` to X86_64 with and without a REX prefix.
    fn enc_x86_64_instp(
        &mut self,
        inst: impl Clone + Into<InstSpec>,
        template: Template,
        instp: InstructionPredicateNode,
    ) {
        // See above comment about the ordering of rex vs non-rex encodings.
        self.enc64_func(inst.clone(), template.rex(), |builder| {
            builder.inst_predicate(instp.clone())
        });
        self.enc64_func(inst, template, |builder| builder.inst_predicate(instp));
    }
    fn enc_x86_64_isap(
        &mut self,
        inst: impl Clone + Into<InstSpec>,
        template: Template,
        isap: SettingPredicateNumber,
    ) {
        // See above comment about the ordering of rex vs non-rex encodings.
        self.enc64_isap(inst.clone(), template.rex(), isap);
        self.enc64_isap(inst, template, isap);
    }

    /// Add all three encodings for `inst`:
    /// - X86_32
    /// - X86_64 with and without the REX prefix.
    fn enc_both(&mut self, inst: impl Clone + Into<InstSpec>, template: Template) {
        self.enc32(inst.clone(), template.clone());
        self.enc_x86_64(inst, template);
    }
    fn enc_both_isap(
        &mut self,
        inst: impl Clone + Into<InstSpec>,
        template: Template,
        isap: SettingPredicateNumber,
    ) {
        self.enc32_isap(inst.clone(), template.clone(), isap);
        self.enc_x86_64_isap(inst, template, isap);
    }
    fn enc_both_instp(
        &mut self,
        inst: impl Clone + Into<InstSpec>,
        template: Template,
        instp: InstructionPredicateNode,
    ) {
        self.enc32_instp(inst.clone(), template.clone(), instp.clone());
        self.enc_x86_64_instp(inst, template, instp);
    }

    /// Add two encodings for `inst`:
    /// - X86_32, no REX prefix, since this is not valid in 32-bit mode.
    /// - X86_64, dynamically infer the REX prefix.
    fn enc_both_inferred(&mut self, inst: impl Clone + Into<InstSpec>, template: Template) {
        self.enc32(inst.clone(), template.clone());
        self.enc64(inst, template.infer_rex());
    }
    fn enc_both_inferred_maybe_isap(
        &mut self,
        inst: impl Clone + Into<InstSpec>,
        template: Template,
        isap: Option<SettingPredicateNumber>,
    ) {
        self.enc32_maybe_isap(inst.clone(), template.clone(), isap);
        self.enc64_maybe_isap(inst, template.infer_rex(), isap);
    }

    /// Add two encodings for `inst`:
    /// - X86_32
    /// - X86_64 with the REX prefix.
    fn enc_both_rex_only(&mut self, inst: impl Clone + Into<InstSpec>, template: Template) {
        self.enc32(inst.clone(), template.clone());
        self.enc64(inst, template.rex());
    }

    /// Add encodings for `inst.i32` to X86_32.
    /// Add encodings for `inst.i32` to X86_64 with and without REX.
    /// Add encodings for `inst.i64` to X86_64 with a REX prefix, using the `w_bit`
    /// argument to determine whether or not to set the REX.W bit.
    fn enc_i32_i64_ld_st(&mut self, inst: &Instruction, w_bit: bool, template: Template) {
        self.enc32(inst.clone().bind(I32).bind(Any), template.clone());

        // REX-less encoding must come after REX encoding so we don't use it by
        // default. Otherwise reg-alloc would never use r8 and up.
        self.enc64(inst.clone().bind(I32).bind(Any), template.clone().rex());
        self.enc64(inst.clone().bind(I32).bind(Any), template.clone());

        if w_bit {
            self.enc64(inst.clone().bind(I64).bind(Any), template.rex().w());
        } else {
            self.enc64(inst.clone().bind(I64).bind(Any), template.clone().rex());
            self.enc64(inst.clone().bind(I64).bind(Any), template);
        }
    }

    /// Add the same encoding/recipe pairing to both X86_32 and X86_64.
    fn enc_32_64_rec(
        &mut self,
        inst: impl Clone + Into<InstSpec>,
        recipe: &EncodingRecipe,
        bits: u16,
    ) {
        self.enc32_rec(inst.clone(), recipe, bits);
        self.enc64_rec(inst, recipe, bits);
    }

    /// Add the same encoding to both X86_32 and X86_64; assumes configuration (e.g. REX, operand
    /// binding) has already happened.
    fn enc_32_64_func<T>(
        &mut self,
        inst: impl Clone + Into<InstSpec>,
        template: Template,
        builder_closure: T,
    ) where
        T: FnOnce(EncodingBuilder) -> EncodingBuilder,
    {
        let encoding = self.make_encoding(inst.into(), template, builder_closure);
        self.enc32.push(encoding.clone());
        self.enc64.push(encoding);
    }

    /// Add the same encoding to both X86_32 and X86_64; assumes configuration (e.g. REX, operand
    /// binding) has already happened.
    fn enc_32_64_maybe_isap(
        &mut self,
        inst: impl Clone + Into<InstSpec>,
        template: Template,
        isap: Option<SettingPredicateNumber>,
    ) {
        self.enc32_maybe_isap(inst.clone(), template.clone(), isap);
        self.enc64_maybe_isap(inst, template, isap);
    }

    fn enc32_maybe_isap(
        &mut self,
        inst: impl Into<InstSpec>,
        template: Template,
        isap: Option<SettingPredicateNumber>,
    ) {
        match isap {
            None => self.enc32(inst, template),
            Some(isap) => self.enc32_isap(inst, template, isap),
        }
    }

    fn enc64_maybe_isap(
        &mut self,
        inst: impl Into<InstSpec>,
        template: Template,
        isap: Option<SettingPredicateNumber>,
    ) {
        match isap {
            None => self.enc64(inst, template),
            Some(isap) => self.enc64_isap(inst, template, isap),
        }
    }
}

// Definitions.

#[inline(never)]
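/// Defines encodings for moves and constants: `copy`/`regmove`,
/// `iconst`/`bconst`, bool-to-int conversion, integer reduction and
/// sign/zero extension, the pinned register, `copy_special` and `copy_to_ssa`.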
fn define_moves(e: &mut PerCpuModeEncodings, shared_defs: &SharedDefinitions, r: &RecipeGroup) {
    let shared = &shared_defs.instructions;
    let formats = &shared_defs.formats;

    // Shorthands for instructions.
    let bconst = shared.by_name("bconst");
    let bint = shared.by_name("bint");
    let copy = shared.by_name("copy");
    let copy_special = shared.by_name("copy_special");
    let copy_to_ssa = shared.by_name("copy_to_ssa");
    let get_pinned_reg = shared.by_name("get_pinned_reg");
    let iconst = shared.by_name("iconst");
    let ireduce = shared.by_name("ireduce");
    let regmove = shared.by_name("regmove");
    let sextend = shared.by_name("sextend");
    let set_pinned_reg = shared.by_name("set_pinned_reg");
    let uextend = shared.by_name("uextend");
    let dummy_sarg_t = shared.by_name("dummy_sarg_t");

    // Shorthands for recipes.
    let rec_copysp = r.template("copysp");
    let rec_furm_reg_to_ssa = r.template("furm_reg_to_ssa");
    let rec_get_pinned_reg = r.recipe("get_pinned_reg");
    let rec_null = r.recipe("null");
    let rec_pu_id = r.template("pu_id");
    let rec_pu_id_bool = r.template("pu_id_bool");
    let rec_pu_iq = r.template("pu_iq");
    let rec_rmov = r.template("rmov");
    let rec_set_pinned_reg = r.template("set_pinned_reg");
    let rec_u_id = r.template("u_id");
    let rec_u_id_z = r.template("u_id_z");
    let rec_umr = r.template("umr");
    let rec_umr_reg_to_ssa = r.template("umr_reg_to_ssa");
    let rec_urm_noflags = r.template("urm_noflags");
    let rec_urm_noflags_abcd = r.template("urm_noflags_abcd");
    let rec_dummy_sarg_t = r.recipe("dummy_sarg_t");

    // The pinned register is fixed to a value that is entirely user-controlled, so reading it
    // generates no code!
    e.enc64_rec(get_pinned_reg.bind(I64), rec_get_pinned_reg, 0);
    e.enc_x86_64(
        set_pinned_reg.bind(I64),
        rec_set_pinned_reg.opcodes(&MOV_STORE).rex().w(),
    );

    e.enc_i32_i64(copy, rec_umr.opcodes(&MOV_STORE));
    e.enc_r32_r64_rex_only(copy, rec_umr.opcodes(&MOV_STORE));
    e.enc_both(copy.bind(B1), rec_umr.opcodes(&MOV_STORE));
    e.enc_both(copy.bind(I8), rec_umr.opcodes(&MOV_STORE));
    e.enc_both(copy.bind(I16), rec_umr.opcodes(&MOV_STORE));

    // TODO For x86-64, only define REX forms for now, since we can't describe the
    // special regunit immediate operands with the current constraint language.
    for &ty in &[I8, I16, I32] {
        e.enc32(regmove.bind(ty), rec_rmov.opcodes(&MOV_STORE));
        e.enc64(regmove.bind(ty), rec_rmov.opcodes(&MOV_STORE).rex());
    }
    for &ty in &[B8, B16, B32] {
        e.enc32(regmove.bind(ty), rec_rmov.opcodes(&MOV_STORE));
        e.enc64(regmove.bind(ty), rec_rmov.opcodes(&MOV_STORE).rex());
    }
    e.enc64(regmove.bind(I64), rec_rmov.opcodes(&MOV_STORE).rex().w());
    e.enc_both(regmove.bind(B1), rec_rmov.opcodes(&MOV_STORE));
    e.enc_both(regmove.bind(I8), rec_rmov.opcodes(&MOV_STORE));
    e.enc32(regmove.bind(R32), rec_rmov.opcodes(&MOV_STORE));
    e.enc64(regmove.bind(R32), rec_rmov.opcodes(&MOV_STORE).rex());
    e.enc64(regmove.bind(R64), rec_rmov.opcodes(&MOV_STORE).rex().w());

    // Immediate constants.
    e.enc32(iconst.bind(I32), rec_pu_id.opcodes(&MOV_IMM));

    e.enc64(iconst.bind(I32), rec_pu_id.rex().opcodes(&MOV_IMM));
    e.enc64(iconst.bind(I32), rec_pu_id.opcodes(&MOV_IMM));

    // The 32-bit immediate movl also zero-extends to 64 bits.
    let is_unsigned_int32 =
        InstructionPredicate::new_is_unsigned_int(&*formats.unary_imm, "imm", 32, 0);

    e.enc64_func(
        iconst.bind(I64),
        rec_pu_id.opcodes(&MOV_IMM).rex(),
        |encoding| encoding.inst_predicate(is_unsigned_int32.clone()),
    );
    e.enc64_func(iconst.bind(I64), rec_pu_id.opcodes(&MOV_IMM), |encoding| {
        encoding.inst_predicate(is_unsigned_int32)
    });

    // Sign-extended 32-bit immediate.
    e.enc64(
        iconst.bind(I64),
        rec_u_id.rex().opcodes(&MOV_IMM_SIGNEXTEND).rrr(0).w(),
    );

    // Finally, the MOV_IMM opcode takes an 8-byte immediate with a REX.W prefix.
    e.enc64(iconst.bind(I64), rec_pu_iq.opcodes(&MOV_IMM).rex().w());

    // Bool constants (uses MOV).
    for &ty in &[B1, B8, B16, B32] {
        e.enc_both(bconst.bind(ty), rec_pu_id_bool.opcodes(&MOV_IMM));
    }
    e.enc64(bconst.bind(B64), rec_pu_id_bool.opcodes(&MOV_IMM).rex());

    // You may expect that i8 encodings would use 0x30 (XORB) to indicate that encodings should be
    // on 8-bit operands (f.ex "xor %al, %al"). Cranelift currently does not know when it can
    // safely drop the 0x66 prefix, so we explicitly select a wider but permissible opcode.
    let is_zero_int = InstructionPredicate::new_is_zero_int(&formats.unary_imm, "imm");
    e.enc_both_instp(
        iconst.bind(I8),
        rec_u_id_z.opcodes(&XOR),
        is_zero_int.clone(),
    );

    // You may expect that i16 encodings would have an 0x66 prefix on the opcode to indicate that
    // encodings should be on 16-bit operands (f.ex, "xor %ax, %ax"). Cranelift currently does not
    // know that it can drop the 0x66 prefix and clear the upper half of a 32-bit register in these
    // scenarios, so we explicitly select a wider but permissible opcode.
    //
    // This effectively formalizes the i16->i32 widening that Cranelift performs when there isn't
    // an appropriate i16 encoding available.
    e.enc_both_instp(
        iconst.bind(I16),
        rec_u_id_z.opcodes(&XOR),
        is_zero_int.clone(),
    );
    e.enc_both_instp(
        iconst.bind(I32),
        rec_u_id_z.opcodes(&XOR),
        is_zero_int.clone(),
    );
    e.enc_x86_64_instp(iconst.bind(I64), rec_u_id_z.opcodes(&XOR), is_zero_int);

    // Numerical conversions.

    // Reducing an integer is a no-op.
    e.enc32_rec(ireduce.bind(I8).bind(I16), rec_null, 0);
    e.enc32_rec(ireduce.bind(I8).bind(I32), rec_null, 0);
    e.enc32_rec(ireduce.bind(I16).bind(I32), rec_null, 0);

    e.enc64_rec(ireduce.bind(I8).bind(I16), rec_null, 0);
    e.enc64_rec(ireduce.bind(I8).bind(I32), rec_null, 0);
    e.enc64_rec(ireduce.bind(I16).bind(I32), rec_null, 0);
    e.enc64_rec(ireduce.bind(I8).bind(I64), rec_null, 0);
    e.enc64_rec(ireduce.bind(I16).bind(I64), rec_null, 0);
    e.enc64_rec(ireduce.bind(I32).bind(I64), rec_null, 0);

    // TODO: Add encodings for cbw, cwde, cdqe, which are sign-extending
    // instructions for %al/%ax/%eax to %ax/%eax/%rax.

    // movsbl
    e.enc32(
        sextend.bind(I32).bind(I8),
        rec_urm_noflags_abcd.opcodes(&MOVSX_BYTE),
    );
    e.enc64(
        sextend.bind(I32).bind(I8),
        rec_urm_noflags.opcodes(&MOVSX_BYTE).rex(),
    );
    e.enc64(
        sextend.bind(I32).bind(I8),
        rec_urm_noflags_abcd.opcodes(&MOVSX_BYTE),
    );

    // movswl
    e.enc32(
        sextend.bind(I32).bind(I16),
        rec_urm_noflags.opcodes(&MOVSX_WORD),
    );
    e.enc64(
        sextend.bind(I32).bind(I16),
        rec_urm_noflags.opcodes(&MOVSX_WORD).rex(),
    );
    e.enc64(
        sextend.bind(I32).bind(I16),
        rec_urm_noflags.opcodes(&MOVSX_WORD),
    );

    // movsbq
    e.enc64(
        sextend.bind(I64).bind(I8),
        rec_urm_noflags.opcodes(&MOVSX_BYTE).rex().w(),
    );

    // movswq
    e.enc64(
        sextend.bind(I64).bind(I16),
        rec_urm_noflags.opcodes(&MOVSX_WORD).rex().w(),
    );

    // movslq
    e.enc64(
        sextend.bind(I64).bind(I32),
        rec_urm_noflags.opcodes(&MOVSXD).rex().w(),
    );

    // movzbl
    e.enc32(
        uextend.bind(I32).bind(I8),
        rec_urm_noflags_abcd.opcodes(&MOVZX_BYTE),
    );
    e.enc64(
        uextend.bind(I32).bind(I8),
        rec_urm_noflags.opcodes(&MOVZX_BYTE).rex(),
    );
    e.enc64(
        uextend.bind(I32).bind(I8),
        rec_urm_noflags_abcd.opcodes(&MOVZX_BYTE),
    );

    // movzwl
    e.enc32(
        uextend.bind(I32).bind(I16),
        rec_urm_noflags.opcodes(&MOVZX_WORD),
    );
    e.enc64(
        uextend.bind(I32).bind(I16),
        rec_urm_noflags.opcodes(&MOVZX_WORD).rex(),
    );
    e.enc64(
        uextend.bind(I32).bind(I16),
        rec_urm_noflags.opcodes(&MOVZX_WORD),
    );

    // movzbq, encoded as movzbl because it's equivalent and shorter.
    e.enc64(
        uextend.bind(I64).bind(I8),
        rec_urm_noflags.opcodes(&MOVZX_BYTE).rex(),
    );
    e.enc64(
        uextend.bind(I64).bind(I8),
        rec_urm_noflags_abcd.opcodes(&MOVZX_BYTE),
    );

    // movzwq, encoded as movzwl because it's equivalent and shorter.
    e.enc64(
        uextend.bind(I64).bind(I16),
        rec_urm_noflags.opcodes(&MOVZX_WORD).rex(),
    );
    e.enc64(
        uextend.bind(I64).bind(I16),
        rec_urm_noflags.opcodes(&MOVZX_WORD),
    );

    // A 32-bit register copy clears the high 32 bits.
    e.enc64(
        uextend.bind(I64).bind(I32),
        rec_umr.opcodes(&MOV_STORE).rex(),
    );
    e.enc64(uextend.bind(I64).bind(I32), rec_umr.opcodes(&MOV_STORE));

    // Convert bool to int.
    //
    // This assumes that b1 is represented as an 8-bit low register with the value 0
    // or 1.
    //
    // Encode movzbq as movzbl, because it's equivalent and shorter.
    for &to in &[I8, I16, I32, I64] {
        for &from in &[B1, B8] {
            e.enc64(
                bint.bind(to).bind(from),
                rec_urm_noflags.opcodes(&MOVZX_BYTE).rex(),
            );
            e.enc64(
                bint.bind(to).bind(from),
                rec_urm_noflags_abcd.opcodes(&MOVZX_BYTE),
            );
            if to != I64 {
                e.enc32(
                    bint.bind(to).bind(from),
                    rec_urm_noflags_abcd.opcodes(&MOVZX_BYTE),
                );
            }
        }
    }
    for (to, from) in &[(I16, B16), (I32, B32), (I64, B64)] {
        e.enc_both(
            bint.bind(*to).bind(*from),
            rec_urm_noflags_abcd.opcodes(&MOVZX_BYTE),
        );
    }

    // Copy Special
    // For x86-64, only define REX forms for now, since we can't describe the
    // special regunit immediate operands with the current constraint language.
    e.enc64(copy_special, rec_copysp.opcodes(&MOV_STORE).rex().w());
    e.enc32(copy_special, rec_copysp.opcodes(&MOV_STORE));

    // Copy to SSA. These have to be done with special _rex_only encoders, because the standard
    // machinery for deciding whether a REX.{RXB} prefix is needed doesn't take into account
    // the source register, which is specified directly in the instruction.
    e.enc_i32_i64_rex_only(copy_to_ssa, rec_umr_reg_to_ssa.opcodes(&MOV_STORE));
    e.enc_r32_r64_rex_only(copy_to_ssa, rec_umr_reg_to_ssa.opcodes(&MOV_STORE));
    e.enc_both_rex_only(copy_to_ssa.bind(B1), rec_umr_reg_to_ssa.opcodes(&MOV_STORE));
    e.enc_both_rex_only(copy_to_ssa.bind(I8), rec_umr_reg_to_ssa.opcodes(&MOV_STORE));
    e.enc_both_rex_only(
        copy_to_ssa.bind(I16),
        rec_umr_reg_to_ssa.opcodes(&MOV_STORE),
    );
    e.enc_both_rex_only(
        copy_to_ssa.bind(F64),
        rec_furm_reg_to_ssa.opcodes(&MOVSD_LOAD),
    );
    e.enc_both_rex_only(
        copy_to_ssa.bind(F32),
        rec_furm_reg_to_ssa.opcodes(&MOVSS_LOAD),
    );

    e.enc_32_64_rec(dummy_sarg_t, rec_dummy_sarg_t, 0);
}

#[inline(never)]
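/// Defines encodings for memory operations: loads and stores (including the
/// `*_complex` forms), spills and fills, push/pop, no-op stack copies, and
/// stack-pointer adjustments.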
fn define_memory(
    e: &mut PerCpuModeEncodings,
    shared_defs: &SharedDefinitions,
    x86: &InstructionGroup,
    r: &RecipeGroup,
) {
    let shared = &shared_defs.instructions;
    let formats = &shared_defs.formats;

    // Shorthands for instructions.
    let adjust_sp_down = shared.by_name("adjust_sp_down");
    let adjust_sp_down_imm = shared.by_name("adjust_sp_down_imm");
    let adjust_sp_up_imm = shared.by_name("adjust_sp_up_imm");
    let copy_nop = shared.by_name("copy_nop");
    let fill = shared.by_name("fill");
    let fill_nop = shared.by_name("fill_nop");
    let istore16 = shared.by_name("istore16");
    let istore16_complex = shared.by_name("istore16_complex");
    let istore32 = shared.by_name("istore32");
    let istore32_complex = shared.by_name("istore32_complex");
    let istore8 = shared.by_name("istore8");
    let istore8_complex = shared.by_name("istore8_complex");
    let load = shared.by_name("load");
    let load_complex = shared.by_name("load_complex");
    let regfill = shared.by_name("regfill");
    let regspill = shared.by_name("regspill");
    let sload16 = shared.by_name("sload16");
    let sload16_complex = shared.by_name("sload16_complex");
    let sload32 = shared.by_name("sload32");
    let sload32_complex = shared.by_name("sload32_complex");
    let sload8 = shared.by_name("sload8");
    let sload8_complex = shared.by_name("sload8_complex");
    let spill = shared.by_name("spill");
    let store = shared.by_name("store");
    let store_complex = shared.by_name("store_complex");
    let uload16 = shared.by_name("uload16");
    let uload16_complex = shared.by_name("uload16_complex");
    let uload32 = shared.by_name("uload32");
    let uload32_complex = shared.by_name("uload32_complex");
    let uload8 = shared.by_name("uload8");
    let uload8_complex = shared.by_name("uload8_complex");
    let x86_pop = x86.by_name("x86_pop");
    let x86_push = x86.by_name("x86_push");

    // Shorthands for recipes.
    let rec_adjustsp = r.template("adjustsp");
    let rec_adjustsp_ib = r.template("adjustsp_ib");
    let rec_adjustsp_id = r.template("adjustsp_id");
    let rec_ffillnull = r.recipe("ffillnull");
    let rec_fillnull = r.recipe("fillnull");
    let rec_fillSib32 = r.template("fillSib32");
    let rec_ld = r.template("ld");
    let rec_ldDisp32 = r.template("ldDisp32");
    let rec_ldDisp8 = r.template("ldDisp8");
    let rec_ldWithIndex = r.template("ldWithIndex");
    let rec_ldWithIndexDisp32 = r.template("ldWithIndexDisp32");
    let rec_ldWithIndexDisp8 = r.template("ldWithIndexDisp8");
    let rec_popq = r.template("popq");
    let rec_pushq = r.template("pushq");
    let rec_regfill32 = r.template("regfill32");
    let rec_regspill32 = r.template("regspill32");
    let rec_spillSib32 = r.template("spillSib32");
    let rec_st = r.template("st");
    let rec_stacknull = r.recipe("stacknull");
    let rec_stDisp32 = r.template("stDisp32");
    let rec_stDisp32_abcd = r.template("stDisp32_abcd");
    let rec_stDisp8 = r.template("stDisp8");
    let rec_stDisp8_abcd = r.template("stDisp8_abcd");
    let rec_stWithIndex = r.template("stWithIndex");
    let rec_stWithIndexDisp32 = r.template("stWithIndexDisp32");
    let rec_stWithIndexDisp32_abcd = r.template("stWithIndexDisp32_abcd");
    let rec_stWithIndexDisp8 = r.template("stWithIndexDisp8");
    let rec_stWithIndexDisp8_abcd = r.template("stWithIndexDisp8_abcd");
    let rec_stWithIndex_abcd = r.template("stWithIndex_abcd");
    let rec_st_abcd = r.template("st_abcd");

    // Loads and stores.
    let is_load_complex_length_two =
        InstructionPredicate::new_length_equals(&*formats.load_complex, 2);

    for recipe in &[rec_ldWithIndex, rec_ldWithIndexDisp8, rec_ldWithIndexDisp32] {
        e.enc_i32_i64_instp(
            load_complex,
            recipe.opcodes(&MOV_LOAD),
            is_load_complex_length_two.clone(),
        );
        e.enc_r32_r64_instp(
            load_complex,
            recipe.opcodes(&MOV_LOAD),
            is_load_complex_length_two.clone(),
        );
        e.enc_x86_64_instp(
            uload32_complex,
            recipe.opcodes(&MOV_LOAD),
            is_load_complex_length_two.clone(),
        );

        e.enc64_instp(
            sload32_complex,
            recipe.opcodes(&MOVSXD).rex().w(),
            is_load_complex_length_two.clone(),
        );

        e.enc_i32_i64_instp(
            uload16_complex,
            recipe.opcodes(&MOVZX_WORD),
            is_load_complex_length_two.clone(),
        );
        e.enc_i32_i64_instp(
            sload16_complex,
            recipe.opcodes(&MOVSX_WORD),
            is_load_complex_length_two.clone(),
        );

        e.enc_i32_i64_instp(
            uload8_complex,
            recipe.opcodes(&MOVZX_BYTE),
            is_load_complex_length_two.clone(),
        );

        e.enc_i32_i64_instp(
            sload8_complex,
            recipe.opcodes(&MOVSX_BYTE),
            is_load_complex_length_two.clone(),
        );
    }

    let is_store_complex_length_three =
        InstructionPredicate::new_length_equals(&*formats.store_complex, 3);

    for recipe in &[rec_stWithIndex, rec_stWithIndexDisp8, rec_stWithIndexDisp32] {
        e.enc_i32_i64_instp(
            store_complex,
            recipe.opcodes(&MOV_STORE),
            is_store_complex_length_three.clone(),
        );
        e.enc_r32_r64_instp(
            store_complex,
            recipe.opcodes(&MOV_STORE),
            is_store_complex_length_three.clone(),
        );
        e.enc_x86_64_instp(
            istore32_complex,
            recipe.opcodes(&MOV_STORE),
            is_store_complex_length_three.clone(),
        );
        e.enc_both_instp(
            istore16_complex.bind(I32),
            recipe.opcodes(&MOV_STORE_16),
            is_store_complex_length_three.clone(),
        );
        e.enc_x86_64_instp(
            istore16_complex.bind(I64),
            recipe.opcodes(&MOV_STORE_16),
            is_store_complex_length_three.clone(),
        );
    }

    for recipe in &[
        rec_stWithIndex_abcd,
        rec_stWithIndexDisp8_abcd,
        rec_stWithIndexDisp32_abcd,
    ] {
        e.enc_both_instp(
            istore8_complex.bind(I32),
            recipe.opcodes(&MOV_BYTE_STORE),
            is_store_complex_length_three.clone(),
        );
        e.enc_x86_64_instp(
            istore8_complex.bind(I64),
            recipe.opcodes(&MOV_BYTE_STORE),
            is_store_complex_length_three.clone(),
        );
    }

    for recipe in &[rec_st, rec_stDisp8, rec_stDisp32] {
        e.enc_i32_i64_ld_st(store, true, recipe.opcodes(&MOV_STORE));
        e.enc_r32_r64_ld_st(store, true, recipe.opcodes(&MOV_STORE));
        e.enc_x86_64(istore32.bind(I64).bind(Any), recipe.opcodes(&MOV_STORE));
        e.enc_i32_i64_ld_st(istore16, false, recipe.opcodes(&MOV_STORE_16));
    }

    // Byte stores are more complicated because the registers they can address
    // depend on the presence of a REX prefix. The st*_abcd recipes fall back to
    // the corresponding st* recipes when a REX prefix is applied.

    for recipe in &[rec_st_abcd, rec_stDisp8_abcd, rec_stDisp32_abcd] {
        e.enc_both(istore8.bind(I32).bind(Any), recipe.opcodes(&MOV_BYTE_STORE));
        e.enc_x86_64(istore8.bind(I64).bind(Any), recipe.opcodes(&MOV_BYTE_STORE));
    }

    e.enc_i32_i64_explicit_rex(spill, rec_spillSib32.opcodes(&MOV_STORE));
    e.enc_i32_i64_explicit_rex(regspill, rec_regspill32.opcodes(&MOV_STORE));
    e.enc_r32_r64_rex_only(spill, rec_spillSib32.opcodes(&MOV_STORE));
    e.enc_r32_r64_rex_only(regspill, rec_regspill32.opcodes(&MOV_STORE));

    // Use a 32-bit write for spilling `b1`, `i8` and `i16` to avoid
    // constraining the permitted registers.
    // See MIN_SPILL_SLOT_SIZE which makes this safe.

    e.enc_both(spill.bind(B1), rec_spillSib32.opcodes(&MOV_STORE));
    e.enc_both(regspill.bind(B1), rec_regspill32.opcodes(&MOV_STORE));
    for &ty in &[I8, I16] {
        e.enc_both(spill.bind(ty), rec_spillSib32.opcodes(&MOV_STORE));
        e.enc_both(regspill.bind(ty), rec_regspill32.opcodes(&MOV_STORE));
    }

    for recipe in &[rec_ld, rec_ldDisp8, rec_ldDisp32] {
        e.enc_i32_i64_ld_st(load, true, recipe.opcodes(&MOV_LOAD));
        e.enc_r32_r64_ld_st(load, true, recipe.opcodes(&MOV_LOAD));
        e.enc_x86_64(uload32.bind(I64), recipe.opcodes(&MOV_LOAD));
        e.enc64(sload32.bind(I64), recipe.opcodes(&MOVSXD).rex().w());
        e.enc_i32_i64_ld_st(uload16, true, recipe.opcodes(&MOVZX_WORD));
        e.enc_i32_i64_ld_st(sload16, true, recipe.opcodes(&MOVSX_WORD));
        e.enc_i32_i64_ld_st(uload8, true, recipe.opcodes(&MOVZX_BYTE));
        e.enc_i32_i64_ld_st(sload8, true, recipe.opcodes(&MOVSX_BYTE));
    }

    e.enc_i32_i64_explicit_rex(fill, rec_fillSib32.opcodes(&MOV_LOAD));
    e.enc_i32_i64_explicit_rex(regfill, rec_regfill32.opcodes(&MOV_LOAD));
    e.enc_r32_r64_rex_only(fill, rec_fillSib32.opcodes(&MOV_LOAD));
    e.enc_r32_r64_rex_only(regfill, rec_regfill32.opcodes(&MOV_LOAD));

    // No-op fills, created by late-stage redundant-fill removal.
    for &ty in &[I64, I32, I16, I8] {
        e.enc64_rec(fill_nop.bind(ty), rec_fillnull, 0);
        e.enc32_rec(fill_nop.bind(ty), rec_fillnull, 0);
    }
    e.enc64_rec(fill_nop.bind(B1), rec_fillnull, 0);
    e.enc32_rec(fill_nop.bind(B1), rec_fillnull, 0);
    for &ty in &[F64, F32] {
        e.enc64_rec(fill_nop.bind(ty), rec_ffillnull, 0);
        e.enc32_rec(fill_nop.bind(ty), rec_ffillnull, 0);
    }
    for &ty in &[R64, R32] {
        e.enc64_rec(fill_nop.bind(ty), rec_fillnull, 0);
        e.enc32_rec(fill_nop.bind(ty), rec_fillnull, 0);
    }

    // Load 32 bits from `b1`, `i8` and `i16` spill slots. See `spill.b1` above.

    e.enc_both(fill.bind(B1), rec_fillSib32.opcodes(&MOV_LOAD));
    e.enc_both(regfill.bind(B1), rec_regfill32.opcodes(&MOV_LOAD));
    for &ty in &[I8, I16] {
        e.enc_both(fill.bind(ty), rec_fillSib32.opcodes(&MOV_LOAD));
        e.enc_both(regfill.bind(ty), rec_regfill32.opcodes(&MOV_LOAD));
    }

    // Push and Pop.
    e.enc32(x86_push.bind(I32), rec_pushq.opcodes(&PUSH_REG));
    e.enc_x86_64(x86_push.bind(I64), rec_pushq.opcodes(&PUSH_REG));

    e.enc32(x86_pop.bind(I32), rec_popq.opcodes(&POP_REG));
    e.enc_x86_64(x86_pop.bind(I64), rec_popq.opcodes(&POP_REG));

    // Stack-slot-to-the-same-stack-slot copy, which is guaranteed to turn
    // into a no-op.
    // The same encoding is generated for both the 64- and 32-bit architectures.
    for &ty in &[I64, I32, I16, I8] {
        e.enc64_rec(copy_nop.bind(ty), rec_stacknull, 0);
        e.enc32_rec(copy_nop.bind(ty), rec_stacknull, 0);
    }
    for &ty in &[F64, F32] {
        e.enc64_rec(copy_nop.bind(ty), rec_stacknull, 0);
        e.enc32_rec(copy_nop.bind(ty), rec_stacknull, 0);
    }

    // Adjust SP down by a dynamic value (or up, with a negative operand).
    e.enc32(adjust_sp_down.bind(I32), rec_adjustsp.opcodes(&SUB));
    e.enc64(
        adjust_sp_down.bind(I64),
        rec_adjustsp.opcodes(&SUB).rex().w(),
    );

    // Adjust SP up by an immediate (or down, with a negative immediate).
    e.enc32(adjust_sp_up_imm, rec_adjustsp_ib.opcodes(&CMP_IMM8));
    e.enc32(adjust_sp_up_imm, rec_adjustsp_id.opcodes(&CMP_IMM));
    e.enc64(
        adjust_sp_up_imm,
        rec_adjustsp_ib.opcodes(&CMP_IMM8).rex().w(),
    );
    e.enc64(
        adjust_sp_up_imm,
        rec_adjustsp_id.opcodes(&CMP_IMM).rex().w(),
    );

    // Adjust SP down by an immediate (or up, with a negative immediate).
    e.enc32(
        adjust_sp_down_imm,
        rec_adjustsp_ib.opcodes(&CMP_IMM8).rrr(5),
    );
    e.enc32(adjust_sp_down_imm, rec_adjustsp_id.opcodes(&CMP_IMM).rrr(5));
    e.enc64(
        adjust_sp_down_imm,
        rec_adjustsp_ib.opcodes(&CMP_IMM8).rrr(5).rex().w(),
    );
    e.enc64(
        adjust_sp_down_imm,
        rec_adjustsp_id.opcodes(&CMP_IMM).rrr(5).rex().w(),
    );
}

#[inline(never)]
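/// Defines encodings for scalar floating-point moves: `bitcast` between
/// integer and float registers (movd/movq), `copy` (movaps) and `regmove`.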
fn define_fpu_moves(e: &mut PerCpuModeEncodings, shared_defs: &SharedDefinitions, r: &RecipeGroup) {
    let shared = &shared_defs.instructions;

    // Shorthands for instructions.
    let bitcast = shared.by_name("bitcast");
    let copy = shared.by_name("copy");
    let regmove = shared.by_name("regmove");

    // Shorthands for recipes.
    let rec_frmov = r.template("frmov");
    let rec_frurm = r.template("frurm");
    let rec_furm = r.template("furm");
    let rec_rfumr = r.template("rfumr");

    // Floating-point moves.
    // movd
    e.enc_both(
        bitcast.bind(F32).bind(I32),
        rec_frurm.opcodes(&MOVD_LOAD_XMM),
    );
    e.enc_both(
        bitcast.bind(I32).bind(F32),
        rec_rfumr.opcodes(&MOVD_STORE_XMM),
    );

    // movq
    e.enc64(
        bitcast.bind(F64).bind(I64),
        rec_frurm.opcodes(&MOVD_LOAD_XMM).rex().w(),
    );
    e.enc64(
        bitcast.bind(I64).bind(F64),
        rec_rfumr.opcodes(&MOVD_STORE_XMM).rex().w(),
    );

    // movaps
    e.enc_both(copy.bind(F32), rec_furm.opcodes(&MOVAPS_LOAD));
    e.enc_both(copy.bind(F64), rec_furm.opcodes(&MOVAPS_LOAD));

    // TODO For x86-64, only define REX forms for now, since we can't describe the special regunit
    // immediate operands with the current constraint language.
    e.enc32(regmove.bind(F32), rec_frmov.opcodes(&MOVAPS_LOAD));
    e.enc64(regmove.bind(F32), rec_frmov.opcodes(&MOVAPS_LOAD).rex());

    // TODO For x86-64, only define REX forms for now, since we can't describe the special regunit
    // immediate operands with the current constraint language.
    e.enc32(regmove.bind(F64), rec_frmov.opcodes(&MOVAPS_LOAD));
    e.enc64(regmove.bind(F64), rec_frmov.opcodes(&MOVAPS_LOAD).rex());
}

#[inline(never)]
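/// Defines encodings for scalar floating-point loads, stores, spills and
/// fills of `f32` and `f64` values.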
fn define_fpu_memory(
    e: &mut PerCpuModeEncodings,
    shared_defs: &SharedDefinitions,
    r: &RecipeGroup,
) {
    let shared = &shared_defs.instructions;

    // Shorthands for instructions.
    let fill = shared.by_name("fill");
    let load = shared.by_name("load");
    let load_complex = shared.by_name("load_complex");
    let regfill = shared.by_name("regfill");
    let regspill = shared.by_name("regspill");
    let spill = shared.by_name("spill");
    let store = shared.by_name("store");
    let store_complex = shared.by_name("store_complex");

    // Shorthands for recipes.
    let rec_ffillSib32 = r.template("ffillSib32");
    let rec_fld = r.template("fld");
    let rec_fldDisp32 = r.template("fldDisp32");
    let rec_fldDisp8 = r.template("fldDisp8");
    let rec_fldWithIndex = r.template("fldWithIndex");
    let rec_fldWithIndexDisp32 = r.template("fldWithIndexDisp32");
    let rec_fldWithIndexDisp8 = r.template("fldWithIndexDisp8");
    let rec_fregfill32 = r.template("fregfill32");
    let rec_fregspill32 = r.template("fregspill32");
    let rec_fspillSib32 = r.template("fspillSib32");
    let rec_fst = r.template("fst");
    let rec_fstDisp32 = r.template("fstDisp32");
    let rec_fstDisp8 = r.template("fstDisp8");
    let rec_fstWithIndex = r.template("fstWithIndex");
    let rec_fstWithIndexDisp32 = r.template("fstWithIndexDisp32");
    let rec_fstWithIndexDisp8 = r.template("fstWithIndexDisp8");

    // Float loads and stores.
    e.enc_both(load.bind(F32).bind(Any), rec_fld.opcodes(&MOVSS_LOAD));
    e.enc_both(load.bind(F32).bind(Any), rec_fldDisp8.opcodes(&MOVSS_LOAD));
    e.enc_both(load.bind(F32).bind(Any), rec_fldDisp32.opcodes(&MOVSS_LOAD));

    e.enc_both(
        load_complex.bind(F32),
        rec_fldWithIndex.opcodes(&MOVSS_LOAD),
    );
    e.enc_both(
        load_complex.bind(F32),
        rec_fldWithIndexDisp8.opcodes(&MOVSS_LOAD),
    );
    e.enc_both(
        load_complex.bind(F32),
        rec_fldWithIndexDisp32.opcodes(&MOVSS_LOAD),
    );

    e.enc_both(load.bind(F64).bind(Any), rec_fld.opcodes(&MOVSD_LOAD));
    e.enc_both(load.bind(F64).bind(Any), rec_fldDisp8.opcodes(&MOVSD_LOAD));
    e.enc_both(load.bind(F64).bind(Any), rec_fldDisp32.opcodes(&MOVSD_LOAD));

    e.enc_both(
        load_complex.bind(F64),
        rec_fldWithIndex.opcodes(&MOVSD_LOAD),
    );
    e.enc_both(
        load_complex.bind(F64),
        rec_fldWithIndexDisp8.opcodes(&MOVSD_LOAD),
    );
    e.enc_both(
        load_complex.bind(F64),
        rec_fldWithIndexDisp32.opcodes(&MOVSD_LOAD),
    );

    e.enc_both(store.bind(F32).bind(Any), rec_fst.opcodes(&MOVSS_STORE));
    e.enc_both(
        store.bind(F32).bind(Any),
        rec_fstDisp8.opcodes(&MOVSS_STORE),
    );
    e.enc_both(
        store.bind(F32).bind(Any),
        rec_fstDisp32.opcodes(&MOVSS_STORE),
    );

    e.enc_both(
        store_complex.bind(F32),
        rec_fstWithIndex.opcodes(&MOVSS_STORE),
    );
    e.enc_both(
        store_complex.bind(F32),
        rec_fstWithIndexDisp8.opcodes(&MOVSS_STORE),
    );
    e.enc_both(
        store_complex.bind(F32),
        rec_fstWithIndexDisp32.opcodes(&MOVSS_STORE),
    );

    e.enc_both(store.bind(F64).bind(Any), rec_fst.opcodes(&MOVSD_STORE));
    e.enc_both(
        store.bind(F64).bind(Any),
        rec_fstDisp8.opcodes(&MOVSD_STORE),
    );
    e.enc_both(
        store.bind(F64).bind(Any),
        rec_fstDisp32.opcodes(&MOVSD_STORE),
    );

    e.enc_both(
        store_complex.bind(F64),
        rec_fstWithIndex.opcodes(&MOVSD_STORE),
    );
    e.enc_both(
        store_complex.bind(F64),
        rec_fstWithIndexDisp8.opcodes(&MOVSD_STORE),
    );
    e.enc_both(
        store_complex.bind(F64),
        rec_fstWithIndexDisp32.opcodes(&MOVSD_STORE),
    );

    e.enc_both(fill.bind(F32), rec_ffillSib32.opcodes(&MOVSS_LOAD));
    e.enc_both(regfill.bind(F32), rec_fregfill32.opcodes(&MOVSS_LOAD));
    e.enc_both(fill.bind(F64), rec_ffillSib32.opcodes(&MOVSD_LOAD));
    e.enc_both(regfill.bind(F64), rec_fregfill32.opcodes(&MOVSD_LOAD));

    e.enc_both(spill.bind(F32), rec_fspillSib32.opcodes(&MOVSS_STORE));
    e.enc_both(regspill.bind(F32), rec_fregspill32.opcodes(&MOVSS_STORE));
    e.enc_both(spill.bind(F64), rec_fspillSib32.opcodes(&MOVSD_STORE));
    e.enc_both(regspill.bind(F64), rec_fregspill32.opcodes(&MOVSD_STORE));
}

#[inline(never)]
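/// Defines encodings for scalar floating-point operations: zero constants via
/// XORPS/XORPD, int-to-float and float-to-int conversions, square root,
/// SSE4.1 rounding, binary arithmetic, min/max, and comparisons.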
fn define_fpu_ops(
    e: &mut PerCpuModeEncodings,
    shared_defs: &SharedDefinitions,
    settings: &SettingGroup,
    x86: &InstructionGroup,
    r: &RecipeGroup,
) {
    let shared = &shared_defs.instructions;
    let formats = &shared_defs.formats;

    // Shorthands for instructions.
    let ceil = shared.by_name("ceil");
    let f32const = shared.by_name("f32const");
    let f64const = shared.by_name("f64const");
    let fadd = shared.by_name("fadd");
    let fcmp = shared.by_name("fcmp");
    let fcvt_from_sint = shared.by_name("fcvt_from_sint");
    let fdemote = shared.by_name("fdemote");
    let fdiv = shared.by_name("fdiv");
    let ffcmp = shared.by_name("ffcmp");
    let floor = shared.by_name("floor");
    let fmul = shared.by_name("fmul");
    let fpromote = shared.by_name("fpromote");
    let fsub = shared.by_name("fsub");
    let nearest = shared.by_name("nearest");
    let sqrt = shared.by_name("sqrt");
    let trunc = shared.by_name("trunc");
    let x86_cvtt2si = x86.by_name("x86_cvtt2si");
    let x86_fmax = x86.by_name("x86_fmax");
    let x86_fmin = x86.by_name("x86_fmin");

    // Shorthands for recipes.
    let rec_f32imm_z = r.template("f32imm_z");
    let rec_f64imm_z = r.template("f64imm_z");
    let rec_fa = r.template("fa");
    let rec_fcmp = r.template("fcmp");
    let rec_fcscc = r.template("fcscc");
    let rec_frurm = r.template("frurm");
    let rec_furm = r.template("furm");
    let rec_furmi_rnd = r.template("furmi_rnd");
    let rec_rfurm = r.template("rfurm");

    // Predicates shorthands.
    let use_sse41 = settings.predicate_by_name("use_sse41");

    // Floating-point constants equal to 0.0 can be encoded using either `xorps` or `xorpd`, for
    // 32-bit and 64-bit floats respectively.
    let is_zero_32_bit_float =
        InstructionPredicate::new_is_zero_32bit_float(&*formats.unary_ieee32, "imm");
    e.enc32_instp(
        f32const,
        rec_f32imm_z.opcodes(&XORPS),
        is_zero_32_bit_float.clone(),
    );

    let is_zero_64_bit_float =
        InstructionPredicate::new_is_zero_64bit_float(&*formats.unary_ieee64, "imm");
    e.enc32_instp(
        f64const,
        rec_f64imm_z.opcodes(&XORPD),
        is_zero_64_bit_float.clone(),
    );

    e.enc_x86_64_instp(f32const, rec_f32imm_z.opcodes(&XORPS), is_zero_32_bit_float);
    e.enc_x86_64_instp(f64const, rec_f64imm_z.opcodes(&XORPD), is_zero_64_bit_float);

    // cvtsi2ss
    e.enc_i32_i64(fcvt_from_sint.bind(F32), rec_frurm.opcodes(&CVTSI2SS));

    // cvtsi2sd
    e.enc_i32_i64(fcvt_from_sint.bind(F64), rec_frurm.opcodes(&CVTSI2SD));

    // cvtss2sd
    e.enc_both(fpromote.bind(F64).bind(F32), rec_furm.opcodes(&CVTSS2SD));

    // cvtsd2ss
    e.enc_both(fdemote.bind(F32).bind(F64), rec_furm.opcodes(&CVTSD2SS));

    // cvttss2si
    e.enc_both(
        x86_cvtt2si.bind(I32).bind(F32),
        rec_rfurm.opcodes(&CVTTSS2SI),
    );
    e.enc64(
        x86_cvtt2si.bind(I64).bind(F32),
        rec_rfurm.opcodes(&CVTTSS2SI).rex().w(),
    );

    // cvttsd2si
    e.enc_both(
        x86_cvtt2si.bind(I32).bind(F64),
        rec_rfurm.opcodes(&CVTTSD2SI),
    );
    e.enc64(
        x86_cvtt2si.bind(I64).bind(F64),
        rec_rfurm.opcodes(&CVTTSD2SI).rex().w(),
    );

    // Exact square roots.
    e.enc_both(sqrt.bind(F32), rec_furm.opcodes(&SQRTSS));
    e.enc_both(sqrt.bind(F64), rec_furm.opcodes(&SQRTSD));

    // Rounding. The recipe looks at the opcode to pick an immediate.
    for inst in &[nearest, floor, ceil, trunc] {
        e.enc_both_isap(inst.bind(F32), rec_furmi_rnd.opcodes(&ROUNDSS), use_sse41);
        e.enc_both_isap(inst.bind(F64), rec_furmi_rnd.opcodes(&ROUNDSD), use_sse41);
    }

    // Binary arithmetic ops.
    e.enc_both(fadd.bind(F32), rec_fa.opcodes(&ADDSS));
    e.enc_both(fadd.bind(F64), rec_fa.opcodes(&ADDSD));

    e.enc_both(fsub.bind(F32), rec_fa.opcodes(&SUBSS));
    e.enc_both(fsub.bind(F64), rec_fa.opcodes(&SUBSD));

    e.enc_both(fmul.bind(F32), rec_fa.opcodes(&MULSS));
    e.enc_both(fmul.bind(F64), rec_fa.opcodes(&MULSD));

    e.enc_both(fdiv.bind(F32), rec_fa.opcodes(&DIVSS));
    e.enc_both(fdiv.bind(F64), rec_fa.opcodes(&DIVSD));

    e.enc_both(x86_fmin.bind(F32), rec_fa.opcodes(&MINSS));
    e.enc_both(x86_fmin.bind(F64), rec_fa.opcodes(&MINSD));

    e.enc_both(x86_fmax.bind(F32), rec_fa.opcodes(&MAXSS));
    e.enc_both(x86_fmax.bind(F64), rec_fa.opcodes(&MAXSD));

    // Comparisons.
    //
    // This only covers the condition codes in `supported_floatccs`; the rest are
    // handled by legalization patterns.
    e.enc_both(fcmp.bind(F32), rec_fcscc.opcodes(&UCOMISS));
    e.enc_both(fcmp.bind(F64), rec_fcscc.opcodes(&UCOMISD));
    e.enc_both(ffcmp.bind(F32), rec_fcmp.opcodes(&UCOMISS));
    e.enc_both(ffcmp.bind(F64), rec_fcmp.opcodes(&UCOMISD));
}

#[inline(never)]
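/// Defines encodings for the integer ALU instructions: add/sub (including the
/// carry/borrow variants), multiply and divide, the bitwise logic ops and
/// their immediate forms, and the bitwise ops on floating-point types; the
/// shorthands above also cover shifts, rotates, bit counting and comparisons.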
define_alu( e: &mut PerCpuModeEncodings, shared_defs: &SharedDefinitions, settings: &SettingGroup, x86: &InstructionGroup, r: &RecipeGroup, )1378 fn define_alu(
1379 e: &mut PerCpuModeEncodings,
1380 shared_defs: &SharedDefinitions,
1381 settings: &SettingGroup,
1382 x86: &InstructionGroup,
1383 r: &RecipeGroup,
1384 ) {
1385 let shared = &shared_defs.instructions;
1386
1387 // Shorthands for instructions.
1388 let clz = shared.by_name("clz");
1389 let ctz = shared.by_name("ctz");
1390 let icmp = shared.by_name("icmp");
1391 let icmp_imm = shared.by_name("icmp_imm");
1392 let ifcmp = shared.by_name("ifcmp");
1393 let ifcmp_imm = shared.by_name("ifcmp_imm");
1394 let ifcmp_sp = shared.by_name("ifcmp_sp");
1395 let ishl = shared.by_name("ishl");
1396 let ishl_imm = shared.by_name("ishl_imm");
1397 let popcnt = shared.by_name("popcnt");
1398 let rotl = shared.by_name("rotl");
1399 let rotl_imm = shared.by_name("rotl_imm");
1400 let rotr = shared.by_name("rotr");
1401 let rotr_imm = shared.by_name("rotr_imm");
1402 let selectif = shared.by_name("selectif");
1403 let selectif_spectre_guard = shared.by_name("selectif_spectre_guard");
1404 let sshr = shared.by_name("sshr");
1405 let sshr_imm = shared.by_name("sshr_imm");
1406 let trueff = shared.by_name("trueff");
1407 let trueif = shared.by_name("trueif");
1408 let ushr = shared.by_name("ushr");
1409 let ushr_imm = shared.by_name("ushr_imm");
1410 let x86_bsf = x86.by_name("x86_bsf");
1411 let x86_bsr = x86.by_name("x86_bsr");
1412
1413 // Shorthands for recipes.
1414 let rec_bsf_and_bsr = r.template("bsf_and_bsr");
1415 let rec_cmov = r.template("cmov");
1416 let rec_icscc = r.template("icscc");
1417 let rec_icscc_ib = r.template("icscc_ib");
1418 let rec_icscc_id = r.template("icscc_id");
1419 let rec_rcmp = r.template("rcmp");
1420 let rec_rcmp_ib = r.template("rcmp_ib");
1421 let rec_rcmp_id = r.template("rcmp_id");
1422 let rec_rcmp_sp = r.template("rcmp_sp");
1423 let rec_rc = r.template("rc");
1424 let rec_setf_abcd = r.template("setf_abcd");
1425 let rec_seti_abcd = r.template("seti_abcd");
1426 let rec_urm = r.template("urm");
1427
1428 // Predicates shorthands.
1429 let use_popcnt = settings.predicate_by_name("use_popcnt");
1430 let use_lzcnt = settings.predicate_by_name("use_lzcnt");
1431 let use_bmi1 = settings.predicate_by_name("use_bmi1");
1432
1433 let band = shared.by_name("band");
1434 let band_imm = shared.by_name("band_imm");
1435 let band_not = shared.by_name("band_not");
1436 let bnot = shared.by_name("bnot");
1437 let bor = shared.by_name("bor");
1438 let bor_imm = shared.by_name("bor_imm");
1439 let bxor = shared.by_name("bxor");
1440 let bxor_imm = shared.by_name("bxor_imm");
1441 let iadd = shared.by_name("iadd");
1442 let iadd_ifcarry = shared.by_name("iadd_ifcarry");
1443 let iadd_ifcin = shared.by_name("iadd_ifcin");
1444 let iadd_ifcout = shared.by_name("iadd_ifcout");
1445 let iadd_imm = shared.by_name("iadd_imm");
1446 let imul = shared.by_name("imul");
1447 let isub = shared.by_name("isub");
1448 let isub_ifbin = shared.by_name("isub_ifbin");
1449 let isub_ifborrow = shared.by_name("isub_ifborrow");
1450 let isub_ifbout = shared.by_name("isub_ifbout");
1451 let x86_sdivmodx = x86.by_name("x86_sdivmodx");
1452 let x86_smulx = x86.by_name("x86_smulx");
1453 let x86_udivmodx = x86.by_name("x86_udivmodx");
1454 let x86_umulx = x86.by_name("x86_umulx");
1455
1456 let rec_div = r.template("div");
1457 let rec_fa = r.template("fa");
1458 let rec_fax = r.template("fax");
1459 let rec_mulx = r.template("mulx");
1460 let rec_r_ib = r.template("r_ib");
1461 let rec_r_id = r.template("r_id");
1462 let rec_rin = r.template("rin");
1463 let rec_rio = r.template("rio");
1464 let rec_rout = r.template("rout");
1465 let rec_rr = r.template("rr");
1466 let rec_rrx = r.template("rrx");
1467 let rec_ur = r.template("ur");
1468
1469 e.enc_i32_i64(iadd, rec_rr.opcodes(&ADD));
1470 e.enc_i32_i64(iadd_ifcout, rec_rout.opcodes(&ADD));
1471 e.enc_i32_i64(iadd_ifcin, rec_rin.opcodes(&ADC));
1472 e.enc_i32_i64(iadd_ifcarry, rec_rio.opcodes(&ADC));
1473 e.enc_i32_i64(iadd_imm, rec_r_ib.opcodes(&ADD_IMM8_SIGN_EXTEND).rrr(0));
1474 e.enc_i32_i64(iadd_imm, rec_r_id.opcodes(&ADD_IMM).rrr(0));
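// Note: `.rrr(n)` fills the ModR/M reg field, i.e. the /n opcode extension used by the x86
// immediate-group instructions (ADD is /0, OR /1, AND /4, XOR /6, CMP /7). Both immediate
// encodings above are listed on purpose: the `r_ib` form uses a sign-extended 8-bit immediate
// and is expected to apply only when the constant fits in an `i8` (e.g. `iadd_imm v1, 16`),
// while `r_id` is the general 32-bit-immediate fallback (e.g. `iadd_imm v1, 0x1234_5678`).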
1475
1476 e.enc_i32_i64(isub, rec_rr.opcodes(&SUB));
1477 e.enc_i32_i64(isub_ifbout, rec_rout.opcodes(&SUB));
1478 e.enc_i32_i64(isub_ifbin, rec_rin.opcodes(&SBB));
1479 e.enc_i32_i64(isub_ifborrow, rec_rio.opcodes(&SBB));
1480
1481 e.enc_i32_i64(band, rec_rr.opcodes(&AND));
1482 e.enc_b32_b64(band, rec_rr.opcodes(&AND));
1483
1484 // TODO: band_imm.i64 with an unsigned 32-bit immediate can be encoded as band_imm.i32. Can
1485 // even use the single-byte immediate for 0xffff_ffXX masks.
1486
1487 e.enc_i32_i64(band_imm, rec_r_ib.opcodes(&AND_IMM8_SIGN_EXTEND).rrr(4));
1488 e.enc_i32_i64(band_imm, rec_r_id.opcodes(&AND_IMM).rrr(4));
1489
1490 e.enc_i32_i64(bor, rec_rr.opcodes(&OR));
1491 e.enc_b32_b64(bor, rec_rr.opcodes(&OR));
1492 e.enc_i32_i64(bor_imm, rec_r_ib.opcodes(&OR_IMM8_SIGN_EXTEND).rrr(1));
1493 e.enc_i32_i64(bor_imm, rec_r_id.opcodes(&OR_IMM).rrr(1));
1494
1495 e.enc_i32_i64(bxor, rec_rr.opcodes(&XOR));
1496 e.enc_b32_b64(bxor, rec_rr.opcodes(&XOR));
1497 e.enc_i32_i64(bxor_imm, rec_r_ib.opcodes(&XOR_IMM8_SIGN_EXTEND).rrr(6));
1498 e.enc_i32_i64(bxor_imm, rec_r_id.opcodes(&XOR_IMM).rrr(6));
1499
1500 // x86 has a bitwise not instruction NOT.
1501 e.enc_i32_i64(bnot, rec_ur.opcodes(&NOT).rrr(2));
1502 e.enc_b32_b64(bnot, rec_ur.opcodes(&NOT).rrr(2));
1503 e.enc_both(bnot.bind(B1), rec_ur.opcodes(&NOT).rrr(2));
1504
// Also add `b1` encodings for the logic instructions.
1506 // TODO: Should this be done with 8-bit instructions? It would improve partial register
1507 // dependencies.
1508 e.enc_both(band.bind(B1), rec_rr.opcodes(&AND));
1509 e.enc_both(bor.bind(B1), rec_rr.opcodes(&OR));
1510 e.enc_both(bxor.bind(B1), rec_rr.opcodes(&XOR));
1511
1512 e.enc_i32_i64(imul, rec_rrx.opcodes(&IMUL));
1513 e.enc_i32_i64(x86_sdivmodx, rec_div.opcodes(&IDIV).rrr(7));
1514 e.enc_i32_i64(x86_udivmodx, rec_div.opcodes(&DIV).rrr(6));
1515
1516 e.enc_i32_i64(x86_smulx, rec_mulx.opcodes(&IMUL_RDX_RAX).rrr(5));
1517 e.enc_i32_i64(x86_umulx, rec_mulx.opcodes(&MUL).rrr(4));
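// The division and widening-multiply instructions use the one-operand x86 forms: DIV/IDIV
// divide the implicit RDX:RAX pair, and the /4 and /5 group forms of MUL/IMUL write their
// double-width product to RDX:RAX. That is presumably why they get the dedicated `div` and
// `mulx` recipes rather than the generic two-operand ones.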
1518
1519 // Binary bitwise ops.
1520 //
1521 // The F64 version is intentionally encoded using the single-precision opcode:
1522 // the operation is identical and the encoding is one byte shorter.
1523 e.enc_both(band.bind(F32), rec_fa.opcodes(&ANDPS));
1524 e.enc_both(band.bind(F64), rec_fa.opcodes(&ANDPS));
1525
1526 e.enc_both(bor.bind(F32), rec_fa.opcodes(&ORPS));
1527 e.enc_both(bor.bind(F64), rec_fa.opcodes(&ORPS));
1528
1529 e.enc_both(bxor.bind(F32), rec_fa.opcodes(&XORPS));
1530 e.enc_both(bxor.bind(F64), rec_fa.opcodes(&XORPS));
1531
// The `andnps(x,y)` instruction computes `~x&y`, while `band_not(x,y)` is `x&~y`.
1533 e.enc_both(band_not.bind(F32), rec_fax.opcodes(&ANDNPS));
1534 e.enc_both(band_not.bind(F64), rec_fax.opcodes(&ANDNPS));
1535
1536 // Shifts and rotates.
1537 // Note that the dynamic shift amount is only masked by 5 or 6 bits; the 8-bit
1538 // and 16-bit shifts would need explicit masking.
1539
1540 for &(inst, rrr) in &[(rotl, 0), (rotr, 1), (ishl, 4), (ushr, 5), (sshr, 7)] {
// Cannot use enc_i32_i64 for this pattern because these instructions have a second
// type variable (the shift amount) that must also be bound, down to `Any` in 64-bit mode.
1543 e.enc32(inst.bind(I32).bind(I8), rec_rc.opcodes(&ROTATE_CL).rrr(rrr));
1544 e.enc32(
1545 inst.bind(I32).bind(I16),
1546 rec_rc.opcodes(&ROTATE_CL).rrr(rrr),
1547 );
1548 e.enc32(
1549 inst.bind(I32).bind(I32),
1550 rec_rc.opcodes(&ROTATE_CL).rrr(rrr),
1551 );
1552 e.enc64(
1553 inst.bind(I64).bind(Any),
1554 rec_rc.opcodes(&ROTATE_CL).rrr(rrr).rex().w(),
1555 );
1556 e.enc64(
1557 inst.bind(I32).bind(Any),
1558 rec_rc.opcodes(&ROTATE_CL).rrr(rrr).rex(),
1559 );
1560 e.enc64(
1561 inst.bind(I32).bind(Any),
1562 rec_rc.opcodes(&ROTATE_CL).rrr(rrr),
1563 );
1564 }
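// In 64-bit mode the I32-typed variants are listed twice, REX form first and REX-less form
// second; only the REX form can name the extended registers r8-r15, and (as the comment in
// the SIMD load-extend section notes) the REX version is deliberately placed first, matching
// what enc_x86_64 does.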
1565
1566 e.enc_i32_i64(rotl_imm, rec_r_ib.opcodes(&ROTATE_IMM8).rrr(0));
1567 e.enc_i32_i64(rotr_imm, rec_r_ib.opcodes(&ROTATE_IMM8).rrr(1));
1568 e.enc_i32_i64(ishl_imm, rec_r_ib.opcodes(&ROTATE_IMM8).rrr(4));
1569 e.enc_i32_i64(ushr_imm, rec_r_ib.opcodes(&ROTATE_IMM8).rrr(5));
1570 e.enc_i32_i64(sshr_imm, rec_r_ib.opcodes(&ROTATE_IMM8).rrr(7));
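// All of the immediate shifts and rotates share the same 0xC1-group base opcode (ROTATE_IMM8
// here); the `.rrr()` value selects the actual operation: ROL /0, ROR /1, SHL /4, SHR /5,
// SAR /7.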
1571
1572 // Population count.
1573 e.enc32_isap(popcnt.bind(I32), rec_urm.opcodes(&POPCNT), use_popcnt);
1574 e.enc64_isap(
1575 popcnt.bind(I64),
1576 rec_urm.opcodes(&POPCNT).rex().w(),
1577 use_popcnt,
1578 );
1579 e.enc64_isap(popcnt.bind(I32), rec_urm.opcodes(&POPCNT).rex(), use_popcnt);
1580 e.enc64_isap(popcnt.bind(I32), rec_urm.opcodes(&POPCNT), use_popcnt);
1581
1582 // Count leading zero bits.
1583 e.enc32_isap(clz.bind(I32), rec_urm.opcodes(&LZCNT), use_lzcnt);
1584 e.enc64_isap(clz.bind(I64), rec_urm.opcodes(&LZCNT).rex().w(), use_lzcnt);
1585 e.enc64_isap(clz.bind(I32), rec_urm.opcodes(&LZCNT).rex(), use_lzcnt);
1586 e.enc64_isap(clz.bind(I32), rec_urm.opcodes(&LZCNT), use_lzcnt);
1587
1588 // Count trailing zero bits.
1589 e.enc32_isap(ctz.bind(I32), rec_urm.opcodes(&TZCNT), use_bmi1);
1590 e.enc64_isap(ctz.bind(I64), rec_urm.opcodes(&TZCNT).rex().w(), use_bmi1);
1591 e.enc64_isap(ctz.bind(I32), rec_urm.opcodes(&TZCNT).rex(), use_bmi1);
1592 e.enc64_isap(ctz.bind(I32), rec_urm.opcodes(&TZCNT), use_bmi1);
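// Each of the three bit-count groups above follows the same pattern: one 32-bit-mode
// encoding, then three 64-bit-mode encodings covering I64 with REX.W, I32 with a plain REX
// prefix (needed to reach r8-r15), and I32 with no REX prefix at all.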
1593
// Bit scan forward and reverse.
1595 e.enc_i32_i64(x86_bsf, rec_bsf_and_bsr.opcodes(&BIT_SCAN_FORWARD));
1596 e.enc_i32_i64(x86_bsr, rec_bsf_and_bsr.opcodes(&BIT_SCAN_REVERSE));
1597
1598 // Comparisons
1599 e.enc_i32_i64(icmp, rec_icscc.opcodes(&CMP_REG));
1600 e.enc_i32_i64(icmp_imm, rec_icscc_ib.opcodes(&CMP_IMM8).rrr(7));
1601 e.enc_i32_i64(icmp_imm, rec_icscc_id.opcodes(&CMP_IMM).rrr(7));
1602 e.enc_i32_i64(ifcmp, rec_rcmp.opcodes(&CMP_REG));
1603 e.enc_i32_i64(ifcmp_imm, rec_rcmp_ib.opcodes(&CMP_IMM8).rrr(7));
1604 e.enc_i32_i64(ifcmp_imm, rec_rcmp_id.opcodes(&CMP_IMM).rrr(7));
1605 // TODO: We could special-case ifcmp_imm(x, 0) to TEST(x, x).
1606
1607 e.enc32(ifcmp_sp.bind(I32), rec_rcmp_sp.opcodes(&CMP_REG));
1608 e.enc64(ifcmp_sp.bind(I64), rec_rcmp_sp.opcodes(&CMP_REG).rex().w());
1609
1610 // Convert flags to bool.
1611 // This encodes `b1` as an 8-bit low register with the value 0 or 1.
1612 e.enc_both(trueif, rec_seti_abcd.opcodes(&SET_BYTE_IF_OVERFLOW));
1613 e.enc_both(trueff, rec_setf_abcd.opcodes(&SET_BYTE_IF_OVERFLOW));
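// The "set byte if overflow" opcode (SETO) appears to serve only as a base opcode here; the
// seti/setf recipes are expected to substitute the instruction's actual condition code when
// the bytes are emitted. The CMOV_OVERFLOW base used for `selectif` below presumably works
// the same way.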
1614
// Conditional move (a.k.a. integer select).
1616 e.enc_i32_i64(selectif, rec_cmov.opcodes(&CMOV_OVERFLOW));
1617 // A Spectre-guard integer select is exactly the same as a selectif, but
1618 // is not associated with any other legalization rules and is not
1619 // recognized by any optimizations, so it must arrive here unmodified
1620 // and in its original place.
1621 e.enc_i32_i64(selectif_spectre_guard, rec_cmov.opcodes(&CMOV_OVERFLOW));
1622 }
1623
1624 #[inline(never)]
1625 #[allow(clippy::cognitive_complexity)]
fn define_simd(
1627 e: &mut PerCpuModeEncodings,
1628 shared_defs: &SharedDefinitions,
1629 settings: &SettingGroup,
1630 x86: &InstructionGroup,
1631 r: &RecipeGroup,
1632 ) {
1633 let shared = &shared_defs.instructions;
1634 let formats = &shared_defs.formats;
1635
1636 // Shorthands for instructions.
1637 let avg_round = shared.by_name("avg_round");
1638 let bitcast = shared.by_name("bitcast");
1639 let bor = shared.by_name("bor");
1640 let bxor = shared.by_name("bxor");
1641 let copy = shared.by_name("copy");
1642 let copy_nop = shared.by_name("copy_nop");
1643 let copy_to_ssa = shared.by_name("copy_to_ssa");
1644 let fadd = shared.by_name("fadd");
1645 let fcmp = shared.by_name("fcmp");
1646 let fcvt_from_sint = shared.by_name("fcvt_from_sint");
1647 let fdiv = shared.by_name("fdiv");
1648 let fill = shared.by_name("fill");
1649 let fill_nop = shared.by_name("fill_nop");
1650 let fmul = shared.by_name("fmul");
1651 let fsub = shared.by_name("fsub");
1652 let iabs = shared.by_name("iabs");
1653 let iadd = shared.by_name("iadd");
1654 let icmp = shared.by_name("icmp");
1655 let imul = shared.by_name("imul");
1656 let ishl_imm = shared.by_name("ishl_imm");
1657 let load = shared.by_name("load");
1658 let load_complex = shared.by_name("load_complex");
1659 let raw_bitcast = shared.by_name("raw_bitcast");
1660 let regfill = shared.by_name("regfill");
1661 let regmove = shared.by_name("regmove");
1662 let regspill = shared.by_name("regspill");
1663 let sadd_sat = shared.by_name("sadd_sat");
1664 let scalar_to_vector = shared.by_name("scalar_to_vector");
1665 let sload8x8 = shared.by_name("sload8x8");
1666 let sload8x8_complex = shared.by_name("sload8x8_complex");
1667 let sload16x4 = shared.by_name("sload16x4");
1668 let sload16x4_complex = shared.by_name("sload16x4_complex");
1669 let sload32x2 = shared.by_name("sload32x2");
1670 let sload32x2_complex = shared.by_name("sload32x2_complex");
1671 let spill = shared.by_name("spill");
1672 let sqrt = shared.by_name("sqrt");
1673 let sshr_imm = shared.by_name("sshr_imm");
1674 let ssub_sat = shared.by_name("ssub_sat");
1675 let store = shared.by_name("store");
1676 let store_complex = shared.by_name("store_complex");
1677 let swiden_low = shared.by_name("swiden_low");
1678 let uadd_sat = shared.by_name("uadd_sat");
1679 let uload8x8 = shared.by_name("uload8x8");
1680 let uload8x8_complex = shared.by_name("uload8x8_complex");
1681 let uload16x4 = shared.by_name("uload16x4");
1682 let uload16x4_complex = shared.by_name("uload16x4_complex");
1683 let uload32x2 = shared.by_name("uload32x2");
1684 let uload32x2_complex = shared.by_name("uload32x2_complex");
1685 let snarrow = shared.by_name("snarrow");
1686 let unarrow = shared.by_name("unarrow");
1687 let uwiden_low = shared.by_name("uwiden_low");
1688 let ushr_imm = shared.by_name("ushr_imm");
1689 let usub_sat = shared.by_name("usub_sat");
1690 let vconst = shared.by_name("vconst");
1691 let vselect = shared.by_name("vselect");
1692 let widening_pairwise_dot_product_s = shared.by_name("widening_pairwise_dot_product_s");
1693 let x86_cvtt2si = x86.by_name("x86_cvtt2si");
1694 let x86_insertps = x86.by_name("x86_insertps");
1695 let x86_fmax = x86.by_name("x86_fmax");
1696 let x86_fmin = x86.by_name("x86_fmin");
1697 let x86_movlhps = x86.by_name("x86_movlhps");
1698 let x86_movsd = x86.by_name("x86_movsd");
1699 let x86_pblendw = x86.by_name("x86_pblendw");
1700 let x86_pextr = x86.by_name("x86_pextr");
1701 let x86_pinsr = x86.by_name("x86_pinsr");
1702 let x86_pmaxs = x86.by_name("x86_pmaxs");
1703 let x86_pmaxu = x86.by_name("x86_pmaxu");
1704 let x86_pmins = x86.by_name("x86_pmins");
1705 let x86_pminu = x86.by_name("x86_pminu");
1706 let x86_pmullq = x86.by_name("x86_pmullq");
1707 let x86_pmuludq = x86.by_name("x86_pmuludq");
1708 let x86_palignr = x86.by_name("x86_palignr");
1709 let x86_pshufb = x86.by_name("x86_pshufb");
1710 let x86_pshufd = x86.by_name("x86_pshufd");
1711 let x86_psll = x86.by_name("x86_psll");
1712 let x86_psra = x86.by_name("x86_psra");
1713 let x86_psrl = x86.by_name("x86_psrl");
1714 let x86_ptest = x86.by_name("x86_ptest");
1715 let x86_punpckh = x86.by_name("x86_punpckh");
1716 let x86_punpckl = x86.by_name("x86_punpckl");
1717 let x86_vcvtudq2ps = x86.by_name("x86_vcvtudq2ps");
1718
1719 // Shorthands for recipes.
1720 let rec_blend = r.template("blend");
1721 let rec_evex_reg_vvvv_rm_128 = r.template("evex_reg_vvvv_rm_128");
1722 let rec_evex_reg_rm_128 = r.template("evex_reg_rm_128");
1723 let rec_f_ib = r.template("f_ib");
1724 let rec_fa = r.template("fa");
1725 let rec_fa_ib = r.template("fa_ib");
1726 let rec_fax = r.template("fax");
1727 let rec_fcmp = r.template("fcmp");
1728 let rec_ffillSib32 = r.template("ffillSib32");
1729 let rec_ffillnull = r.recipe("ffillnull");
1730 let rec_fld = r.template("fld");
1731 let rec_fldDisp32 = r.template("fldDisp32");
1732 let rec_fldDisp8 = r.template("fldDisp8");
1733 let rec_fldWithIndex = r.template("fldWithIndex");
1734 let rec_fldWithIndexDisp32 = r.template("fldWithIndexDisp32");
1735 let rec_fldWithIndexDisp8 = r.template("fldWithIndexDisp8");
1736 let rec_fregfill32 = r.template("fregfill32");
1737 let rec_fregspill32 = r.template("fregspill32");
1738 let rec_frmov = r.template("frmov");
1739 let rec_frurm = r.template("frurm");
1740 let rec_fspillSib32 = r.template("fspillSib32");
1741 let rec_fst = r.template("fst");
1742 let rec_fstDisp32 = r.template("fstDisp32");
1743 let rec_fstDisp8 = r.template("fstDisp8");
1744 let rec_fstWithIndex = r.template("fstWithIndex");
1745 let rec_fstWithIndexDisp32 = r.template("fstWithIndexDisp32");
1746 let rec_fstWithIndexDisp8 = r.template("fstWithIndexDisp8");
1747 let rec_furm = r.template("furm");
1748 let rec_furm_reg_to_ssa = r.template("furm_reg_to_ssa");
1749 let rec_icscc_fpr = r.template("icscc_fpr");
1750 let rec_null_fpr = r.recipe("null_fpr");
1751 let rec_pfcmp = r.template("pfcmp");
1752 let rec_r_ib_unsigned_fpr = r.template("r_ib_unsigned_fpr");
1753 let rec_r_ib_unsigned_gpr = r.template("r_ib_unsigned_gpr");
1754 let rec_r_ib_unsigned_r = r.template("r_ib_unsigned_r");
1755 let rec_stacknull = r.recipe("stacknull");
1756 let rec_vconst = r.template("vconst");
1757 let rec_vconst_optimized = r.template("vconst_optimized");
1758
1759 // Predicates shorthands.
1760 settings.predicate_by_name("all_ones_funcaddrs_and_not_is_pic");
1761 settings.predicate_by_name("not_all_ones_funcaddrs_and_not_is_pic");
1762 let use_ssse3_simd = settings.predicate_by_name("use_ssse3_simd");
1763 let use_sse41_simd = settings.predicate_by_name("use_sse41_simd");
1764 let use_sse42_simd = settings.predicate_by_name("use_sse42_simd");
1765 let use_avx512dq_simd = settings.predicate_by_name("use_avx512dq_simd");
1766 let use_avx512vl_simd = settings.predicate_by_name("use_avx512vl_simd");
1767
1768 // SIMD vector size: eventually multiple vector sizes may be supported but for now only
1769 // SSE-sized vectors are available.
1770 let sse_vector_size: u64 = 128;
1771
1772 // SIMD splat: before x86 can use vector data, it must be moved to XMM registers; see
1773 // legalize.rs for how this is done; once there, x86_pshuf* (below) is used for broadcasting the
1774 // value across the register.
1775
1776 let allowed_simd_type = |t: &LaneType| t.lane_bits() >= 8 && t.lane_bits() < 128;
1777
1778 // PSHUFB, 8-bit shuffle using two XMM registers.
1779 for ty in ValueType::all_lane_types().filter(allowed_simd_type) {
1780 let instruction = x86_pshufb.bind(vector(ty, sse_vector_size));
1781 let template = rec_fa.opcodes(&PSHUFB);
1782 e.enc_both_inferred_maybe_isap(instruction.clone(), template.clone(), Some(use_ssse3_simd));
1783 }
1784
1785 // PSHUFD, 32-bit shuffle using one XMM register and a u8 immediate.
1786 for ty in ValueType::all_lane_types().filter(|t| t.lane_bits() == 32) {
1787 let instruction = x86_pshufd.bind(vector(ty, sse_vector_size));
1788 let template = rec_r_ib_unsigned_fpr.opcodes(&PSHUFD);
1789 e.enc_both_inferred(instruction, template);
1790 }
1791
// SIMD vselect; the controlling value of vselect is a boolean vector, so each lane should be
// either all ones or all zeroes. That makes it possible to always use the 8-bit PBLENDVB;
// for 32/64-bit lanes we can also use BLENDVPS and BLENDVPD.
1795 for ty in ValueType::all_lane_types().filter(allowed_simd_type) {
1796 let opcode = match ty.lane_bits() {
1797 32 => &BLENDVPS,
1798 64 => &BLENDVPD,
1799 _ => &PBLENDVB,
1800 };
1801 let instruction = vselect.bind(vector(ty, sse_vector_size));
1802 let template = rec_blend.opcodes(opcode);
1803 e.enc_both_inferred_maybe_isap(instruction, template, Some(use_sse41_simd));
1804 }
1805
1806 // PBLENDW, select lanes using a u8 immediate.
1807 for ty in ValueType::all_lane_types().filter(|t| t.lane_bits() == 16) {
1808 let instruction = x86_pblendw.bind(vector(ty, sse_vector_size));
1809 let template = rec_fa_ib.opcodes(&PBLENDW);
1810 e.enc_both_inferred_maybe_isap(instruction, template, Some(use_sse41_simd));
1811 }
1812
1813 // SIMD scalar_to_vector; this uses MOV to copy the scalar value to an XMM register; according
1814 // to the Intel manual: "When the destination operand is an XMM register, the source operand is
1815 // written to the low doubleword of the register and the register is zero-extended to 128 bits."
1816 for ty in ValueType::all_lane_types().filter(allowed_simd_type) {
1817 let instruction = scalar_to_vector.bind(vector(ty, sse_vector_size));
1818 if ty.is_float() {
1819 // No need to move floats--they already live in XMM registers.
1820 e.enc_32_64_rec(instruction, rec_null_fpr, 0);
1821 } else {
1822 let template = rec_frurm.opcodes(&MOVD_LOAD_XMM);
1823 if ty.lane_bits() < 64 {
1824 e.enc_both_inferred(instruction, template);
1825 } else {
1826 // No 32-bit encodings for 64-bit widths.
1827 assert_eq!(ty.lane_bits(), 64);
1828 e.enc64(instruction, template.rex().w());
1829 }
1830 }
1831 }
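// A note on the integer case above: with REX.W set, the MOVD load used here becomes the
// 64-bit MOVQ form, which is why the 64-bit lane width only gets an enc64 entry.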
1832
1833 // SIMD insertlane
1834 for ty in ValueType::all_lane_types().filter(allowed_simd_type) {
1835 let (opcode, isap): (&[_], _) = match ty.lane_bits() {
1836 8 => (&PINSRB, Some(use_sse41_simd)),
1837 16 => (&PINSRW, None),
1838 32 | 64 => (&PINSR, Some(use_sse41_simd)),
1839 _ => panic!("invalid size for SIMD insertlane"),
1840 };
1841
1842 let instruction = x86_pinsr.bind(vector(ty, sse_vector_size));
1843 let template = rec_r_ib_unsigned_r.opcodes(opcode);
1844 if ty.lane_bits() < 64 {
1845 e.enc_both_inferred_maybe_isap(instruction, template, isap);
1846 } else {
// It turns out the 64-bit widths have REX/W encodings and are only available on
// x86_64.
1849 e.enc64_maybe_isap(instruction, template.rex().w(), isap);
1850 }
1851 }
1852
1853 // For legalizing insertlane with floats, INSERTPS from SSE4.1.
1854 {
1855 let instruction = x86_insertps.bind(vector(F32, sse_vector_size));
1856 let template = rec_fa_ib.opcodes(&INSERTPS);
1857 e.enc_both_inferred_maybe_isap(instruction, template, Some(use_sse41_simd));
1858 }
1859
1860 // For legalizing insertlane with floats, MOVSD from SSE2.
1861 {
1862 let instruction = x86_movsd.bind(vector(F64, sse_vector_size));
1863 let template = rec_fa.opcodes(&MOVSD_LOAD);
1864 e.enc_both_inferred(instruction, template); // from SSE2
1865 }
1866
1867 // For legalizing insertlane with floats, MOVLHPS from SSE.
1868 {
1869 let instruction = x86_movlhps.bind(vector(F64, sse_vector_size));
1870 let template = rec_fa.opcodes(&MOVLHPS);
1871 e.enc_both_inferred(instruction, template); // from SSE
1872 }
1873
1874 // SIMD extractlane
1875 for ty in ValueType::all_lane_types().filter(allowed_simd_type) {
1876 let opcode = match ty.lane_bits() {
1877 8 => &PEXTRB,
1878 16 => &PEXTRW,
1879 32 | 64 => &PEXTR,
1880 _ => panic!("invalid size for SIMD extractlane"),
1881 };
1882
1883 let instruction = x86_pextr.bind(vector(ty, sse_vector_size));
1884 let template = rec_r_ib_unsigned_gpr.opcodes(opcode);
1885 if ty.lane_bits() < 64 {
1886 e.enc_both_inferred_maybe_isap(instruction, template, Some(use_sse41_simd));
1887 } else {
// It turns out the 64-bit widths have REX/W encodings and are only available on
// x86_64.
1890 e.enc64_maybe_isap(instruction, template.rex().w(), Some(use_sse41_simd));
1891 }
1892 }
1893
1894 // SIMD packing/unpacking
1895 for ty in ValueType::all_lane_types().filter(allowed_simd_type) {
1896 let (high, low) = match ty.lane_bits() {
1897 8 => (&PUNPCKHBW, &PUNPCKLBW),
1898 16 => (&PUNPCKHWD, &PUNPCKLWD),
1899 32 => (&PUNPCKHDQ, &PUNPCKLDQ),
1900 64 => (&PUNPCKHQDQ, &PUNPCKLQDQ),
1901 _ => panic!("invalid size for SIMD packing/unpacking"),
1902 };
1903
1904 e.enc_both_inferred(
1905 x86_punpckh.bind(vector(ty, sse_vector_size)),
1906 rec_fa.opcodes(high),
1907 );
1908 e.enc_both_inferred(
1909 x86_punpckl.bind(vector(ty, sse_vector_size)),
1910 rec_fa.opcodes(low),
1911 );
1912 }
1913
1914 // SIMD narrow/widen
1915 for (ty, opcodes) in &[(I16, &PACKSSWB), (I32, &PACKSSDW)] {
1916 let snarrow = snarrow.bind(vector(*ty, sse_vector_size));
1917 e.enc_both_inferred(snarrow, rec_fa.opcodes(*opcodes));
1918 }
1919 for (ty, opcodes, isap) in &[
1920 (I16, &PACKUSWB[..], None),
1921 (I32, &PACKUSDW[..], Some(use_sse41_simd)),
1922 ] {
1923 let unarrow = unarrow.bind(vector(*ty, sse_vector_size));
1924 e.enc_both_inferred_maybe_isap(unarrow, rec_fa.opcodes(*opcodes), *isap);
1925 }
1926 for (ty, swiden_opcode, uwiden_opcode) in &[
1927 (I8, &PMOVSXBW[..], &PMOVZXBW[..]),
1928 (I16, &PMOVSXWD[..], &PMOVZXWD[..]),
1929 ] {
1930 let isap = Some(use_sse41_simd);
1931 let swiden_low = swiden_low.bind(vector(*ty, sse_vector_size));
1932 e.enc_both_inferred_maybe_isap(swiden_low, rec_furm.opcodes(*swiden_opcode), isap);
1933 let uwiden_low = uwiden_low.bind(vector(*ty, sse_vector_size));
1934 e.enc_both_inferred_maybe_isap(uwiden_low, rec_furm.opcodes(*uwiden_opcode), isap);
1935 }
1936 for ty in &[I8, I16, I32, I64] {
1937 e.enc_both_inferred_maybe_isap(
1938 x86_palignr.bind(vector(*ty, sse_vector_size)),
1939 rec_fa_ib.opcodes(&PALIGNR[..]),
1940 Some(use_ssse3_simd),
1941 );
1942 }
1943
1944 // SIMD bitcast all 128-bit vectors to each other (for legalizing splat.x16x8).
1945 for from_type in ValueType::all_lane_types().filter(allowed_simd_type) {
1946 for to_type in
1947 ValueType::all_lane_types().filter(|t| allowed_simd_type(t) && *t != from_type)
1948 {
1949 let instruction = raw_bitcast
1950 .bind(vector(to_type, sse_vector_size))
1951 .bind(vector(from_type, sse_vector_size));
1952 e.enc_32_64_rec(instruction, rec_null_fpr, 0);
1953 }
1954 }
1955
1956 // SIMD raw bitcast floats to vector (and back); assumes that floats are already stored in an
1957 // XMM register.
1958 for float_type in &[F32, F64] {
1959 for lane_type in ValueType::all_lane_types().filter(allowed_simd_type) {
1960 e.enc_32_64_rec(
1961 raw_bitcast
1962 .bind(vector(lane_type, sse_vector_size))
1963 .bind(*float_type),
1964 rec_null_fpr,
1965 0,
1966 );
1967 e.enc_32_64_rec(
1968 raw_bitcast
1969 .bind(*float_type)
1970 .bind(vector(lane_type, sse_vector_size)),
1971 rec_null_fpr,
1972 0,
1973 );
1974 }
1975 }
1976
1977 // SIMD conversions
1978 {
1979 let fcvt_from_sint_32 = fcvt_from_sint
1980 .bind(vector(F32, sse_vector_size))
1981 .bind(vector(I32, sse_vector_size));
1982 e.enc_both(fcvt_from_sint_32, rec_furm.opcodes(&CVTDQ2PS));
1983
1984 e.enc_32_64_maybe_isap(
1985 x86_vcvtudq2ps,
1986 rec_evex_reg_rm_128.opcodes(&VCVTUDQ2PS),
1987 Some(use_avx512vl_simd), // TODO need an OR predicate to join with AVX512F
1988 );
1989
1990 e.enc_both_inferred(
1991 x86_cvtt2si
1992 .bind(vector(I32, sse_vector_size))
1993 .bind(vector(F32, sse_vector_size)),
1994 rec_furm.opcodes(&CVTTPS2DQ),
1995 );
1996 }
1997
1998 // SIMD vconst for special cases (all zeroes, all ones)
1999 // this must be encoded prior to the MOVUPS implementation (below) so the compiler sees this
2000 // encoding first
2001 for ty in ValueType::all_lane_types().filter(allowed_simd_type) {
2002 let instruction = vconst.bind(vector(ty, sse_vector_size));
2003
2004 let is_zero_128bit =
2005 InstructionPredicate::new_is_all_zeroes(&*formats.unary_const, "constant_handle");
2006 let template = rec_vconst_optimized.opcodes(&PXOR).infer_rex();
2007 e.enc_32_64_func(instruction.clone(), template, |builder| {
2008 builder.inst_predicate(is_zero_128bit)
2009 });
2010
2011 let is_ones_128bit =
2012 InstructionPredicate::new_is_all_ones(&*formats.unary_const, "constant_handle");
2013 let template = rec_vconst_optimized.opcodes(&PCMPEQB).infer_rex();
2014 e.enc_32_64_func(instruction, template, |builder| {
2015 builder.inst_predicate(is_ones_128bit)
2016 });
2017 }
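// These two special cases rely on standard x86 idioms: `PXOR xmm, xmm` produces all zeroes
// and `PCMPEQB xmm, xmm` produces all ones, so neither constant needs a load from the
// constant pool.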
2018
2019 // SIMD vconst using MOVUPS
2020 // TODO it would be ideal if eventually this became the more efficient MOVAPS but we would have
2021 // to guarantee that the constants are aligned when emitted and there is currently no mechanism
2022 // for that; alternately, constants could be loaded into XMM registers using a sequence like:
2023 // MOVQ + MOVHPD + MOVQ + MOVLPD (this allows the constants to be immediates instead of stored
2024 // in memory) but some performance measurements are needed.
2025 for ty in ValueType::all_lane_types().filter(allowed_simd_type) {
2026 let instruction = vconst.bind(vector(ty, sse_vector_size));
2027 let template = rec_vconst.opcodes(&MOVUPS_LOAD);
2028 e.enc_both_inferred(instruction, template); // from SSE
2029 }
2030
2031 // SIMD register movement: store, load, spill, fill, regmove, etc. All of these use encodings of
2032 // MOVUPS and MOVAPS from SSE (TODO ideally all of these would either use MOVAPS when we have
2033 // alignment or type-specific encodings, see https://github.com/bytecodealliance/wasmtime/issues/1124).
2034 // Also, it would be ideal to infer REX prefixes for all of these instructions but for the
2035 // time being only instructions with common recipes have `infer_rex()` support.
2036 for ty in ValueType::all_lane_types().filter(allowed_simd_type) {
2037 // Store
2038 let bound_store = store.bind(vector(ty, sse_vector_size)).bind(Any);
2039 e.enc_both_inferred(bound_store.clone(), rec_fst.opcodes(&MOVUPS_STORE));
2040 e.enc_both_inferred(bound_store.clone(), rec_fstDisp8.opcodes(&MOVUPS_STORE));
2041 e.enc_both_inferred(bound_store, rec_fstDisp32.opcodes(&MOVUPS_STORE));
2042
2043 // Store complex
2044 let bound_store_complex = store_complex.bind(vector(ty, sse_vector_size));
2045 e.enc_both(
2046 bound_store_complex.clone(),
2047 rec_fstWithIndex.opcodes(&MOVUPS_STORE),
2048 );
2049 e.enc_both(
2050 bound_store_complex.clone(),
2051 rec_fstWithIndexDisp8.opcodes(&MOVUPS_STORE),
2052 );
2053 e.enc_both(
2054 bound_store_complex,
2055 rec_fstWithIndexDisp32.opcodes(&MOVUPS_STORE),
2056 );
2057
2058 // Load
2059 let bound_load = load.bind(vector(ty, sse_vector_size)).bind(Any);
2060 e.enc_both_inferred(bound_load.clone(), rec_fld.opcodes(&MOVUPS_LOAD));
2061 e.enc_both_inferred(bound_load.clone(), rec_fldDisp8.opcodes(&MOVUPS_LOAD));
2062 e.enc_both_inferred(bound_load, rec_fldDisp32.opcodes(&MOVUPS_LOAD));
2063
2064 // Load complex
2065 let bound_load_complex = load_complex.bind(vector(ty, sse_vector_size));
2066 e.enc_both(
2067 bound_load_complex.clone(),
2068 rec_fldWithIndex.opcodes(&MOVUPS_LOAD),
2069 );
2070 e.enc_both(
2071 bound_load_complex.clone(),
2072 rec_fldWithIndexDisp8.opcodes(&MOVUPS_LOAD),
2073 );
2074 e.enc_both(
2075 bound_load_complex,
2076 rec_fldWithIndexDisp32.opcodes(&MOVUPS_LOAD),
2077 );
2078
2079 // Spill
2080 let bound_spill = spill.bind(vector(ty, sse_vector_size));
2081 e.enc_both(bound_spill, rec_fspillSib32.opcodes(&MOVUPS_STORE));
2082 let bound_regspill = regspill.bind(vector(ty, sse_vector_size));
2083 e.enc_both(bound_regspill, rec_fregspill32.opcodes(&MOVUPS_STORE));
2084
2085 // Fill
2086 let bound_fill = fill.bind(vector(ty, sse_vector_size));
2087 e.enc_both(bound_fill, rec_ffillSib32.opcodes(&MOVUPS_LOAD));
2088 let bound_regfill = regfill.bind(vector(ty, sse_vector_size));
2089 e.enc_both(bound_regfill, rec_fregfill32.opcodes(&MOVUPS_LOAD));
2090 let bound_fill_nop = fill_nop.bind(vector(ty, sse_vector_size));
2091 e.enc_32_64_rec(bound_fill_nop, rec_ffillnull, 0);
2092
2093 // Regmove
2094 let bound_regmove = regmove.bind(vector(ty, sse_vector_size));
2095 e.enc_both(bound_regmove, rec_frmov.opcodes(&MOVAPS_LOAD));
2096
2097 // Copy
2098 let bound_copy = copy.bind(vector(ty, sse_vector_size));
2099 e.enc_both(bound_copy, rec_furm.opcodes(&MOVAPS_LOAD));
2100 let bound_copy_to_ssa = copy_to_ssa.bind(vector(ty, sse_vector_size));
2101 e.enc_both(bound_copy_to_ssa, rec_furm_reg_to_ssa.opcodes(&MOVAPS_LOAD));
2102 let bound_copy_nop = copy_nop.bind(vector(ty, sse_vector_size));
2103 e.enc_32_64_rec(bound_copy_nop, rec_stacknull, 0);
2104 }
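// The Disp8/Disp32 recipe variants used throughout this loop (and below) correspond to the
// ModR/M addressing forms with an 8-bit or 32-bit displacement; the plain variant is the
// no-displacement form. Listing all three presumably lets the appropriate one be chosen
// based on the offset of each particular load or store.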
2105
2106 // SIMD load extend
2107 for (inst, opcodes) in &[
2108 (uload8x8, &PMOVZXBW),
2109 (uload16x4, &PMOVZXWD),
2110 (uload32x2, &PMOVZXDQ),
2111 (sload8x8, &PMOVSXBW),
2112 (sload16x4, &PMOVSXWD),
2113 (sload32x2, &PMOVSXDQ),
2114 ] {
2115 let isap = Some(use_sse41_simd);
2116 for recipe in &[rec_fld, rec_fldDisp8, rec_fldDisp32] {
2117 let inst = *inst;
2118 let template = recipe.opcodes(*opcodes);
2119 e.enc_both_inferred_maybe_isap(inst.clone().bind(I32), template.clone(), isap);
2120 e.enc64_maybe_isap(inst.bind(I64), template.infer_rex(), isap);
2121 }
2122 }
2123
2124 // SIMD load extend (complex addressing)
2125 let is_load_complex_length_two =
2126 InstructionPredicate::new_length_equals(&*formats.load_complex, 2);
2127 for (inst, opcodes) in &[
2128 (uload8x8_complex, &PMOVZXBW),
2129 (uload16x4_complex, &PMOVZXWD),
2130 (uload32x2_complex, &PMOVZXDQ),
2131 (sload8x8_complex, &PMOVSXBW),
2132 (sload16x4_complex, &PMOVSXWD),
2133 (sload32x2_complex, &PMOVSXDQ),
2134 ] {
2135 for recipe in &[
2136 rec_fldWithIndex,
2137 rec_fldWithIndexDisp8,
2138 rec_fldWithIndexDisp32,
2139 ] {
2140 let template = recipe.opcodes(*opcodes);
2141 let predicate = |encoding: EncodingBuilder| {
2142 encoding
2143 .isa_predicate(use_sse41_simd)
2144 .inst_predicate(is_load_complex_length_two.clone())
2145 };
2146 e.enc32_func(inst.clone(), template.clone(), predicate);
2147 // No infer_rex calculator for these recipes; place REX version first as in enc_x86_64.
2148 e.enc64_func(inst.clone(), template.rex(), predicate);
2149 e.enc64_func(inst.clone(), template, predicate);
2150 }
2151 }
2152
2153 // SIMD integer addition
2154 for (ty, opcodes) in &[(I8, &PADDB), (I16, &PADDW), (I32, &PADDD), (I64, &PADDQ)] {
2155 let iadd = iadd.bind(vector(*ty, sse_vector_size));
2156 e.enc_both_inferred(iadd, rec_fa.opcodes(*opcodes));
2157 }
2158
2159 // SIMD integer saturating addition
2160 e.enc_both_inferred(
2161 sadd_sat.bind(vector(I8, sse_vector_size)),
2162 rec_fa.opcodes(&PADDSB),
2163 );
2164 e.enc_both_inferred(
2165 sadd_sat.bind(vector(I16, sse_vector_size)),
2166 rec_fa.opcodes(&PADDSW),
2167 );
2168 e.enc_both_inferred(
2169 uadd_sat.bind(vector(I8, sse_vector_size)),
2170 rec_fa.opcodes(&PADDUSB),
2171 );
2172 e.enc_both_inferred(
2173 uadd_sat.bind(vector(I16, sse_vector_size)),
2174 rec_fa.opcodes(&PADDUSW),
2175 );
2176
2177 // SIMD integer subtraction
2178 let isub = shared.by_name("isub");
2179 for (ty, opcodes) in &[(I8, &PSUBB), (I16, &PSUBW), (I32, &PSUBD), (I64, &PSUBQ)] {
2180 let isub = isub.bind(vector(*ty, sse_vector_size));
2181 e.enc_both_inferred(isub, rec_fa.opcodes(*opcodes));
2182 }
2183
2184 // SIMD integer saturating subtraction
2185 e.enc_both_inferred(
2186 ssub_sat.bind(vector(I8, sse_vector_size)),
2187 rec_fa.opcodes(&PSUBSB),
2188 );
2189 e.enc_both_inferred(
2190 ssub_sat.bind(vector(I16, sse_vector_size)),
2191 rec_fa.opcodes(&PSUBSW),
2192 );
2193 e.enc_both_inferred(
2194 usub_sat.bind(vector(I8, sse_vector_size)),
2195 rec_fa.opcodes(&PSUBUSB),
2196 );
2197 e.enc_both_inferred(
2198 usub_sat.bind(vector(I16, sse_vector_size)),
2199 rec_fa.opcodes(&PSUBUSW),
2200 );
2201
// SIMD integer multiplication: the x86 ISA does not have instructions for multiplying I8x16
// or I64x2, and these are (at the time of writing) not necessary for WASM SIMD.
2204 for (ty, opcodes, isap) in &[
2205 (I16, &PMULLW[..], None),
2206 (I32, &PMULLD[..], Some(use_sse41_simd)),
2207 ] {
2208 let imul = imul.bind(vector(*ty, sse_vector_size));
2209 e.enc_both_inferred_maybe_isap(imul, rec_fa.opcodes(opcodes), *isap);
2210 }
2211
2212 // SIMD multiplication with lane expansion.
2213 e.enc_both_inferred(x86_pmuludq, rec_fa.opcodes(&PMULUDQ));
2214
2215 // SIMD multiplication and add adjacent pairs, from SSE2.
2216 e.enc_both_inferred(widening_pairwise_dot_product_s, rec_fa.opcodes(&PMADDWD));
2217
// SIMD integer multiplication for I64x2 using AVX512.
2219 {
2220 e.enc_32_64_maybe_isap(
2221 x86_pmullq,
2222 rec_evex_reg_vvvv_rm_128.opcodes(&VPMULLQ).w(),
2223 Some(use_avx512dq_simd), // TODO need an OR predicate to join with AVX512VL
2224 );
2225 }
2226
2227 // SIMD integer average with rounding.
2228 for (ty, opcodes) in &[(I8, &PAVGB[..]), (I16, &PAVGW[..])] {
2229 let avgr = avg_round.bind(vector(*ty, sse_vector_size));
2230 e.enc_both_inferred(avgr, rec_fa.opcodes(opcodes));
2231 }
2232
2233 // SIMD integer absolute value.
2234 for (ty, opcodes) in &[(I8, &PABSB[..]), (I16, &PABSW[..]), (I32, &PABSD)] {
2235 let iabs = iabs.bind(vector(*ty, sse_vector_size));
2236 e.enc_both_inferred_maybe_isap(iabs, rec_furm.opcodes(opcodes), Some(use_ssse3_simd));
2237 }
2238
2239 // SIMD logical operations
2240 let band = shared.by_name("band");
2241 let band_not = shared.by_name("band_not");
2242 for ty in ValueType::all_lane_types().filter(allowed_simd_type) {
2243 // and
2244 let band = band.bind(vector(ty, sse_vector_size));
2245 e.enc_both_inferred(band, rec_fa.opcodes(&PAND));
2246
2247 // and not (note flipped recipe operands to match band_not order)
2248 let band_not = band_not.bind(vector(ty, sse_vector_size));
2249 e.enc_both_inferred(band_not, rec_fax.opcodes(&PANDN));
2250
2251 // or
2252 let bor = bor.bind(vector(ty, sse_vector_size));
2253 e.enc_both_inferred(bor, rec_fa.opcodes(&POR));
2254
2255 // xor
2256 let bxor = bxor.bind(vector(ty, sse_vector_size));
2257 e.enc_both_inferred(bxor, rec_fa.opcodes(&PXOR));
2258
2259 // ptest
2260 let x86_ptest = x86_ptest.bind(vector(ty, sse_vector_size));
2261 e.enc_both_inferred_maybe_isap(x86_ptest, rec_fcmp.opcodes(&PTEST), Some(use_sse41_simd));
2262 }
2263
2264 // SIMD bitcast from I32/I64 to the low bits of a vector (e.g. I64x2); this register movement
2265 // allows SIMD shifts to be legalized more easily. TODO ideally this would be typed as an
2266 // I128x1 but restrictions on the type builder prevent this; the general idea here is that
2267 // the upper bits are all zeroed and do not form parts of any separate lane. See
2268 // https://github.com/bytecodealliance/wasmtime/issues/1140.
2269 e.enc_both_inferred(
2270 bitcast.bind(vector(I64, sse_vector_size)).bind(I32),
2271 rec_frurm.opcodes(&MOVD_LOAD_XMM),
2272 );
2273 e.enc64(
2274 bitcast.bind(vector(I64, sse_vector_size)).bind(I64),
2275 rec_frurm.opcodes(&MOVD_LOAD_XMM).rex().w(),
2276 );
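// As with scalar_to_vector above, this relies on MOVD/MOVQ writing the scalar to the low
// lanes of the XMM register and zeroing the rest, which is what makes the bitcast to the
// wider vector type legal.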
2277
2278 // SIMD shift left
2279 for (ty, opcodes) in &[(I16, &PSLLW), (I32, &PSLLD), (I64, &PSLLQ)] {
2280 let x86_psll = x86_psll.bind(vector(*ty, sse_vector_size));
2281 e.enc_both_inferred(x86_psll, rec_fa.opcodes(*opcodes));
2282 }
2283
2284 // SIMD shift right (logical)
2285 for (ty, opcodes) in &[(I16, &PSRLW), (I32, &PSRLD), (I64, &PSRLQ)] {
2286 let x86_psrl = x86_psrl.bind(vector(*ty, sse_vector_size));
2287 e.enc_both_inferred(x86_psrl, rec_fa.opcodes(*opcodes));
2288 }
2289
2290 // SIMD shift right (arithmetic)
2291 for (ty, opcodes) in &[(I16, &PSRAW), (I32, &PSRAD)] {
2292 let x86_psra = x86_psra.bind(vector(*ty, sse_vector_size));
2293 e.enc_both_inferred(x86_psra, rec_fa.opcodes(*opcodes));
2294 }
2295
2296 // SIMD immediate shift
2297 for (ty, opcodes) in &[(I16, &PS_W_IMM), (I32, &PS_D_IMM), (I64, &PS_Q_IMM)] {
2298 let ishl_imm = ishl_imm.bind(vector(*ty, sse_vector_size));
2299 e.enc_both_inferred(ishl_imm, rec_f_ib.opcodes(*opcodes).rrr(6));
2300
2301 let ushr_imm = ushr_imm.bind(vector(*ty, sse_vector_size));
2302 e.enc_both_inferred(ushr_imm, rec_f_ib.opcodes(*opcodes).rrr(2));
2303
// One exception: PSRAQ does not exist for 64x2 in SSE2; it requires a higher CPU feature set.
2305 if *ty != I64 {
2306 let sshr_imm = sshr_imm.bind(vector(*ty, sse_vector_size));
2307 e.enc_both_inferred(sshr_imm, rec_f_ib.opcodes(*opcodes).rrr(4));
2308 }
2309 }
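// The immediate vector shifts share one base opcode per lane width (the 0x71/0x72/0x73
// group); the `.rrr()` extension selects the operation: /6 is the logical left shift,
// /2 the logical right shift, and /4 the arithmetic right shift.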
2310
2311 // SIMD integer comparisons
2312 {
2313 use IntCC::*;
2314 for (ty, cc, opcodes, isa_predicate) in &[
2315 (I8, Equal, &PCMPEQB[..], None),
2316 (I16, Equal, &PCMPEQW[..], None),
2317 (I32, Equal, &PCMPEQD[..], None),
2318 (I64, Equal, &PCMPEQQ[..], Some(use_sse41_simd)),
2319 (I8, SignedGreaterThan, &PCMPGTB[..], None),
2320 (I16, SignedGreaterThan, &PCMPGTW[..], None),
2321 (I32, SignedGreaterThan, &PCMPGTD[..], None),
2322 (I64, SignedGreaterThan, &PCMPGTQ, Some(use_sse42_simd)),
2323 ] {
2324 let instruction = icmp
2325 .bind(Immediate::IntCC(*cc))
2326 .bind(vector(*ty, sse_vector_size));
2327 let template = rec_icscc_fpr.opcodes(opcodes);
2328 e.enc_both_inferred_maybe_isap(instruction, template, *isa_predicate);
2329 }
2330 }
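// Only equality and signed greater-than are encoded directly because those are the only
// comparisons SSE provides (PCMPEQ* and PCMPGT*); the remaining IntCC conditions are
// presumably produced by legalization, e.g. by swapping operands or negating the result.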
2331
2332 // SIMD min/max
2333 for (ty, inst, opcodes, isa_predicate) in &[
2334 (I8, x86_pmaxs, &PMAXSB[..], Some(use_sse41_simd)),
2335 (I16, x86_pmaxs, &PMAXSW[..], None),
2336 (I32, x86_pmaxs, &PMAXSD[..], Some(use_sse41_simd)),
2337 (I8, x86_pmaxu, &PMAXUB[..], None),
2338 (I16, x86_pmaxu, &PMAXUW[..], Some(use_sse41_simd)),
2339 (I32, x86_pmaxu, &PMAXUD[..], Some(use_sse41_simd)),
2340 (I8, x86_pmins, &PMINSB[..], Some(use_sse41_simd)),
2341 (I16, x86_pmins, &PMINSW[..], None),
2342 (I32, x86_pmins, &PMINSD[..], Some(use_sse41_simd)),
2343 (I8, x86_pminu, &PMINUB[..], None),
2344 (I16, x86_pminu, &PMINUW[..], Some(use_sse41_simd)),
2345 (I32, x86_pminu, &PMINUD[..], Some(use_sse41_simd)),
2346 ] {
2347 let inst = inst.bind(vector(*ty, sse_vector_size));
2348 e.enc_both_inferred_maybe_isap(inst, rec_fa.opcodes(opcodes), *isa_predicate);
2349 }
2350
2351 // SIMD float comparisons
2352 e.enc_both_inferred(
2353 fcmp.bind(vector(F32, sse_vector_size)),
2354 rec_pfcmp.opcodes(&CMPPS),
2355 );
2356 e.enc_both_inferred(
2357 fcmp.bind(vector(F64, sse_vector_size)),
2358 rec_pfcmp.opcodes(&CMPPD),
2359 );
2360
2361 // SIMD float arithmetic
2362 for (ty, inst, opcodes) in &[
2363 (F32, fadd, &ADDPS[..]),
2364 (F64, fadd, &ADDPD[..]),
2365 (F32, fsub, &SUBPS[..]),
2366 (F64, fsub, &SUBPD[..]),
2367 (F32, fmul, &MULPS[..]),
2368 (F64, fmul, &MULPD[..]),
2369 (F32, fdiv, &DIVPS[..]),
2370 (F64, fdiv, &DIVPD[..]),
2371 (F32, x86_fmin, &MINPS[..]),
2372 (F64, x86_fmin, &MINPD[..]),
2373 (F32, x86_fmax, &MAXPS[..]),
2374 (F64, x86_fmax, &MAXPD[..]),
2375 ] {
2376 let inst = inst.bind(vector(*ty, sse_vector_size));
2377 e.enc_both_inferred(inst, rec_fa.opcodes(opcodes));
2378 }
2379 for (ty, inst, opcodes) in &[(F32, sqrt, &SQRTPS[..]), (F64, sqrt, &SQRTPD[..])] {
2380 let inst = inst.bind(vector(*ty, sse_vector_size));
2381 e.enc_both_inferred(inst, rec_furm.opcodes(opcodes));
2382 }
2383 }
2384
2385 #[inline(never)]
fn define_entity_ref(
2387 e: &mut PerCpuModeEncodings,
2388 shared_defs: &SharedDefinitions,
2389 settings: &SettingGroup,
2390 r: &RecipeGroup,
2391 ) {
2392 let shared = &shared_defs.instructions;
2393 let formats = &shared_defs.formats;
2394
2395 // Shorthands for instructions.
2396 let const_addr = shared.by_name("const_addr");
2397 let func_addr = shared.by_name("func_addr");
2398 let stack_addr = shared.by_name("stack_addr");
2399 let symbol_value = shared.by_name("symbol_value");
2400
2401 // Shorthands for recipes.
2402 let rec_allones_fnaddr4 = r.template("allones_fnaddr4");
2403 let rec_allones_fnaddr8 = r.template("allones_fnaddr8");
2404 let rec_fnaddr4 = r.template("fnaddr4");
2405 let rec_fnaddr8 = r.template("fnaddr8");
2406 let rec_const_addr = r.template("const_addr");
2407 let rec_got_fnaddr8 = r.template("got_fnaddr8");
2408 let rec_got_gvaddr8 = r.template("got_gvaddr8");
2409 let rec_gvaddr4 = r.template("gvaddr4");
2410 let rec_gvaddr8 = r.template("gvaddr8");
2411 let rec_pcrel_fnaddr8 = r.template("pcrel_fnaddr8");
2412 let rec_pcrel_gvaddr8 = r.template("pcrel_gvaddr8");
2413 let rec_spaddr_id = r.template("spaddr_id");
2414
2415 // Predicates shorthands.
2416 let all_ones_funcaddrs_and_not_is_pic =
2417 settings.predicate_by_name("all_ones_funcaddrs_and_not_is_pic");
2418 let is_pic = settings.predicate_by_name("is_pic");
2419 let not_all_ones_funcaddrs_and_not_is_pic =
2420 settings.predicate_by_name("not_all_ones_funcaddrs_and_not_is_pic");
2421 let not_is_pic = settings.predicate_by_name("not_is_pic");
2422
2423 // Function addresses.
2424
// Non-PIC, funcaddresses that are not all-ones.
2426 e.enc32_isap(
2427 func_addr.bind(I32),
2428 rec_fnaddr4.opcodes(&MOV_IMM),
2429 not_all_ones_funcaddrs_and_not_is_pic,
2430 );
2431 e.enc64_isap(
2432 func_addr.bind(I64),
2433 rec_fnaddr8.opcodes(&MOV_IMM).rex().w(),
2434 not_all_ones_funcaddrs_and_not_is_pic,
2435 );
2436
// Non-PIC, all-ones funcaddresses.
2438 e.enc32_isap(
2439 func_addr.bind(I32),
2440 rec_allones_fnaddr4.opcodes(&MOV_IMM),
2441 all_ones_funcaddrs_and_not_is_pic,
2442 );
2443 e.enc64_isap(
2444 func_addr.bind(I64),
2445 rec_allones_fnaddr8.opcodes(&MOV_IMM).rex().w(),
2446 all_ones_funcaddrs_and_not_is_pic,
2447 );
2448
2449 // 64-bit, colocated, both PIC and non-PIC. Use the lea instruction's pc-relative field.
2450 let is_colocated_func =
2451 InstructionPredicate::new_is_colocated_func(&*formats.func_addr, "func_ref");
2452 e.enc64_instp(
2453 func_addr.bind(I64),
2454 rec_pcrel_fnaddr8.opcodes(&LEA).rex().w(),
2455 is_colocated_func,
2456 );
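// The colocated case can use LEA with a 32-bit RIP-relative displacement because a colocated
// function is assumed to end up within +/-2 GiB of the use site; the non-colocated PIC case
// below loads the address from the GOT instead.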
2457
2458 // 64-bit, non-colocated, PIC.
2459 e.enc64_isap(
2460 func_addr.bind(I64),
2461 rec_got_fnaddr8.opcodes(&MOV_LOAD).rex().w(),
2462 is_pic,
2463 );
2464
2465 // Global addresses.
2466
2467 // Non-PIC.
2468 e.enc32_isap(
2469 symbol_value.bind(I32),
2470 rec_gvaddr4.opcodes(&MOV_IMM),
2471 not_is_pic,
2472 );
2473 e.enc64_isap(
2474 symbol_value.bind(I64),
2475 rec_gvaddr8.opcodes(&MOV_IMM).rex().w(),
2476 not_is_pic,
2477 );
2478
2479 // PIC, colocated.
2480 e.enc64_func(
2481 symbol_value.bind(I64),
2482 rec_pcrel_gvaddr8.opcodes(&LEA).rex().w(),
2483 |encoding| {
2484 encoding
2485 .isa_predicate(is_pic)
2486 .inst_predicate(InstructionPredicate::new_is_colocated_data(formats))
2487 },
2488 );
2489
2490 // PIC, non-colocated.
2491 e.enc64_isap(
2492 symbol_value.bind(I64),
2493 rec_got_gvaddr8.opcodes(&MOV_LOAD).rex().w(),
2494 is_pic,
2495 );
2496
2497 // Stack addresses.
2498 //
2499 // TODO: Add encoding rules for stack_load and stack_store, so that they
2500 // don't get legalized to stack_addr + load/store.
2501 e.enc64(stack_addr.bind(I64), rec_spaddr_id.opcodes(&LEA).rex().w());
2502 e.enc32(stack_addr.bind(I32), rec_spaddr_id.opcodes(&LEA));
2503
2504 // Constant addresses (PIC).
2505 e.enc64(const_addr.bind(I64), rec_const_addr.opcodes(&LEA).rex().w());
2506 e.enc32(const_addr.bind(I32), rec_const_addr.opcodes(&LEA));
2507 }
2508
2509 /// Control flow opcodes.
2510 #[inline(never)]
fn define_control_flow(
2512 e: &mut PerCpuModeEncodings,
2513 shared_defs: &SharedDefinitions,
2514 settings: &SettingGroup,
2515 r: &RecipeGroup,
2516 ) {
2517 let shared = &shared_defs.instructions;
2518 let formats = &shared_defs.formats;
2519
2520 // Shorthands for instructions.
2521 let brff = shared.by_name("brff");
2522 let brif = shared.by_name("brif");
2523 let brnz = shared.by_name("brnz");
2524 let brz = shared.by_name("brz");
2525 let call = shared.by_name("call");
2526 let call_indirect = shared.by_name("call_indirect");
2527 let debugtrap = shared.by_name("debugtrap");
2528 let indirect_jump_table_br = shared.by_name("indirect_jump_table_br");
2529 let jump = shared.by_name("jump");
2530 let jump_table_base = shared.by_name("jump_table_base");
2531 let jump_table_entry = shared.by_name("jump_table_entry");
2532 let return_ = shared.by_name("return");
2533 let trap = shared.by_name("trap");
2534 let trapff = shared.by_name("trapff");
2535 let trapif = shared.by_name("trapif");
2536 let resumable_trap = shared.by_name("resumable_trap");
2537
2538 // Shorthands for recipes.
2539 let rec_brfb = r.template("brfb");
2540 let rec_brfd = r.template("brfd");
2541 let rec_brib = r.template("brib");
2542 let rec_brid = r.template("brid");
2543 let rec_call_id = r.template("call_id");
2544 let rec_call_plt_id = r.template("call_plt_id");
2545 let rec_call_r = r.template("call_r");
2546 let rec_debugtrap = r.recipe("debugtrap");
2547 let rec_indirect_jmp = r.template("indirect_jmp");
2548 let rec_jmpb = r.template("jmpb");
2549 let rec_jmpd = r.template("jmpd");
2550 let rec_jt_base = r.template("jt_base");
2551 let rec_jt_entry = r.template("jt_entry");
2552 let rec_ret = r.template("ret");
2553 let rec_t8jccb_abcd = r.template("t8jccb_abcd");
2554 let rec_t8jccd_abcd = r.template("t8jccd_abcd");
2555 let rec_t8jccd_long = r.template("t8jccd_long");
2556 let rec_tjccb = r.template("tjccb");
2557 let rec_tjccd = r.template("tjccd");
2558 let rec_trap = r.template("trap");
2559 let rec_trapif = r.recipe("trapif");
2560 let rec_trapff = r.recipe("trapff");
2561
2562 // Predicates shorthands.
2563 let is_pic = settings.predicate_by_name("is_pic");
2564
2565 // Call/return
2566
2567 // 32-bit, both PIC and non-PIC.
2568 e.enc32(call, rec_call_id.opcodes(&CALL_RELATIVE));
2569
2570 // 64-bit, colocated, both PIC and non-PIC. Use the call instruction's pc-relative field.
2571 let is_colocated_func = InstructionPredicate::new_is_colocated_func(&*formats.call, "func_ref");
2572 e.enc64_instp(call, rec_call_id.opcodes(&CALL_RELATIVE), is_colocated_func);
2573
2574 // 64-bit, non-colocated, PIC. There is no 64-bit non-colocated non-PIC version, since non-PIC
2575 // is currently using the large model, which requires calls be lowered to
2576 // func_addr+call_indirect.
2577 e.enc64_isap(call, rec_call_plt_id.opcodes(&CALL_RELATIVE), is_pic);
2578
2579 e.enc32(
2580 call_indirect.bind(I32),
2581 rec_call_r.opcodes(&JUMP_ABSOLUTE).rrr(2),
2582 );
2583 e.enc64(
2584 call_indirect.bind(I64),
2585 rec_call_r.opcodes(&JUMP_ABSOLUTE).rrr(2).rex(),
2586 );
2587 e.enc64(
2588 call_indirect.bind(I64),
2589 rec_call_r.opcodes(&JUMP_ABSOLUTE).rrr(2),
2590 );
2591
2592 e.enc32(return_, rec_ret.opcodes(&RET_NEAR));
2593 e.enc64(return_, rec_ret.opcodes(&RET_NEAR));
2594
2595 // Branches.
2596 e.enc32(jump, rec_jmpb.opcodes(&JUMP_SHORT));
2597 e.enc64(jump, rec_jmpb.opcodes(&JUMP_SHORT));
2598 e.enc32(jump, rec_jmpd.opcodes(&JUMP_NEAR_RELATIVE));
2599 e.enc64(jump, rec_jmpd.opcodes(&JUMP_NEAR_RELATIVE));
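// Both branch encodings are listed for each mode: jmpb carries an 8-bit displacement and
// jmpd a 32-bit one. Which of the two is ultimately emitted presumably depends on whether
// the target ends up within the short branch's range.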
2600
2601 e.enc_both(brif, rec_brib.opcodes(&JUMP_SHORT_IF_OVERFLOW));
2602 e.enc_both(brif, rec_brid.opcodes(&JUMP_NEAR_IF_OVERFLOW));
2603
2604 // Not all float condition codes are legal, see `supported_floatccs`.
2605 e.enc_both(brff, rec_brfb.opcodes(&JUMP_SHORT_IF_OVERFLOW));
2606 e.enc_both(brff, rec_brfd.opcodes(&JUMP_NEAR_IF_OVERFLOW));
2607
2608 // Note that the tjccd opcode will be prefixed with 0x0f.
2609 e.enc_i32_i64_explicit_rex(brz, rec_tjccb.opcodes(&JUMP_SHORT_IF_EQUAL));
2610 e.enc_i32_i64_explicit_rex(brz, rec_tjccd.opcodes(&TEST_BYTE_REG));
2611 e.enc_i32_i64_explicit_rex(brnz, rec_tjccb.opcodes(&JUMP_SHORT_IF_NOT_EQUAL));
2612 e.enc_i32_i64_explicit_rex(brnz, rec_tjccd.opcodes(&TEST_REG));
2613
2614 // Branch on a b1 value in a register only looks at the low 8 bits. See also
2615 // bint encodings below.
2616 //
2617 // Start with the worst-case encoding for X86_32 only. The register allocator
2618 // can't handle a branch with an ABCD-constrained operand.
2619 e.enc32(brz.bind(B1), rec_t8jccd_long.opcodes(&TEST_BYTE_REG));
2620 e.enc32(brnz.bind(B1), rec_t8jccd_long.opcodes(&TEST_REG));
2621
2622 e.enc_both(brz.bind(B1), rec_t8jccb_abcd.opcodes(&JUMP_SHORT_IF_EQUAL));
2623 e.enc_both(brz.bind(B1), rec_t8jccd_abcd.opcodes(&TEST_BYTE_REG));
2624 e.enc_both(
2625 brnz.bind(B1),
2626 rec_t8jccb_abcd.opcodes(&JUMP_SHORT_IF_NOT_EQUAL),
2627 );
2628 e.enc_both(brnz.bind(B1), rec_t8jccd_abcd.opcodes(&TEST_REG));
2629
2630 // Jump tables.
2631 e.enc64(
2632 jump_table_entry.bind(I64),
2633 rec_jt_entry.opcodes(&MOVSXD).rex().w(),
2634 );
2635 e.enc32(jump_table_entry.bind(I32), rec_jt_entry.opcodes(&MOV_LOAD));
2636
2637 e.enc64(
2638 jump_table_base.bind(I64),
2639 rec_jt_base.opcodes(&LEA).rex().w(),
2640 );
2641 e.enc32(jump_table_base.bind(I32), rec_jt_base.opcodes(&LEA));
2642
2643 e.enc_x86_64(
2644 indirect_jump_table_br.bind(I64),
2645 rec_indirect_jmp.opcodes(&JUMP_ABSOLUTE).rrr(4),
2646 );
2647 e.enc32(
2648 indirect_jump_table_br.bind(I32),
2649 rec_indirect_jmp.opcodes(&JUMP_ABSOLUTE).rrr(4),
2650 );
2651
2652 // Trap as ud2
2653 e.enc32(trap, rec_trap.opcodes(&UNDEFINED2));
2654 e.enc64(trap, rec_trap.opcodes(&UNDEFINED2));
2655 e.enc32(resumable_trap, rec_trap.opcodes(&UNDEFINED2));
2656 e.enc64(resumable_trap, rec_trap.opcodes(&UNDEFINED2));
2657
2658 // Debug trap as int3
2659 e.enc32_rec(debugtrap, rec_debugtrap, 0);
2660 e.enc64_rec(debugtrap, rec_debugtrap, 0);
2661
2662 e.enc32_rec(trapif, rec_trapif, 0);
2663 e.enc64_rec(trapif, rec_trapif, 0);
2664 e.enc32_rec(trapff, rec_trapff, 0);
2665 e.enc64_rec(trapff, rec_trapff, 0);
2666 }
2667
2668 /// Reference type instructions.
2669 #[inline(never)]
fn define_reftypes(e: &mut PerCpuModeEncodings, shared_defs: &SharedDefinitions, r: &RecipeGroup) {
2671 let shared = &shared_defs.instructions;
2672
2673 let is_null = shared.by_name("is_null");
2674 let is_invalid = shared.by_name("is_invalid");
2675 let null = shared.by_name("null");
2676 let safepoint = shared.by_name("safepoint");
2677
2678 let rec_is_zero = r.template("is_zero");
2679 let rec_is_invalid = r.template("is_invalid");
2680 let rec_pu_id_ref = r.template("pu_id_ref");
2681 let rec_safepoint = r.recipe("safepoint");
2682
2683 // Null references implemented as iconst 0.
2684 e.enc32(null.bind(R32), rec_pu_id_ref.opcodes(&MOV_IMM));
2685
2686 e.enc64(null.bind(R64), rec_pu_id_ref.rex().opcodes(&MOV_IMM));
2687 e.enc64(null.bind(R64), rec_pu_id_ref.opcodes(&MOV_IMM));
2688
2689 // is_null, implemented by testing whether the value is 0.
2690 e.enc_r32_r64_rex_only(is_null, rec_is_zero.opcodes(&TEST_REG));
2691
2692 // is_invalid, implemented by testing whether the value is -1.
2693 e.enc_r32_r64_rex_only(is_invalid, rec_is_invalid.opcodes(&CMP_IMM8).rrr(7));
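// The -1 comparison can use the sign-extended 8-bit immediate form of CMP (the /7 group
// encoding), since 0xFF sign-extends to all ones at any width; the recipe is then expected
// to materialize the boolean result from the flags.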
2694
2695 // safepoint instruction calls sink, no actual encoding.
2696 e.enc32_rec(safepoint, rec_safepoint, 0);
2697 e.enc64_rec(safepoint, rec_safepoint, 0);
2698 }
2699
2700 #[allow(clippy::cognitive_complexity)]
pub(crate) fn define(
2702 shared_defs: &SharedDefinitions,
2703 settings: &SettingGroup,
2704 x86: &InstructionGroup,
2705 r: &RecipeGroup,
2706 ) -> PerCpuModeEncodings {
2707 // Definitions.
2708 let mut e = PerCpuModeEncodings::new();
2709
2710 define_moves(&mut e, shared_defs, r);
2711 define_memory(&mut e, shared_defs, x86, r);
2712 define_fpu_moves(&mut e, shared_defs, r);
2713 define_fpu_memory(&mut e, shared_defs, r);
2714 define_fpu_ops(&mut e, shared_defs, settings, x86, r);
2715 define_alu(&mut e, shared_defs, settings, x86, r);
2716 define_simd(&mut e, shared_defs, settings, x86, r);
2717 define_entity_ref(&mut e, shared_defs, settings, r);
2718 define_control_flow(&mut e, shared_defs, settings, r);
2719 define_reftypes(&mut e, shared_defs, r);
2720
2721 let x86_elf_tls_get_addr = x86.by_name("x86_elf_tls_get_addr");
2722 let x86_macho_tls_get_addr = x86.by_name("x86_macho_tls_get_addr");
2723
2724 let rec_elf_tls_get_addr = r.recipe("elf_tls_get_addr");
2725 let rec_macho_tls_get_addr = r.recipe("macho_tls_get_addr");
2726
2727 e.enc64_rec(x86_elf_tls_get_addr, rec_elf_tls_get_addr, 0);
2728 e.enc64_rec(x86_macho_tls_get_addr, rec_macho_tls_get_addr, 0);
2729
2730 e
2731 }
2732