1 /////////////////////////////////////////////////////////////////////////
2 // $Id: avx512.cc 13853 2020-05-19 16:01:23Z sshwarts $
3 /////////////////////////////////////////////////////////////////////////
4 //
5 // Copyright (c) 2013-2019 Stanislav Shwartsman
6 // Written by Stanislav Shwartsman [sshwarts at sourceforge net]
7 //
8 // This library is free software; you can redistribute it and/or
9 // modify it under the terms of the GNU Lesser General Public
10 // License as published by the Free Software Foundation; either
11 // version 2 of the License, or (at your option) any later version.
12 //
13 // This library is distributed in the hope that it will be useful,
14 // but WITHOUT ANY WARRANTY; without even the implied warranty of
15 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16 // Lesser General Public License for more details.
17 //
18 // You should have received a copy of the GNU Lesser General Public
19 // License along with this library; if not, write to the Free Software
20 // Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
21 //
22 /////////////////////////////////////////////////////////////////////////
23
24 #define NEED_CPU_REG_SHORTCUTS 1
25 #include "bochs.h"
26 #include "cpu.h"
27 #define LOG_THIS BX_CPU_THIS_PTR
28
29 #if BX_SUPPORT_EVEX
30
31 #include "simd_int.h"
32 #include "simd_compare.h"
33 #include "wide_int.h"
34
35 // compare
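// VPCMP{B,W,D,Q} and the unsigned VPCMPU{B,W,D,Q} take the comparison
// predicate from bits 2:0 of the immediate byte. The tables below map that
// 3-bit predicate to a per-128-bit-lane compare helper returning one result
// bit per element; signed and unsigned element comparisons use separate
// tables. E.g. VPCMPD k1{k2}, zmm2, zmm3, 1 ("less than") dispatches to
// xmm_pcmpltd_mask and sets bit n of k1 to (zmm2.dword[n] < zmm3.dword[n]),
// ANDed with the corresponding bit of the write mask k2.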
36
37 typedef Bit32u (*avx512_compare_method)(const BxPackedXmmRegister *op1, const BxPackedXmmRegister *op2);
38
39 static avx512_compare_method avx512_compare8[8] = {
40 xmm_pcmpeqb_mask, // equal
41 xmm_pcmpltb_mask, // less than
42 xmm_pcmpleb_mask, // less than or equal
43 xmm_pcmpfalse_mask, // false
44 xmm_pcmpneb_mask, // not equal
45 xmm_pcmpgeb_mask, // not less than => greater than or equal
46 xmm_pcmpgtb_mask, // not less than or equal => greater than
47 xmm_pcmptrueb_mask // true
48 };
49
50 static avx512_compare_method avx512_compare16[8] = {
51 xmm_pcmpeqw_mask, // equal
52 xmm_pcmpltw_mask, // less than
53 xmm_pcmplew_mask, // less than or equal
54 xmm_pcmpfalse_mask, // false
55 xmm_pcmpnew_mask, // not equal
56 xmm_pcmpgew_mask, // not less than => greater than or equal
57 xmm_pcmpgtw_mask, // not less than or equal => greater than
58 xmm_pcmptruew_mask // true
59 };
60
61 static avx512_compare_method avx512_compare32[8] = {
62 xmm_pcmpeqd_mask, // equal
63 xmm_pcmpltd_mask, // less than
64 xmm_pcmpled_mask, // less than or equal
65 xmm_pcmpfalse_mask, // false
66 xmm_pcmpned_mask, // not equal
67 xmm_pcmpged_mask, // not less than => greater than or equal
68 xmm_pcmpgtd_mask, // not less than or equal => greater than
69 xmm_pcmptrued_mask // true
70 };
71
72 static avx512_compare_method avx512_compare64[8] = {
73 xmm_pcmpeqq_mask, // equal
74 xmm_pcmpltq_mask, // less than
75 xmm_pcmpleq_mask, // less than or equal
76 xmm_pcmpfalse_mask, // false
77 xmm_pcmpneq_mask, // not equal
78 xmm_pcmpgeq_mask, // not less than => greater than or equal
79 xmm_pcmpgtq_mask, // not less than or equal => greater than
80 xmm_pcmptrueq_mask // true
81 };
82
83 static avx512_compare_method avx512_compare8u[8] = {
84 xmm_pcmpeqb_mask, // equal
85 xmm_pcmpltub_mask, // less than
86 xmm_pcmpleub_mask, // less than or equal
87 xmm_pcmpfalse_mask, // false
88 xmm_pcmpneb_mask, // not equal
89 xmm_pcmpgeub_mask, // not less than => greater than or equal
90 xmm_pcmpgtub_mask, // not less than or equal => greater than
91 xmm_pcmptrueb_mask // true
92 };
93
94 static avx512_compare_method avx512_compare16u[8] = {
95 xmm_pcmpeqw_mask, // equal
96 xmm_pcmpltuw_mask, // less than
97 xmm_pcmpleuw_mask, // less than or equal
98 xmm_pcmpfalse_mask, // false
99 xmm_pcmpnew_mask, // not equal
100 xmm_pcmpgeuw_mask, // not less than => greater than or equal
101 xmm_pcmpgtuw_mask, // not less than or equal => greater than
102 xmm_pcmptruew_mask // true
103 };
104
105 static avx512_compare_method avx512_compare32u[8] = {
106 xmm_pcmpeqd_mask, // equal
107 xmm_pcmpltud_mask, // less than
108 xmm_pcmpleud_mask, // less than or equal
109 xmm_pcmpfalse_mask, // false
110 xmm_pcmpned_mask, // not equal
111 xmm_pcmpgeud_mask, // not less than => greater than or equal
112 xmm_pcmpgtud_mask, // not less than or equal => greater than
113 xmm_pcmptrued_mask // true
114 };
115
116 static avx512_compare_method avx512_compare64u[8] = {
117 xmm_pcmpeqq_mask, // equal
118 xmm_pcmpltuq_mask, // less than
119 xmm_pcmpleuq_mask, // less than or equal
120 xmm_pcmpfalse_mask, // false
121 xmm_pcmpneq_mask, // not equal
122 xmm_pcmpgeuq_mask, // not less than => greater than or equal
123 xmm_pcmpgtuq_mask, // not less than or equal => greater than
124 xmm_pcmptrueq_mask // true
125 };
126
127 void BX_CPP_AttrRegparmN(1) BX_CPU_C::VPCMPB_MASK_KGqHdqWdqIbR(bxInstruction_c *i)
128 {
129 BxPackedAvxRegister op1 = BX_READ_AVX_REG(i->src1()), op2 = BX_READ_AVX_REG(i->src2());
130 unsigned len = i->getVL();
131 unsigned ib = i->Ib() & 7;
132
133 Bit64u result = 0;
134 for (int n=len-1; n >= 0; n--) {
135 result <<= 16;
136 result |= avx512_compare8[ib](&op1.vmm128(n), &op2.vmm128(n));
137 }
138
139 if (i->opmask())
140 result &= BX_READ_OPMASK(i->opmask());
141
142 BX_WRITE_OPMASK(i->dst(), result);
143 BX_NEXT_INSTR(i);
144 }
145
146 void BX_CPP_AttrRegparmN(1) BX_CPU_C::VPCMPUB_MASK_KGqHdqWdqIbR(bxInstruction_c *i)
147 {
148 BxPackedAvxRegister op1 = BX_READ_AVX_REG(i->src1()), op2 = BX_READ_AVX_REG(i->src2());
149 unsigned len = i->getVL();
150 unsigned ib = i->Ib() & 7;
151
152 Bit64u result = 0;
153 for (int n=len-1; n >= 0; n--) {
154 result <<= 16;
155 result |= avx512_compare8u[ib](&op1.vmm128(n), &op2.vmm128(n));
156 }
157
158 if (i->opmask())
159 result &= BX_READ_OPMASK(i->opmask());
160
161 BX_WRITE_OPMASK(i->dst(), result);
162 BX_NEXT_INSTR(i);
163 }
164
165 void BX_CPP_AttrRegparmN(1) BX_CPU_C::VPCMPW_MASK_KGdHdqWdqIbR(bxInstruction_c *i)
166 {
167 BxPackedAvxRegister op1 = BX_READ_AVX_REG(i->src1()), op2 = BX_READ_AVX_REG(i->src2());
168 unsigned len = i->getVL();
169 unsigned ib = i->Ib() & 7;
170
171 Bit32u result = 0;
172 for (int n=len-1; n >= 0; n--) {
173 result <<= 8;
174 result |= avx512_compare16[ib](&op1.vmm128(n), &op2.vmm128(n));
175 }
176
177 if (i->opmask())
178 result &= (Bit32u) BX_READ_32BIT_OPMASK(i->opmask());
179
180 BX_WRITE_OPMASK(i->dst(), result);
181 BX_NEXT_INSTR(i);
182 }
183
184 void BX_CPP_AttrRegparmN(1) BX_CPU_C::VPCMPUW_MASK_KGdHdqWdqIbR(bxInstruction_c *i)
185 {
186 BxPackedAvxRegister op1 = BX_READ_AVX_REG(i->src1()), op2 = BX_READ_AVX_REG(i->src2());
187 unsigned len = i->getVL();
188 unsigned ib = i->Ib() & 7;
189
190 Bit32u result = 0;
191 for (int n=len-1; n >= 0; n--) {
192 result <<= 8;
193 result |= avx512_compare16u[ib](&op1.vmm128(n), &op2.vmm128(n));
194 }
195
196 if (i->opmask())
197 result &= (Bit32u) BX_READ_32BIT_OPMASK(i->opmask());
198
199 BX_WRITE_OPMASK(i->dst(), result);
200 BX_NEXT_INSTR(i);
201 }
202
203 void BX_CPP_AttrRegparmN(1) BX_CPU_C::VPCMPD_MASK_KGwHdqWdqIbR(bxInstruction_c *i)
204 {
205 BxPackedAvxRegister op1 = BX_READ_AVX_REG(i->src1()), op2 = BX_READ_AVX_REG(i->src2());
206 unsigned len = i->getVL();
207 unsigned ib = i->Ib() & 7;
208
209 Bit32u result = 0;
210 for (int n=len-1; n >= 0; n--) {
211 result <<= 4;
212 result |= avx512_compare32[ib](&op1.vmm128(n), &op2.vmm128(n));
213 }
214
215 if (i->opmask())
216 result &= (Bit32u) BX_READ_16BIT_OPMASK(i->opmask());
217
218 BX_WRITE_OPMASK(i->dst(), result);
219 BX_NEXT_INSTR(i);
220 }
221
222 void BX_CPP_AttrRegparmN(1) BX_CPU_C::VPCMPUD_MASK_KGwHdqWdqIbR(bxInstruction_c *i)
223 {
224 BxPackedAvxRegister op1 = BX_READ_AVX_REG(i->src1()), op2 = BX_READ_AVX_REG(i->src2());
225 unsigned len = i->getVL();
226 unsigned ib = i->Ib() & 7;
227
228 Bit32u result = 0;
229 for (int n=len-1; n >= 0; n--) {
230 result <<= 4;
231 result |= avx512_compare32u[ib](&op1.vmm128(n), &op2.vmm128(n));
232 }
233
234 if (i->opmask())
235 result &= (Bit32u) BX_READ_16BIT_OPMASK(i->opmask());
236
237 BX_WRITE_OPMASK(i->dst(), result);
238 BX_NEXT_INSTR(i);
239 }
240
241 void BX_CPP_AttrRegparmN(1) BX_CPU_C::VPCMPQ_MASK_KGbHdqWdqIbR(bxInstruction_c *i)
242 {
243 BxPackedAvxRegister op1 = BX_READ_AVX_REG(i->src1()), op2 = BX_READ_AVX_REG(i->src2());
244 unsigned len = i->getVL();
245 unsigned ib = i->Ib() & 7;
246
247 Bit32u result = 0;
248 for (int n=len-1; n >= 0; n--) {
249 result <<= 2;
250 result |= avx512_compare64[ib](&op1.vmm128(n), &op2.vmm128(n));
251 }
252
253 if (i->opmask())
254 result &= (Bit32u) BX_READ_8BIT_OPMASK(i->opmask());
255
256 BX_WRITE_OPMASK(i->dst(), result);
257 BX_NEXT_INSTR(i);
258 }
259
260 void BX_CPP_AttrRegparmN(1) BX_CPU_C::VPCMPUQ_MASK_KGbHdqWdqIbR(bxInstruction_c *i)
261 {
262 BxPackedAvxRegister op1 = BX_READ_AVX_REG(i->src1()), op2 = BX_READ_AVX_REG(i->src2());
263 unsigned len = i->getVL();
264 unsigned ib = i->Ib() & 7;
265
266 Bit32u result = 0;
267 for (int n=len-1; n >= 0; n--) {
268 result <<= 2;
269 result |= avx512_compare64u[ib](&op1.vmm128(n), &op2.vmm128(n));
270 }
271
272 if (i->opmask())
273 result &= (Bit32u) BX_READ_8BIT_OPMASK(i->opmask());
274
275 BX_WRITE_OPMASK(i->dst(), result);
276 BX_NEXT_INSTR(i);
277 }
278
279 ///////////////////////////////////////////////////////////////////////////////////////////
280
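// The macros below generate the two-operand compare/test handlers. Each
// 128-bit lane produces a small bitmask (one bit per element); lanes are
// accumulated starting from the highest one, shifting the partial result
// left by the per-lane element count (16 bytes, 8 words, 4 dwords, 2 qwords).
// When a write mask is supplied, the accumulated result is ANDed with it
// before being written to the destination opmask register.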
281 #define AVX512_COMPARE_BYTE_EL(HANDLER, func) \
282 /* AVX-512 compare instruction with two src operands working on BYTE elements */ \
283 void BX_CPP_AttrRegparmN(1) BX_CPU_C :: HANDLER (bxInstruction_c *i) \
284 { \
285 BxPackedAvxRegister op1 = BX_READ_AVX_REG(i->src1()), op2 = BX_READ_AVX_REG(i->src2()); \
286 unsigned len = i->getVL(); \
287 \
288 Bit64u result = 0; \
289 for (int n=len-1; n >= 0; n--) { \
290 result <<= 16; \
291 result |= (func)(&op1.vmm128(n), &op2.vmm128(n)); \
292 } \
293 \
294 if (i->opmask()) \
295 result &= BX_READ_OPMASK(i->opmask()); \
296 \
297 BX_WRITE_OPMASK(i->dst(), result); \
298 BX_NEXT_INSTR(i); \
299 }
300
301 AVX512_COMPARE_BYTE_EL(VPCMPGTB_MASK_KGqHdqWdqR, xmm_pcmpgtb_mask)
302 AVX512_COMPARE_BYTE_EL(VPCMPEQB_MASK_KGqHdqWdqR, xmm_pcmpeqb_mask)
303 AVX512_COMPARE_BYTE_EL(VPTESTMB_MASK_KGqHdqWdqR, xmm_ptestmb_mask)
304 AVX512_COMPARE_BYTE_EL(VPTESTNMB_MASK_KGqHdqWdqR, xmm_ptestnmb_mask)
305
306 #define AVX512_COMPARE_WORD_EL(HANDLER, func) \
307 /* AVX-512 compare instruction with two src operands working on WORD elements */ \
308 void BX_CPP_AttrRegparmN(1) BX_CPU_C :: HANDLER (bxInstruction_c *i) \
309 { \
310 BxPackedAvxRegister op1 = BX_READ_AVX_REG(i->src1()), op2 = BX_READ_AVX_REG(i->src2()); \
311 unsigned len = i->getVL(); \
312 \
313 Bit32u result = 0; \
314 for (int n=len-1; n >= 0; n--) { \
315 result <<= 8; \
316 result |= (func)(&op1.vmm128(n), &op2.vmm128(n)); \
317 } \
318 \
319 if (i->opmask()) \
320 result &= (Bit32u) BX_READ_32BIT_OPMASK(i->opmask()); \
321 \
322 BX_WRITE_OPMASK(i->dst(), result); \
323 BX_NEXT_INSTR(i); \
324 }
325
326 AVX512_COMPARE_WORD_EL(VPCMPGTW_MASK_KGdHdqWdqR, xmm_pcmpgtw_mask)
327 AVX512_COMPARE_WORD_EL(VPCMPEQW_MASK_KGdHdqWdqR, xmm_pcmpeqw_mask)
328 AVX512_COMPARE_WORD_EL(VPTESTMW_MASK_KGdHdqWdqR, xmm_ptestmw_mask)
329 AVX512_COMPARE_WORD_EL(VPTESTNMW_MASK_KGdHdqWdqR, xmm_ptestnmw_mask)
330
331 #define AVX512_COMPARE_DWORD_EL(HANDLER, func) \
332 /* AVX-512 compare instruction with two src operands working on DWORD elements */ \
333 void BX_CPP_AttrRegparmN(1) BX_CPU_C :: HANDLER (bxInstruction_c *i) \
334 { \
335 BxPackedAvxRegister op1 = BX_READ_AVX_REG(i->src1()), op2 = BX_READ_AVX_REG(i->src2()); \
336 unsigned len = i->getVL(); \
337 \
338 Bit32u result = 0; \
339 for (int n=len-1; n >= 0; n--) { \
340 result <<= 4; \
341 result |= (func)(&op1.vmm128(n), &op2.vmm128(n)); \
342 } \
343 \
344 if (i->opmask()) \
345 result &= (Bit32u) BX_READ_16BIT_OPMASK(i->opmask()); \
346 \
347 BX_WRITE_OPMASK(i->dst(), result); \
348 BX_NEXT_INSTR(i); \
349 }
350
351 AVX512_COMPARE_DWORD_EL(VPCMPGTD_MASK_KGwHdqWdqR, xmm_pcmpgtd_mask)
352 AVX512_COMPARE_DWORD_EL(VPCMPEQD_MASK_KGwHdqWdqR, xmm_pcmpeqd_mask)
353 AVX512_COMPARE_DWORD_EL(VPTESTMD_MASK_KGwHdqWdqR, xmm_ptestmd_mask)
354 AVX512_COMPARE_DWORD_EL(VPTESTNMD_MASK_KGwHdqWdqR, xmm_ptestnmd_mask)
355
356 #define AVX512_COMPARE_QWORD_EL(HANDLER, func) \
357 /* AVX-512 compare instruction with two src operands working on QWORD elements */ \
358 void BX_CPP_AttrRegparmN(1) BX_CPU_C :: HANDLER (bxInstruction_c *i) \
359 { \
360 BxPackedAvxRegister op1 = BX_READ_AVX_REG(i->src1()), op2 = BX_READ_AVX_REG(i->src2()); \
361 unsigned len = i->getVL(); \
362 \
363 Bit32u result = 0; \
364 for (int n=len-1; n >= 0; n--) { \
365 result <<= 2; \
366 result |= (func)(&op1.vmm128(n), &op2.vmm128(n)); \
367 } \
368 \
369 if (i->opmask()) \
370 result &= (Bit32u) BX_READ_8BIT_OPMASK(i->opmask()); \
371 \
372 BX_WRITE_OPMASK(i->dst(), result); \
373 BX_NEXT_INSTR(i); \
374 }
375
376 AVX512_COMPARE_QWORD_EL(VPCMPGTQ_MASK_KGbHdqWdqR, xmm_pcmpgtq_mask)
377 AVX512_COMPARE_QWORD_EL(VPCMPEQQ_MASK_KGbHdqWdqR, xmm_pcmpeqq_mask)
378 AVX512_COMPARE_QWORD_EL(VPTESTMQ_MASK_KGbHdqWdqR, xmm_ptestmq_mask)
379 AVX512_COMPARE_QWORD_EL(VPTESTNMQ_MASK_KGbHdqWdqR, xmm_ptestnmq_mask)
380
381 // compute, shift and rotate
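// The AVX512_2OP_*_EL macros expand into handlers for two-source arithmetic
// and logic instructions: the operation is applied to op1 in place, one
// 128-bit lane at a time, and avx512_write_reg{b,w,d,q}_masked() performs the
// element-granular write-back (elements whose opmask bit is clear are left
// unchanged or zeroed, depending on merge- or zero-masking).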
382
383 #define AVX512_2OP_QWORD_EL(HANDLER, func) \
384 /* AVX-512 instruction with two src operands working on QWORD elements */ \
385 void BX_CPP_AttrRegparmN(1) BX_CPU_C :: HANDLER (bxInstruction_c *i) \
386 { \
387 BxPackedAvxRegister op1 = BX_READ_AVX_REG(i->src1()), op2 = BX_READ_AVX_REG(i->src2()); \
388 unsigned len = i->getVL(); \
389 \
390 for (unsigned n=0; n < len; n++) \
391 (func)(&op1.vmm128(n), &op2.vmm128(n)); \
392 \
393 avx512_write_regq_masked(i, &op1, len, BX_READ_8BIT_OPMASK(i->opmask())); \
394 \
395 BX_NEXT_INSTR(i); \
396 }
397
398 AVX512_2OP_QWORD_EL(VPMULLQ_MASK_VdqHdqWdqR, xmm_pmullq)
399 AVX512_2OP_QWORD_EL(VPADDQ_MASK_VdqHdqWdqR, xmm_paddq)
400 AVX512_2OP_QWORD_EL(VPSUBQ_MASK_VdqHdqWdqR, xmm_psubq)
401 AVX512_2OP_QWORD_EL(VPANDQ_MASK_VdqHdqWdqR, xmm_andps)
402 AVX512_2OP_QWORD_EL(VPANDNQ_MASK_VdqHdqWdqR, xmm_andnps)
403 AVX512_2OP_QWORD_EL(VPORQ_MASK_VdqHdqWdqR, xmm_orps)
404 AVX512_2OP_QWORD_EL(VPXORQ_MASK_VdqHdqWdqR, xmm_xorps)
405 AVX512_2OP_QWORD_EL(VPMAXSQ_MASK_VdqHdqWdqR, xmm_pmaxsq)
406 AVX512_2OP_QWORD_EL(VPMAXUQ_MASK_VdqHdqWdqR, xmm_pmaxuq)
407 AVX512_2OP_QWORD_EL(VPMINSQ_MASK_VdqHdqWdqR, xmm_pminsq)
408 AVX512_2OP_QWORD_EL(VPMINUQ_MASK_VdqHdqWdqR, xmm_pminuq)
409 AVX512_2OP_QWORD_EL(VUNPCKLPD_MASK_VpdHpdWpdR, xmm_unpcklpd)
410 AVX512_2OP_QWORD_EL(VUNPCKHPD_MASK_VpdHpdWpdR, xmm_unpckhpd)
411 AVX512_2OP_QWORD_EL(VPMULDQ_MASK_VdqHdqWdqR, xmm_pmuldq)
412 AVX512_2OP_QWORD_EL(VPMULUDQ_MASK_VdqHdqWdqR, xmm_pmuludq)
413 AVX512_2OP_QWORD_EL(VPSRAVQ_MASK_VdqHdqWdqR, xmm_psravq)
414 AVX512_2OP_QWORD_EL(VPSRLVQ_MASK_VdqHdqWdqR, xmm_psrlvq)
415 AVX512_2OP_QWORD_EL(VPSLLVQ_MASK_VdqHdqWdqR, xmm_psllvq)
416 AVX512_2OP_QWORD_EL(VPRORVQ_MASK_VdqHdqWdqR, xmm_prorvq)
417 AVX512_2OP_QWORD_EL(VPROLVQ_MASK_VdqHdqWdqR, xmm_prolvq)
418
419 #define AVX512_2OP_DWORD_EL(HANDLER, func) \
420 /* AVX-512 instruction with two src operands working on DWORD elements */ \
421 void BX_CPP_AttrRegparmN(1) BX_CPU_C :: HANDLER (bxInstruction_c *i) \
422 { \
423 BxPackedAvxRegister op1 = BX_READ_AVX_REG(i->src1()), op2 = BX_READ_AVX_REG(i->src2()); \
424 unsigned len = i->getVL(); \
425 \
426 for (unsigned n=0; n < len; n++) \
427 (func)(&op1.vmm128(n), &op2.vmm128(n)); \
428 \
429 avx512_write_regd_masked(i, &op1, len, BX_READ_16BIT_OPMASK(i->opmask())); \
430 \
431 BX_NEXT_INSTR(i); \
432 }
433
434 AVX512_2OP_DWORD_EL(VPADDD_MASK_VdqHdqWdqR, xmm_paddd)
435 AVX512_2OP_DWORD_EL(VPSUBD_MASK_VdqHdqWdqR, xmm_psubd)
436 AVX512_2OP_DWORD_EL(VPANDD_MASK_VdqHdqWdqR, xmm_andps)
437 AVX512_2OP_DWORD_EL(VPANDND_MASK_VdqHdqWdqR, xmm_andnps)
438 AVX512_2OP_DWORD_EL(VPORD_MASK_VdqHdqWdqR, xmm_orps)
439 AVX512_2OP_DWORD_EL(VPXORD_MASK_VdqHdqWdqR, xmm_xorps)
440 AVX512_2OP_DWORD_EL(VPMAXSD_MASK_VdqHdqWdqR, xmm_pmaxsd)
441 AVX512_2OP_DWORD_EL(VPMAXUD_MASK_VdqHdqWdqR, xmm_pmaxud)
442 AVX512_2OP_DWORD_EL(VPMINSD_MASK_VdqHdqWdqR, xmm_pminsd)
443 AVX512_2OP_DWORD_EL(VPMINUD_MASK_VdqHdqWdqR, xmm_pminud)
444 AVX512_2OP_DWORD_EL(VUNPCKLPS_MASK_VpsHpsWpsR, xmm_unpcklps)
445 AVX512_2OP_DWORD_EL(VUNPCKHPS_MASK_VpsHpsWpsR, xmm_unpckhps)
446 AVX512_2OP_DWORD_EL(VPMULLD_MASK_VdqHdqWdqR, xmm_pmulld)
447 AVX512_2OP_DWORD_EL(VPSRAVD_MASK_VdqHdqWdqR, xmm_psravd)
448 AVX512_2OP_DWORD_EL(VPSRLVD_MASK_VdqHdqWdqR, xmm_psrlvd)
449 AVX512_2OP_DWORD_EL(VPSLLVD_MASK_VdqHdqWdqR, xmm_psllvd)
450 AVX512_2OP_DWORD_EL(VPRORVD_MASK_VdqHdqWdqR, xmm_prorvd)
451 AVX512_2OP_DWORD_EL(VPROLVD_MASK_VdqHdqWdqR, xmm_prolvd)
452 AVX512_2OP_DWORD_EL(VPMADDWD_MASK_VdqHdqWdqR, xmm_pmaddwd)
453
454 #define AVX512_2OP_WORD_EL(HANDLER, func) \
455 /* AVX-512 instruction with two src operands working on WORD elements */ \
456 void BX_CPP_AttrRegparmN(1) BX_CPU_C :: HANDLER (bxInstruction_c *i) \
457 { \
458 BxPackedAvxRegister op1 = BX_READ_AVX_REG(i->src1()), op2 = BX_READ_AVX_REG(i->src2()); \
459 unsigned len = i->getVL(); \
460 \
461 for (unsigned n=0; n < len; n++) \
462 (func)(&op1.vmm128(n), &op2.vmm128(n)); \
463 \
464 avx512_write_regw_masked(i, &op1, len, BX_READ_32BIT_OPMASK(i->opmask())); \
465 \
466 BX_NEXT_INSTR(i); \
467 }
468
469 AVX512_2OP_WORD_EL(VPADDW_MASK_VdqHdqWdqR, xmm_paddw)
470 AVX512_2OP_WORD_EL(VPADDSW_MASK_VdqHdqWdqR, xmm_paddsw)
471 AVX512_2OP_WORD_EL(VPADDUSW_MASK_VdqHdqWdqR, xmm_paddusw)
472 AVX512_2OP_WORD_EL(VPSUBW_MASK_VdqHdqWdqR, xmm_psubw)
473 AVX512_2OP_WORD_EL(VPSUBSW_MASK_VdqHdqWdqR, xmm_psubsw)
474 AVX512_2OP_WORD_EL(VPSUBUSW_MASK_VdqHdqWdqR, xmm_psubusw)
475 AVX512_2OP_WORD_EL(VPMINSW_MASK_VdqHdqWdqR, xmm_pminsw)
476 AVX512_2OP_WORD_EL(VPMINUW_MASK_VdqHdqWdqR, xmm_pminuw)
477 AVX512_2OP_WORD_EL(VPMAXSW_MASK_VdqHdqWdqR, xmm_pmaxsw)
478 AVX512_2OP_WORD_EL(VPMAXUW_MASK_VdqHdqWdqR, xmm_pmaxuw)
479 AVX512_2OP_WORD_EL(VPMADDUBSW_MASK_VdqHdqWdqR, xmm_pmaddubsw)
480 AVX512_2OP_WORD_EL(VPAVGW_MASK_VdqHdqWdqR, xmm_pavgw)
481 AVX512_2OP_WORD_EL(VPMULLW_MASK_VdqHdqWdqR, xmm_pmullw)
482 AVX512_2OP_WORD_EL(VPMULHW_MASK_VdqHdqWdqR, xmm_pmulhw)
483 AVX512_2OP_WORD_EL(VPMULHUW_MASK_VdqHdqWdqR, xmm_pmulhuw)
484 AVX512_2OP_WORD_EL(VPMULHRSW_MASK_VdqHdqWdqR, xmm_pmulhrsw)
485 AVX512_2OP_WORD_EL(VPACKSSDW_MASK_VdqHdqWdqR, xmm_packssdw)
486 AVX512_2OP_WORD_EL(VPACKUSDW_MASK_VdqHdqWdqR, xmm_packusdw)
487 AVX512_2OP_WORD_EL(VPUNPCKLWD_MASK_VdqHdqWdqR, xmm_punpcklwd)
488 AVX512_2OP_WORD_EL(VPUNPCKHWD_MASK_VdqHdqWdqR, xmm_punpckhwd)
489 AVX512_2OP_WORD_EL(VPSRAVW_MASK_VdqHdqWdqR, xmm_psravw)
490 AVX512_2OP_WORD_EL(VPSRLVW_MASK_VdqHdqWdqR, xmm_psrlvw)
491 AVX512_2OP_WORD_EL(VPSLLVW_MASK_VdqHdqWdqR, xmm_psllvw)
492
493 #define AVX512_2OP_BYTE_EL(HANDLER, func) \
494 /* AVX-512 instruction with two src operands working on BYTE elements */ \
495 void BX_CPP_AttrRegparmN(1) BX_CPU_C :: HANDLER (bxInstruction_c *i) \
496 { \
497 BxPackedAvxRegister op1 = BX_READ_AVX_REG(i->src1()), op2 = BX_READ_AVX_REG(i->src2()); \
498 unsigned len = i->getVL(); \
499 \
500 for (unsigned n=0; n < len; n++) \
501 (func)(&op1.vmm128(n), &op2.vmm128(n)); \
502 \
503 avx512_write_regb_masked(i, &op1, len, BX_READ_OPMASK(i->opmask())); \
504 \
505 BX_NEXT_INSTR(i); \
506 }
507
508 AVX512_2OP_BYTE_EL(VPADDB_MASK_VdqHdqWdqR, xmm_paddb)
509 AVX512_2OP_BYTE_EL(VPADDSB_MASK_VdqHdqWdqR, xmm_paddsb)
510 AVX512_2OP_BYTE_EL(VPADDUSB_MASK_VdqHdqWdqR, xmm_paddusb)
511 AVX512_2OP_BYTE_EL(VPSUBB_MASK_VdqHdqWdqR, xmm_psubb)
512 AVX512_2OP_BYTE_EL(VPSUBSB_MASK_VdqHdqWdqR, xmm_psubsb)
513 AVX512_2OP_BYTE_EL(VPSUBUSB_MASK_VdqHdqWdqR, xmm_psubusb)
514 AVX512_2OP_BYTE_EL(VPMINSB_MASK_VdqHdqWdqR, xmm_pminsb)
515 AVX512_2OP_BYTE_EL(VPMINUB_MASK_VdqHdqWdqR, xmm_pminub)
516 AVX512_2OP_BYTE_EL(VPMAXUB_MASK_VdqHdqWdqR, xmm_pmaxub)
517 AVX512_2OP_BYTE_EL(VPMAXSB_MASK_VdqHdqWdqR, xmm_pmaxsb)
518 AVX512_2OP_BYTE_EL(VPAVGB_MASK_VdqHdqWdqR, xmm_pavgb)
519 AVX512_2OP_BYTE_EL(VPACKSSWB_MASK_VdqHdqWdqR, xmm_packsswb)
520 AVX512_2OP_BYTE_EL(VPACKUSWB_MASK_VdqHdqWdqR, xmm_packuswb)
521 AVX512_2OP_BYTE_EL(VPUNPCKLBW_MASK_VdqHdqWdqR, xmm_punpcklbw)
522 AVX512_2OP_BYTE_EL(VPUNPCKHBW_MASK_VdqHdqWdqR, xmm_punpckhbw)
523
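// The packed-shift handlers below take the shift count from the low 64 bits
// of the second source register and apply it to every element of the first
// source, again finishing with an element-granular masked write-back.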
524 #define AVX512_PSHIFT_WORD_EL(HANDLER, func) \
525 void BX_CPP_AttrRegparmN(1) BX_CPU_C:: HANDLER (bxInstruction_c *i) \
526 { \
527 BxPackedAvxRegister op = BX_READ_AVX_REG(i->src1()); \
528 Bit64u count = BX_READ_XMM_REG_LO_QWORD(i->src2()); \
529 unsigned len = i->getVL(); \
530 \
531 for (unsigned n=0; n < len; n++) \
532 (func)(&op.vmm128(n), count); \
533 \
534 avx512_write_regw_masked(i, &op, len, BX_READ_32BIT_OPMASK(i->opmask())); \
535 \
536 BX_NEXT_INSTR(i); \
537 }
538
539 AVX512_PSHIFT_WORD_EL(VPSRLW_MASK_VdqHdqWdqR, xmm_psrlw);
540 AVX512_PSHIFT_WORD_EL(VPSRAW_MASK_VdqHdqWdqR, xmm_psraw);
541 AVX512_PSHIFT_WORD_EL(VPSLLW_MASK_VdqHdqWdqR, xmm_psllw);
542
543 #define AVX512_PSHIFT_DWORD_EL(HANDLER, func) \
544 void BX_CPP_AttrRegparmN(1) BX_CPU_C:: HANDLER (bxInstruction_c *i) \
545 { \
546 BxPackedAvxRegister op = BX_READ_AVX_REG(i->src1()); \
547 Bit64u count = BX_READ_XMM_REG_LO_QWORD(i->src2()); \
548 unsigned len = i->getVL(); \
549 \
550 for (unsigned n=0; n < len; n++) \
551 (func)(&op.vmm128(n), count); \
552 \
553 avx512_write_regd_masked(i, &op, len, BX_READ_16BIT_OPMASK(i->opmask())); \
554 \
555 BX_NEXT_INSTR(i); \
556 }
557
558 AVX512_PSHIFT_DWORD_EL(VPSRLD_MASK_VdqHdqWdqR, xmm_psrld);
559 AVX512_PSHIFT_DWORD_EL(VPSRAD_MASK_VdqHdqWdqR, xmm_psrad);
560 AVX512_PSHIFT_DWORD_EL(VPSLLD_MASK_VdqHdqWdqR, xmm_pslld);
561
562 #define AVX512_PSHIFT_QWORD_EL(HANDLER, func) \
563 void BX_CPP_AttrRegparmN(1) BX_CPU_C:: HANDLER (bxInstruction_c *i) \
564 { \
565 BxPackedAvxRegister op = BX_READ_AVX_REG(i->src1()); \
566 Bit64u count = BX_READ_XMM_REG_LO_QWORD(i->src2()); \
567 unsigned len = i->getVL(); \
568 \
569 for (unsigned n=0; n < len; n++) \
570 (func)(&op.vmm128(n), count); \
571 \
572 avx512_write_regq_masked(i, &op, len, BX_READ_8BIT_OPMASK(i->opmask())); \
573 \
574 BX_NEXT_INSTR(i); \
575 }
576
577 AVX512_PSHIFT_QWORD_EL(VPSRLQ_MASK_VdqHdqWdqR, xmm_psrlq);
578 AVX512_PSHIFT_QWORD_EL(VPSRAQ_MASK_VdqHdqWdqR, xmm_psraq);
579 AVX512_PSHIFT_QWORD_EL(VPSLLQ_MASK_VdqHdqWdqR, xmm_psllq);
580
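// Same structure for the immediate-count forms: the shift (or rotate, for
// VPROR/VPROL) count comes from the instruction's imm8 rather than from a
// register.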
581 #define AVX512_PSHIFT_IMM_WORD_EL(HANDLER, func) \
582 /* AVX packed shift with imm8 instruction */ \
583 void BX_CPP_AttrRegparmN(1) BX_CPU_C:: HANDLER (bxInstruction_c *i) \
584 { \
585 BxPackedAvxRegister op = BX_READ_AVX_REG(i->src()); \
586 unsigned len = i->getVL(); \
587 \
588 for (unsigned n=0; n < len; n++) \
589 (func)(&op.vmm128(n), i->Ib()); \
590 \
591 avx512_write_regw_masked(i, &op, len, BX_READ_32BIT_OPMASK(i->opmask())); \
592 \
593 BX_NEXT_INSTR(i); \
594 }
595
596 AVX512_PSHIFT_IMM_WORD_EL(VPSRLW_MASK_UdqIb, xmm_psrlw);
597 AVX512_PSHIFT_IMM_WORD_EL(VPSRAW_MASK_UdqIb, xmm_psraw);
598 AVX512_PSHIFT_IMM_WORD_EL(VPSLLW_MASK_UdqIb, xmm_psllw);
599
600 #define AVX512_PSHIFT_IMM_DWORD_EL(HANDLER, func) \
601 /* AVX packed shift with imm8 instruction */ \
602 void BX_CPP_AttrRegparmN(1) BX_CPU_C:: HANDLER (bxInstruction_c *i) \
603 { \
604 BxPackedAvxRegister op = BX_READ_AVX_REG(i->src()); \
605 unsigned len = i->getVL(); \
606 \
607 for (unsigned n=0; n < len; n++) \
608 (func)(&op.vmm128(n), i->Ib()); \
609 \
610 avx512_write_regd_masked(i, &op, len, BX_READ_16BIT_OPMASK(i->opmask())); \
611 \
612 BX_NEXT_INSTR(i); \
613 }
614
615 AVX512_PSHIFT_IMM_DWORD_EL(VPSRLD_MASK_UdqIb, xmm_psrld);
616 AVX512_PSHIFT_IMM_DWORD_EL(VPSRAD_MASK_UdqIb, xmm_psrad);
617 AVX512_PSHIFT_IMM_DWORD_EL(VPSLLD_MASK_UdqIb, xmm_pslld);
618 AVX512_PSHIFT_IMM_DWORD_EL(VPRORD_MASK_UdqIb, xmm_prord);
619 AVX512_PSHIFT_IMM_DWORD_EL(VPROLD_MASK_UdqIb, xmm_prold);
620
621 #define AVX512_PSHIFT_IMM_QWORD_EL(HANDLER, func) \
622 /* AVX packed shift with imm8 instruction */ \
623 void BX_CPP_AttrRegparmN(1) BX_CPU_C:: HANDLER (bxInstruction_c *i) \
624 { \
625 BxPackedAvxRegister op = BX_READ_AVX_REG(i->src()); \
626 unsigned len = i->getVL(); \
627 \
628 for (unsigned n=0; n < len; n++) \
629 (func)(&op.vmm128(n), i->Ib()); \
630 \
631 avx512_write_regq_masked(i, &op, len, BX_READ_8BIT_OPMASK(i->opmask())); \
632 \
633 BX_NEXT_INSTR(i); \
634 }
635
636 AVX512_PSHIFT_IMM_QWORD_EL(VPSRLQ_MASK_UdqIb, xmm_psrlq);
637 AVX512_PSHIFT_IMM_QWORD_EL(VPSRAQ_MASK_UdqIb, xmm_psraq);
638 AVX512_PSHIFT_IMM_QWORD_EL(VPSLLQ_MASK_UdqIb, xmm_psllq);
639 AVX512_PSHIFT_IMM_QWORD_EL(VPRORQ_MASK_UdqIb, xmm_prorq);
640 AVX512_PSHIFT_IMM_QWORD_EL(VPROLQ_MASK_UdqIb, xmm_prolq);
641
642 // concatenate and shift
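// VPSHLD*/VPSHRD* ("concatenate and shift") build every destination element
// from two source elements viewed as one double-width value. For the left
// shift the result is (op1 << count) | (op2 >> (width - count)); for the
// right shift it is (op1 >> count) | (op2 << (width - count)). The count is
// imm8 masked to width-1, or for the VPSHLDV/VPSHRDV forms the corresponding
// element of the second source; a count of zero leaves the element unchanged.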
643
644 void BX_CPP_AttrRegparmN(1) BX_CPU_C::VPSHLDW_MASK_VdqHdqWdqIbR(bxInstruction_c *i)
645 {
646 BxPackedAvxRegister op1 = BX_READ_AVX_REG(i->src1()), op2 = BX_READ_AVX_REG(i->src2());
647 unsigned len = i->getVL();
648 unsigned count = i->Ib() & 15;
649
650 if (count) {
651 for (unsigned n=0; n < WORD_ELEMENTS(len); n++) {
652 op1.vmm16u(n) = (op1.vmm16u(n) << count) | (op2.vmm16u(n) >> (16 - count));
653 }
654 }
655
656 if (i->opmask())
657 avx512_write_regw_masked(i, &op1, len, BX_READ_32BIT_OPMASK(i->opmask()));
658 else
659 BX_WRITE_AVX_REGZ(i->dst(), op1, len);
660
661 BX_NEXT_INSTR(i);
662 }
663
664 void BX_CPP_AttrRegparmN(1) BX_CPU_C::VPSHLDVW_MASK_VdqHdqWdqR(bxInstruction_c *i)
665 {
666 BxPackedAvxRegister dst = BX_READ_AVX_REG(i->dst()), op1 = BX_READ_AVX_REG(i->src1()), op2 = BX_READ_AVX_REG(i->src2());
667 unsigned len = i->getVL();
668
669 for (unsigned n=0; n < WORD_ELEMENTS(len); n++) {
670 unsigned count = op2.vmm16u(n) & 15;
671 if (count) {
672 dst.vmm16u(n) = (dst.vmm16u(n) << count) | (op1.vmm16u(n) >> (16 - count));
673 }
674 }
675
676 if (i->opmask())
677 avx512_write_regw_masked(i, &dst, len, BX_READ_32BIT_OPMASK(i->opmask()));
678 else
679 BX_WRITE_AVX_REGZ(i->dst(), dst, len);
680
681 BX_NEXT_INSTR(i);
682 }
683
684 void BX_CPP_AttrRegparmN(1) BX_CPU_C::VPSHLDD_MASK_VdqHdqWdqIbR(bxInstruction_c *i)
685 {
686 BxPackedAvxRegister op1 = BX_READ_AVX_REG(i->src1()), op2 = BX_READ_AVX_REG(i->src2());
687 unsigned len = i->getVL();
688 unsigned count = i->Ib() & 31;
689
690 if (count) {
691 for (unsigned n=0; n < DWORD_ELEMENTS(len); n++) {
692 op1.vmm32u(n) = (op1.vmm32u(n) << count) | (op2.vmm32u(n) >> (32 - count));
693 }
694 }
695
696 if (i->opmask())
697 avx512_write_regd_masked(i, &op1, len, BX_READ_16BIT_OPMASK(i->opmask()));
698 else
699 BX_WRITE_AVX_REGZ(i->dst(), op1, len);
700
701 BX_NEXT_INSTR(i);
702 }
703
704 void BX_CPP_AttrRegparmN(1) BX_CPU_C::VPSHLDVD_MASK_VdqHdqWdqR(bxInstruction_c *i)
705 {
706 BxPackedAvxRegister dst = BX_READ_AVX_REG(i->dst()), op1 = BX_READ_AVX_REG(i->src1()), op2 = BX_READ_AVX_REG(i->src2());
707 unsigned len = i->getVL();
708
709 for (unsigned n=0; n < DWORD_ELEMENTS(len); n++) {
710 unsigned count = op2.vmm32u(n) & 31;
711 if (count) {
712 dst.vmm32u(n) = (dst.vmm32u(n) << count) | (op1.vmm32u(n) >> (32 - count));
713 }
714 }
715
716 if (i->opmask())
717 avx512_write_regd_masked(i, &dst, len, BX_READ_16BIT_OPMASK(i->opmask()));
718 else
719 BX_WRITE_AVX_REGZ(i->dst(), dst, len);
720
721 BX_NEXT_INSTR(i);
722 }
723
724 void BX_CPP_AttrRegparmN(1) BX_CPU_C::VPSHLDQ_MASK_VdqHdqWdqIbR(bxInstruction_c *i)
725 {
726 BxPackedAvxRegister op1 = BX_READ_AVX_REG(i->src1()), op2 = BX_READ_AVX_REG(i->src2());
727 unsigned len = i->getVL();
728 unsigned count = i->Ib() & 63;
729
730 if (count) {
731 for (unsigned n=0; n < QWORD_ELEMENTS(len); n++) {
732 op1.vmm64u(n) = (op1.vmm64u(n) << count) | (op2.vmm64u(n) >> (64 - count));
733 }
734 }
735
736 if (i->opmask())
737 avx512_write_regq_masked(i, &op1, len, BX_READ_8BIT_OPMASK(i->opmask()));
738 else
739 BX_WRITE_AVX_REGZ(i->dst(), op1, len);
740
741 BX_NEXT_INSTR(i);
742 }
743
744 void BX_CPP_AttrRegparmN(1) BX_CPU_C::VPSHLDVQ_MASK_VdqHdqWdqR(bxInstruction_c *i)
745 {
746 BxPackedAvxRegister dst = BX_READ_AVX_REG(i->dst()), op1 = BX_READ_AVX_REG(i->src1()), op2 = BX_READ_AVX_REG(i->src2());
747 unsigned len = i->getVL();
748
749 for (unsigned n=0; n < QWORD_ELEMENTS(len); n++) {
750 unsigned count = op2.vmm64u(n) & 63;
751 if (count) {
752 dst.vmm64u(n) = (dst.vmm64u(n) << count) | (op1.vmm64u(n) >> (64 - count));
753 }
754 }
755
756 if (i->opmask())
757 avx512_write_regq_masked(i, &dst, len, BX_READ_8BIT_OPMASK(i->opmask()));
758 else
759 BX_WRITE_AVX_REGZ(i->dst(), dst, len);
760
761 BX_NEXT_INSTR(i);
762 }
763
764 void BX_CPP_AttrRegparmN(1) BX_CPU_C::VPSHRDW_MASK_VdqHdqWdqIbR(bxInstruction_c *i)
765 {
766 BxPackedAvxRegister op1 = BX_READ_AVX_REG(i->src1()), op2 = BX_READ_AVX_REG(i->src2());
767 unsigned len = i->getVL();
768 unsigned count = i->Ib() & 15;
769
770 if (count) {
771 for (unsigned n=0; n < WORD_ELEMENTS(len); n++) {
772 op1.vmm16u(n) = (op1.vmm16u(n) >> count) | (op2.vmm16u(n) << (16 - count));
773 }
774 }
775
776 if (i->opmask())
777 avx512_write_regw_masked(i, &op1, len, BX_READ_32BIT_OPMASK(i->opmask()));
778 else
779 BX_WRITE_AVX_REGZ(i->dst(), op1, len);
780
781 BX_NEXT_INSTR(i);
782 }
783
784 void BX_CPP_AttrRegparmN(1) BX_CPU_C::VPSHRDVW_MASK_VdqHdqWdqR(bxInstruction_c *i)
785 {
786 BxPackedAvxRegister dst = BX_READ_AVX_REG(i->dst()), op1 = BX_READ_AVX_REG(i->src1()), op2 = BX_READ_AVX_REG(i->src2());
787 unsigned len = i->getVL();
788
789 for (unsigned n=0; n < WORD_ELEMENTS(len); n++) {
790 unsigned count = op2.vmm16u(n) & 15;
791 if (count) {
792 dst.vmm16u(n) = (dst.vmm16u(n) >> count) | (op1.vmm16u(n) << (16 - count));
793 }
794 }
795
796 if (i->opmask())
797 avx512_write_regw_masked(i, &dst, len, BX_READ_32BIT_OPMASK(i->opmask()));
798 else
799 BX_WRITE_AVX_REGZ(i->dst(), dst, len);
800
801 BX_NEXT_INSTR(i);
802 }
803
804 void BX_CPP_AttrRegparmN(1) BX_CPU_C::VPSHRDD_MASK_VdqHdqWdqIbR(bxInstruction_c *i)
805 {
806 BxPackedAvxRegister op1 = BX_READ_AVX_REG(i->src1()), op2 = BX_READ_AVX_REG(i->src2());
807 unsigned len = i->getVL();
808 unsigned count = i->Ib() & 31;
809
810 if (count) {
811 for (unsigned n=0; n < DWORD_ELEMENTS(len); n++) {
812 op1.vmm32u(n) = (op1.vmm32u(n) >> count) | (op2.vmm32u(n) << (32 - count));
813 }
814 }
815
816 if (i->opmask())
817 avx512_write_regd_masked(i, &op1, len, BX_READ_16BIT_OPMASK(i->opmask()));
818 else
819 BX_WRITE_AVX_REGZ(i->dst(), op1, len);
820
821 BX_NEXT_INSTR(i);
822 }
823
824 void BX_CPP_AttrRegparmN(1) BX_CPU_C::VPSHRDVD_MASK_VdqHdqWdqR(bxInstruction_c *i)
825 {
826 BxPackedAvxRegister dst = BX_READ_AVX_REG(i->dst()), op1 = BX_READ_AVX_REG(i->src1()), op2 = BX_READ_AVX_REG(i->src2());
827 unsigned len = i->getVL();
828
829 for (unsigned n=0; n < DWORD_ELEMENTS(len); n++) {
830 unsigned count = op2.vmm32u(n) & 31;
831 if (count) {
832 dst.vmm32u(n) = (dst.vmm32u(n) >> count) | (op1.vmm32u(n) << (32 - count));
833 }
834 }
835
836 if (i->opmask())
837 avx512_write_regd_masked(i, &dst, len, BX_READ_16BIT_OPMASK(i->opmask()));
838 else
839 BX_WRITE_AVX_REGZ(i->dst(), dst, len);
840
841 BX_NEXT_INSTR(i);
842 }
843
844 void BX_CPP_AttrRegparmN(1) BX_CPU_C::VPSHRDQ_MASK_VdqHdqWdqIbR(bxInstruction_c *i)
845 {
846 BxPackedAvxRegister op1 = BX_READ_AVX_REG(i->src1()), op2 = BX_READ_AVX_REG(i->src2());
847 unsigned len = i->getVL();
848 unsigned count = i->Ib() & 63;
849
850 if (count) {
851 for (unsigned n=0; n < QWORD_ELEMENTS(len); n++) {
852 op1.vmm64u(n) = (op1.vmm64u(n) >> count) | (op2.vmm64u(n) << (64 - count));
853 }
854 }
855
856 if (i->opmask())
857 avx512_write_regq_masked(i, &op1, len, BX_READ_8BIT_OPMASK(i->opmask()));
858 else
859 BX_WRITE_AVX_REGZ(i->dst(), op1, len);
860
861 BX_NEXT_INSTR(i);
862 }
863
864 void BX_CPP_AttrRegparmN(1) BX_CPU_C::VPSHRDVQ_MASK_VdqHdqWdqR(bxInstruction_c *i)
865 {
866 BxPackedAvxRegister dst = BX_READ_AVX_REG(i->dst()), op1 = BX_READ_AVX_REG(i->src1()), op2 = BX_READ_AVX_REG(i->src2());
867 unsigned len = i->getVL();
868
869 for (unsigned n=0; n < QWORD_ELEMENTS(len); n++) {
870 unsigned count = op2.vmm64u(n) & 63;
871 if (count) {
872 dst.vmm64u(n) = (dst.vmm64u(n) >> count) | (op1.vmm64u(n) << (64 - count));
873 }
874 }
875
876 if (i->opmask())
877 avx512_write_regq_masked(i, &dst, len, BX_READ_8BIT_OPMASK(i->opmask()));
878 else
879 BX_WRITE_AVX_REGZ(i->dst(), dst, len);
880
881 BX_NEXT_INSTR(i);
882 }
883
884
885 // absolute value
886
887 void BX_CPP_AttrRegparmN(1) BX_CPU_C::VPABSB_MASK_VdqWdqR(bxInstruction_c *i)
888 {
889 BxPackedAvxRegister op = BX_READ_AVX_REG(i->src());
890 unsigned len = i->getVL();
891
892 for (unsigned n=0; n < len; n++)
893 xmm_pabsb(&op.vmm128(n));
894
895 avx512_write_regb_masked(i, &op, len, BX_READ_OPMASK(i->opmask()));
896 BX_NEXT_INSTR(i);
897 }
898
899 void BX_CPP_AttrRegparmN(1) BX_CPU_C::VPABSW_MASK_VdqWdqR(bxInstruction_c *i)
900 {
901 BxPackedAvxRegister op = BX_READ_AVX_REG(i->src());
902 unsigned len = i->getVL();
903
904 for (unsigned n=0; n < len; n++)
905 xmm_pabsw(&op.vmm128(n));
906
907 avx512_write_regw_masked(i, &op, len, BX_READ_32BIT_OPMASK(i->opmask()));
908 BX_NEXT_INSTR(i);
909 }
910
911 void BX_CPP_AttrRegparmN(1) BX_CPU_C::VPABSD_MASK_VdqWdqR(bxInstruction_c *i)
912 {
913 BxPackedAvxRegister op = BX_READ_AVX_REG(i->src());
914 unsigned len = i->getVL();
915
916 for (unsigned n=0; n < len; n++)
917 xmm_pabsd(&op.vmm128(n));
918
919 avx512_write_regd_masked(i, &op, len, BX_READ_16BIT_OPMASK(i->opmask()));
920 BX_NEXT_INSTR(i);
921 }
922
923 void BX_CPP_AttrRegparmN(1) BX_CPU_C::VPABSQ_MASK_VdqWdqR(bxInstruction_c *i)
924 {
925 BxPackedAvxRegister op = BX_READ_AVX_REG(i->src());
926 unsigned len = i->getVL();
927
928 for (unsigned n=0; n < len; n++)
929 xmm_pabsq(&op.vmm128(n));
930
931 avx512_write_regq_masked(i, &op, len, BX_READ_8BIT_OPMASK(i->opmask()));
932 BX_NEXT_INSTR(i);
933 }
934
935 // shuffle and permute
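// The shuffle and permute handlers compute a full-width result first and only
// then perform the masked write-back. The shuffle control comes either from
// imm8 (VPSHUFHW/LW, VSHUFPS/PD, the immediate VPERMILP* forms) or from a
// source vector (VPSHUFB, the register VPERMILP* forms and the VPERM* family).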
936
937 void BX_CPP_AttrRegparmN(1) BX_CPU_C::VPSHUFHW_MASK_VdqWdqIbR(bxInstruction_c *i)
938 {
939 BxPackedAvxRegister op = BX_READ_AVX_REG(i->src()), result;
940 Bit8u order = i->Ib();
941 unsigned len = i->getVL();
942
943 for (unsigned n=0; n < len; n++)
944 xmm_pshufhw(&result.vmm128(n), &op.vmm128(n), order);
945
946 avx512_write_regw_masked(i, &result, len, BX_READ_32BIT_OPMASK(i->opmask()));
947 BX_NEXT_INSTR(i);
948 }
949
950 void BX_CPP_AttrRegparmN(1) BX_CPU_C::VPSHUFLW_MASK_VdqWdqIbR(bxInstruction_c *i)
951 {
952 BxPackedAvxRegister op = BX_READ_AVX_REG(i->src()), result;
953 Bit8u order = i->Ib();
954 unsigned len = i->getVL();
955
956 for (unsigned n=0; n < len; n++)
957 xmm_pshuflw(&result.vmm128(n), &op.vmm128(n), order);
958
959 avx512_write_regw_masked(i, &result, len, BX_READ_32BIT_OPMASK(i->opmask()));
960 BX_NEXT_INSTR(i);
961 }
962
963 void BX_CPP_AttrRegparmN(1) BX_CPU_C::VPSHUFB_MASK_VdqHdqWdqR(bxInstruction_c *i)
964 {
965 BxPackedAvxRegister op1 = BX_READ_AVX_REG(i->src1());
966 BxPackedAvxRegister op2 = BX_READ_AVX_REG(i->src2()), result;
967 unsigned len = i->getVL();
968
969 for (unsigned n=0; n < len; n++)
970 xmm_pshufb(&result.vmm128(n), &op1.vmm128(n), &op2.vmm128(n));
971
972 avx512_write_regb_masked(i, &result, len, BX_READ_OPMASK(i->opmask()));
973 BX_NEXT_INSTR(i);
974 }
975
976 void BX_CPP_AttrRegparmN(1) BX_CPU_C::VSHUFPS_MASK_VpsHpsWpsIbR(bxInstruction_c *i)
977 {
978 BxPackedAvxRegister op1 = BX_READ_AVX_REG(i->src1());
979 BxPackedAvxRegister op2 = BX_READ_AVX_REG(i->src2()), result;
980 unsigned len = i->getVL();
981
982 for (unsigned n=0; n < len; n++)
983 xmm_shufps(&result.vmm128(n), &op1.vmm128(n), &op2.vmm128(n), i->Ib());
984
985 avx512_write_regd_masked(i, &result, len, BX_READ_16BIT_OPMASK(i->opmask()));
986 BX_NEXT_INSTR(i);
987 }
988
989 void BX_CPP_AttrRegparmN(1) BX_CPU_C::VSHUFPD_MASK_VpdHpdWpdIbR(bxInstruction_c *i)
990 {
991 BxPackedAvxRegister op1 = BX_READ_AVX_REG(i->src1());
992 BxPackedAvxRegister op2 = BX_READ_AVX_REG(i->src2()), result;
993
994 unsigned len = i->getVL();
995 Bit8u order = i->Ib();
996
997 for (unsigned n=0; n < len; n++) {
998 xmm_shufpd(&result.vmm128(n), &op1.vmm128(n), &op2.vmm128(n), order);
999 order >>= 2;
1000 }
1001
1002 avx512_write_regq_masked(i, &result, len, BX_READ_8BIT_OPMASK(i->opmask()));
1003 BX_NEXT_INSTR(i);
1004 }
1005
1006 void BX_CPP_AttrRegparmN(1) BX_CPU_C::VPERMILPS_MASK_VpsHpsWpsR(bxInstruction_c *i)
1007 {
1008 BxPackedAvxRegister op1 = BX_READ_AVX_REG(i->src1());
1009 BxPackedAvxRegister op2 = BX_READ_AVX_REG(i->src2()), result;
1010 unsigned len = i->getVL();
1011
1012 for (unsigned n=0; n < len; n++)
1013 xmm_permilps(&result.vmm128(n), &op1.vmm128(n), &op2.vmm128(n));
1014
1015 avx512_write_regd_masked(i, &result, len, BX_READ_16BIT_OPMASK(i->opmask()));
1016 BX_NEXT_INSTR(i);
1017 }
1018
1019 void BX_CPP_AttrRegparmN(1) BX_CPU_C::VPERMILPD_MASK_VpdHpdWpdR(bxInstruction_c *i)
1020 {
1021 BxPackedAvxRegister op1 = BX_READ_AVX_REG(i->src1());
1022 BxPackedAvxRegister op2 = BX_READ_AVX_REG(i->src2()), result;
1023 unsigned len = i->getVL();
1024
1025 for (unsigned n=0; n < len; n++)
1026 xmm_permilpd(&result.vmm128(n), &op1.vmm128(n), &op2.vmm128(n));
1027
1028 avx512_write_regq_masked(i, &result, len, BX_READ_8BIT_OPMASK(i->opmask()));
1029 BX_NEXT_INSTR(i);
1030 }
1031
1032 void BX_CPP_AttrRegparmN(1) BX_CPU_C::VPERMILPS_MASK_VpsWpsIbR(bxInstruction_c *i)
1033 {
1034 BxPackedAvxRegister op1 = BX_READ_AVX_REG(i->src()), result;
1035 unsigned len = i->getVL();
1036
1037 for (unsigned n=0; n < len; n++)
1038 xmm_shufps(&result.vmm128(n), &op1.vmm128(n), &op1.vmm128(n), i->Ib());
1039
1040 avx512_write_regd_masked(i, &result, len, BX_READ_16BIT_OPMASK(i->opmask()));
1041 BX_NEXT_INSTR(i);
1042 }
1043
1044 void BX_CPP_AttrRegparmN(1) BX_CPU_C::VPERMILPD_MASK_VpdWpdIbR(bxInstruction_c *i)
1045 {
1046 BxPackedAvxRegister op1 = BX_READ_AVX_REG(i->src()), result;
1047 unsigned len = i->getVL();
1048 Bit8u order = i->Ib();
1049
1050 for (unsigned n=0; n < len; n++) {
1051 xmm_shufpd(&result.vmm128(n), &op1.vmm128(n), &op1.vmm128(n), order);
1052 order >>= 2;
1053 }
1054
1055 avx512_write_regq_masked(i, &result, len, BX_READ_8BIT_OPMASK(i->opmask()));
1056 BX_NEXT_INSTR(i);
1057 }
1058
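// VSHUFF32x4/VSHUFF64x2 shuffle whole 128-bit lanes: with a 256-bit vector a
// single imm8 bit selects each destination lane, with a 512-bit vector two
// bits select each lane; the low half of the result is taken from src1 and
// the high half from src2.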
1059 void BX_CPP_AttrRegparmN(1) BX_CPU_C::VSHUFF32x4_MASK_VpsHpsWpsIbR(bxInstruction_c *i)
1060 {
1061 BxPackedAvxRegister op1 = BX_READ_AVX_REG(i->src1()), op2 = BX_READ_AVX_REG(i->src2()), result;
1062 unsigned len = i->getVL();
1063 Bit8u order = i->Ib();
1064
1065 if (len == BX_VL256) {
1066 result.vmm128(0) = op1.vmm128(order & 0x1);
1067 result.vmm128(1) = op2.vmm128((order>>1) & 0x1);
1068 }
1069 else {
1070 result.vmm128(0) = op1.vmm128(order & 0x3);
1071 result.vmm128(1) = op1.vmm128((order>>2) & 0x3);
1072 result.vmm128(2) = op2.vmm128((order>>4) & 0x3);
1073 result.vmm128(3) = op2.vmm128((order>>6) & 0x3);
1074 }
1075
1076 if (i->opmask()) {
1077 avx512_write_regd_masked(i, &result, len, BX_READ_16BIT_OPMASK(i->opmask()));
1078 }
1079 else {
1080 BX_WRITE_AVX_REGZ(i->dst(), result, len);
1081 }
1082
1083 BX_NEXT_INSTR(i);
1084 }
1085
1086 void BX_CPP_AttrRegparmN(1) BX_CPU_C::VSHUFF64x2_MASK_VpdHpdWpdIbR(bxInstruction_c *i)
1087 {
1088 BxPackedAvxRegister op1 = BX_READ_AVX_REG(i->src1()), op2 = BX_READ_AVX_REG(i->src2()), result;
1089 unsigned len = i->getVL();
1090 Bit8u order = i->Ib();
1091
1092 if (len == BX_VL256) {
1093 result.vmm128(0) = op1.vmm128(order & 0x1);
1094 result.vmm128(1) = op2.vmm128((order>>1) & 0x1);
1095 }
1096 else {
1097 result.vmm128(0) = op1.vmm128(order & 0x3);
1098 result.vmm128(1) = op1.vmm128((order>>2) & 0x3);
1099 result.vmm128(2) = op2.vmm128((order>>4) & 0x3);
1100 result.vmm128(3) = op2.vmm128((order>>6) & 0x3);
1101 }
1102
1103 if (i->opmask()) {
1104 avx512_write_regq_masked(i, &result, len, BX_READ_8BIT_OPMASK(i->opmask()));
1105 }
1106 else {
1107 BX_WRITE_AVX_REGZ(i->dst(), result, len);
1108 }
1109
1110 BX_NEXT_INSTR(i);
1111 }
1112
1113 void BX_CPP_AttrRegparmN(1) BX_CPU_C::VPALIGNR_MASK_VdqHdqWdqIbR(bxInstruction_c *i)
1114 {
1115 BxPackedAvxRegister op1 = BX_READ_AVX_REG(i->src1()), op2 = BX_READ_AVX_REG(i->src2());
1116 unsigned len = i->getVL();
1117
1118 for (unsigned n=0; n<len; n++)
1119 xmm_palignr(&op2.vmm128(n), &op1.vmm128(n), i->Ib());
1120
1121 avx512_write_regb_masked(i, &op2, len, BX_READ_OPMASK(i->opmask()));
1122
1123 BX_NEXT_INSTR(i);
1124 }
1125
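// VALIGND/VALIGNQ treat {src1, src2} as one double-length vector (src2 in the
// low element positions), shift it right by imm8 elements (modulo the element
// count) and keep the low half: result element n is element (n + shift) of
// the concatenation, so it comes from src2 until the index wraps around and
// from src1 afterwards.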
1126 void BX_CPP_AttrRegparmN(1) BX_CPU_C::VALIGND_MASK_VdqHdqWdqIbR(bxInstruction_c *i)
1127 {
1128 BxPackedAvxRegister op1 = BX_READ_AVX_REG(i->src1()), op2 = BX_READ_AVX_REG(i->src2()), result;
1129 unsigned len = i->getVL(), elements_mask = DWORD_ELEMENTS(len) - 1;
1130 unsigned shift = i->Ib() & elements_mask;
1131
1132 for (unsigned n=0; n <= elements_mask; n++) {
1133 unsigned index = (shift + n) & elements_mask;
1134 result.vmm32u(n) = ((n + shift) <= elements_mask) ? op2.vmm32u(index) : op1.vmm32u(index);
1135 }
1136
1137 if (i->opmask()) {
1138 avx512_write_regd_masked(i, &result, len, BX_READ_16BIT_OPMASK(i->opmask()));
1139 }
1140 else {
1141 BX_WRITE_AVX_REGZ(i->dst(), result, len);
1142 }
1143
1144 BX_NEXT_INSTR(i);
1145 }
1146
1147 void BX_CPP_AttrRegparmN(1) BX_CPU_C::VALIGNQ_MASK_VdqHdqWdqIbR(bxInstruction_c *i)
1148 {
1149 BxPackedAvxRegister op1 = BX_READ_AVX_REG(i->src1()), op2 = BX_READ_AVX_REG(i->src2()), result;
1150 unsigned len = i->getVL(), elements_mask = QWORD_ELEMENTS(len) - 1;
1151 unsigned shift = i->Ib() & elements_mask;
1152
1153 for (unsigned n=0; n <= elements_mask; n++) {
1154 unsigned index = (shift + n) & elements_mask;
1155 result.vmm64u(n) = ((n + shift) <= elements_mask) ? op2.vmm64u(index) : op1.vmm64u(index);
1156 }
1157
1158 if (i->opmask()) {
1159 avx512_write_regq_masked(i, &result, len, BX_READ_8BIT_OPMASK(i->opmask()));
1160 }
1161 else {
1162 BX_WRITE_AVX_REGZ(i->dst(), result, len);
1163 }
1164
1165 BX_NEXT_INSTR(i);
1166 }
1167
1168 void BX_CPP_AttrRegparmN(1) BX_CPU_C::VPERMQ_MASK_VdqWdqIbR(bxInstruction_c *i)
1169 {
1170 BxPackedAvxRegister op = BX_READ_AVX_REG(i->src()), result;
1171 Bit8u control = i->Ib();
1172 unsigned len = i->getVL();
1173
1174 ymm_vpermq(&result.vmm256(0), &op.vmm256(0), control);
1175 if (len == BX_VL512)
1176 ymm_vpermq(&result.vmm256(1), &op.vmm256(1), control);
1177
1178 if (i->opmask()) {
1179 avx512_write_regq_masked(i, &result, len, BX_READ_8BIT_OPMASK(i->opmask()));
1180 }
1181 else {
1182 BX_WRITE_AVX_REGZ(i->dst(), result, len);
1183 }
1184
1185 BX_NEXT_INSTR(i);
1186 }
1187
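// VPERMT2x and VPERMI2x permute across a pair of table registers. For
// VPERMT2x the indices come from src1 and the two tables are src2 and the
// destination; for VPERMI2x the indices come from the destination and the
// tables are src2 and src1. The low index bits select the element, the next
// bit selects which of the two table registers supplies it.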
1188 void BX_CPP_AttrRegparmN(1) BX_CPU_C::VPERMT2B_MASK_VdqHdqWdqR(bxInstruction_c *i)
1189 {
1190 BxPackedAvxRegister op1 = BX_READ_AVX_REG(i->src1());
1191 BxPackedAvxRegister op2 = BX_READ_AVX_REG(i->src2());
1192 BxPackedAvxRegister dst = BX_READ_AVX_REG(i->dst()), result;
1193 unsigned len = i->getVL(), elements = BYTE_ELEMENTS(len);
1194 unsigned shuffle_control_mask = elements - 1;
1195
1196 for (unsigned n=0; n < elements; n++) {
1197 unsigned shuffle_control = (unsigned) (op1.vmmubyte(n) & shuffle_control_mask);
1198 result.vmmubyte(n) = (op1.vmmubyte(n) & elements) ? op2.vmmubyte(shuffle_control) : dst.vmmubyte(shuffle_control);
1199 }
1200
1201 if (i->opmask()) {
1202 avx512_write_regb_masked(i, &result, len, BX_READ_OPMASK(i->opmask()));
1203 }
1204 else {
1205 BX_WRITE_AVX_REGZ(i->dst(), result, len);
1206 }
1207
1208 BX_NEXT_INSTR(i);
1209 }
1210
1211 void BX_CPP_AttrRegparmN(1) BX_CPU_C::VPERMT2W_MASK_VdqHdqWdqR(bxInstruction_c *i)
1212 {
1213 BxPackedAvxRegister op1 = BX_READ_AVX_REG(i->src1());
1214 BxPackedAvxRegister op2 = BX_READ_AVX_REG(i->src2());
1215 BxPackedAvxRegister dst = BX_READ_AVX_REG(i->dst()), result;
1216 unsigned len = i->getVL(), elements = WORD_ELEMENTS(len);
1217 unsigned shuffle_control_mask = elements - 1;
1218
1219 for (unsigned n=0; n < elements; n++) {
1220 unsigned shuffle_control = (unsigned) (op1.vmm16u(n) & shuffle_control_mask);
1221 result.vmm16u(n) = (op1.vmm16u(n) & elements) ? op2.vmm16u(shuffle_control) : dst.vmm16u(shuffle_control);
1222 }
1223
1224 if (i->opmask()) {
1225 avx512_write_regw_masked(i, &result, len, BX_READ_32BIT_OPMASK(i->opmask()));
1226 }
1227 else {
1228 BX_WRITE_AVX_REGZ(i->dst(), result, len);
1229 }
1230
1231 BX_NEXT_INSTR(i);
1232 }
1233
1234 void BX_CPP_AttrRegparmN(1) BX_CPU_C::VPERMT2PS_MASK_VpsHpsWpsR(bxInstruction_c *i)
1235 {
1236 BxPackedAvxRegister op1 = BX_READ_AVX_REG(i->src1());
1237 BxPackedAvxRegister op2 = BX_READ_AVX_REG(i->src2());
1238 BxPackedAvxRegister dst = BX_READ_AVX_REG(i->dst()), result;
1239 unsigned len = i->getVL(), elements = DWORD_ELEMENTS(len);
1240 unsigned shuffle_control_mask = elements - 1;
1241
1242 for (unsigned n=0; n < elements; n++) {
1243 unsigned shuffle_control = (unsigned) (op1.vmm32u(n) & shuffle_control_mask);
1244 result.vmm32u(n) = (op1.vmm32u(n) & elements) ? op2.vmm32u(shuffle_control) : dst.vmm32u(shuffle_control);
1245 }
1246
1247 if (i->opmask()) {
1248 avx512_write_regd_masked(i, &result, len, BX_READ_16BIT_OPMASK(i->opmask()));
1249 }
1250 else {
1251 BX_WRITE_AVX_REGZ(i->dst(), result, len);
1252 }
1253
1254 BX_NEXT_INSTR(i);
1255 }
1256
1257 void BX_CPP_AttrRegparmN(1) BX_CPU_C::VPERMT2PD_MASK_VpdHpdWpdR(bxInstruction_c *i)
1258 {
1259 BxPackedAvxRegister op1 = BX_READ_AVX_REG(i->src1());
1260 BxPackedAvxRegister op2 = BX_READ_AVX_REG(i->src2());
1261 BxPackedAvxRegister dst = BX_READ_AVX_REG(i->dst()), result;
1262 unsigned len = i->getVL(), elements = QWORD_ELEMENTS(len);
1263 unsigned shuffle_control_mask = elements - 1;
1264
1265 for (unsigned n=0; n < elements; n++) {
1266 unsigned shuffle_control = (unsigned) (op1.vmm64u(n) & shuffle_control_mask);
1267 result.vmm64u(n) = (op1.vmm64u(n) & elements) ? op2.vmm64u(shuffle_control) : dst.vmm64u(shuffle_control);
1268 }
1269
1270 if (i->opmask()) {
1271 avx512_write_regq_masked(i, &result, len, BX_READ_8BIT_OPMASK(i->opmask()));
1272 }
1273 else {
1274 BX_WRITE_AVX_REGZ(i->dst(), result, len);
1275 }
1276
1277 BX_NEXT_INSTR(i);
1278 }
1279
1280 void BX_CPP_AttrRegparmN(1) BX_CPU_C::VPERMI2B_MASK_VdqHdqWdqR(bxInstruction_c *i)
1281 {
1282 BxPackedAvxRegister op1 = BX_READ_AVX_REG(i->src1());
1283 BxPackedAvxRegister op2 = BX_READ_AVX_REG(i->src2());
1284 BxPackedAvxRegister dst = BX_READ_AVX_REG(i->dst()), result;
1285 unsigned len = i->getVL(), elements = BYTE_ELEMENTS(len);
1286 unsigned shuffle_control_mask = elements - 1;
1287
1288 for (unsigned n=0; n < elements; n++) {
1289 unsigned shuffle_control = (unsigned) (dst.vmmubyte(n) & shuffle_control_mask);
1290 result.vmmubyte(n) = (dst.vmmubyte(n) & elements) ? op2.vmmubyte(shuffle_control) : op1.vmmubyte(shuffle_control);
1291 }
1292
1293 if (i->opmask()) {
1294 avx512_write_regb_masked(i, &result, len, BX_READ_OPMASK(i->opmask()));
1295 }
1296 else {
1297 BX_WRITE_AVX_REGZ(i->dst(), result, len);
1298 }
1299
1300 BX_NEXT_INSTR(i);
1301 }
1302
1303 void BX_CPP_AttrRegparmN(1) BX_CPU_C::VPERMI2W_MASK_VdqHdqWdqR(bxInstruction_c *i)
1304 {
1305 BxPackedAvxRegister op1 = BX_READ_AVX_REG(i->src1());
1306 BxPackedAvxRegister op2 = BX_READ_AVX_REG(i->src2());
1307 BxPackedAvxRegister dst = BX_READ_AVX_REG(i->dst()), result;
1308 unsigned len = i->getVL(), elements = WORD_ELEMENTS(len);
1309 unsigned shuffle_control_mask = elements - 1;
1310
1311 for (unsigned n=0; n < elements; n++) {
1312 unsigned shuffle_control = (unsigned) (dst.vmm16u(n) & shuffle_control_mask);
1313 result.vmm16u(n) = (dst.vmm16u(n) & elements) ? op2.vmm16u(shuffle_control) : op1.vmm16u(shuffle_control);
1314 }
1315
1316 if (i->opmask()) {
1317 avx512_write_regw_masked(i, &result, len, BX_READ_32BIT_OPMASK(i->opmask()));
1318 }
1319 else {
1320 BX_WRITE_AVX_REGZ(i->dst(), result, len);
1321 }
1322
1323 BX_NEXT_INSTR(i);
1324 }
1325
1326 void BX_CPP_AttrRegparmN(1) BX_CPU_C::VPERMI2PS_MASK_VpsHpsWpsR(bxInstruction_c *i)
1327 {
1328 BxPackedAvxRegister op1 = BX_READ_AVX_REG(i->src1());
1329 BxPackedAvxRegister op2 = BX_READ_AVX_REG(i->src2());
1330 BxPackedAvxRegister dst = BX_READ_AVX_REG(i->dst()), result;
1331 unsigned len = i->getVL(), elements = DWORD_ELEMENTS(len);
1332 unsigned shuffle_control_mask = elements - 1;
1333
1334 for (unsigned n=0; n < elements; n++) {
1335 unsigned shuffle_control = (unsigned) (dst.vmm32u(n) & shuffle_control_mask);
1336 result.vmm32u(n) = (dst.vmm32u(n) & elements) ? op2.vmm32u(shuffle_control) : op1.vmm32u(shuffle_control);
1337 }
1338
1339 if (i->opmask()) {
1340 avx512_write_regd_masked(i, &result, len, BX_READ_16BIT_OPMASK(i->opmask()));
1341 }
1342 else {
1343 BX_WRITE_AVX_REGZ(i->dst(), result, len);
1344 }
1345
1346 BX_NEXT_INSTR(i);
1347 }
1348
1349 void BX_CPP_AttrRegparmN(1) BX_CPU_C::VPERMI2PD_MASK_VpdHpdWpdR(bxInstruction_c *i)
1350 {
1351 BxPackedAvxRegister op1 = BX_READ_AVX_REG(i->src1());
1352 BxPackedAvxRegister op2 = BX_READ_AVX_REG(i->src2());
1353 BxPackedAvxRegister dst = BX_READ_AVX_REG(i->dst()), result;
1354 unsigned len = i->getVL(), elements = QWORD_ELEMENTS(len);
1355 unsigned shuffle_control_mask = elements - 1;
1356
1357 for (unsigned n=0; n < elements; n++) {
1358 unsigned shuffle_control = (unsigned) (dst.vmm64u(n) & shuffle_control_mask);
1359 result.vmm64u(n) = (dst.vmm64u(n) & elements) ? op2.vmm64u(shuffle_control) : op1.vmm64u(shuffle_control);
1360 }
1361
1362 if (i->opmask()) {
1363 avx512_write_regq_masked(i, &result, len, BX_READ_8BIT_OPMASK(i->opmask()));
1364 }
1365 else {
1366 BX_WRITE_AVX_REGZ(i->dst(), result, len);
1367 }
1368
1369 BX_NEXT_INSTR(i);
1370 }
1371
1372 void BX_CPP_AttrRegparmN(1) BX_CPU_C::VPERMB_MASK_VdqHdqWdqR(bxInstruction_c *i)
1373 {
1374 BxPackedAvxRegister op1 = BX_READ_AVX_REG(i->src1());
1375 BxPackedAvxRegister op2 = BX_READ_AVX_REG(i->src2()), result;
1376 unsigned len = i->getVL(), elements = BYTE_ELEMENTS(len);
1377 unsigned shuffle_control_mask = elements - 1;
1378
1379 for (unsigned n=0;n < elements;n++)
1380 result.vmmubyte(n) = op2.vmmubyte(op1.vmmubyte(n) & shuffle_control_mask);
1381
1382 if (i->opmask()) {
1383 avx512_write_regb_masked(i, &result, len, BX_READ_OPMASK(i->opmask()));
1384 }
1385 else {
1386 BX_WRITE_AVX_REGZ(i->dst(), result, len);
1387 }
1388
1389 BX_NEXT_INSTR(i);
1390 }
1391
1392 void BX_CPP_AttrRegparmN(1) BX_CPU_C::VPERMW_MASK_VdqHdqWdqR(bxInstruction_c *i)
1393 {
1394 BxPackedAvxRegister op1 = BX_READ_AVX_REG(i->src1());
1395 BxPackedAvxRegister op2 = BX_READ_AVX_REG(i->src2()), result;
1396 unsigned len = i->getVL(), elements = WORD_ELEMENTS(len);
1397 unsigned shuffle_control_mask = elements - 1;
1398
1399 for (unsigned n=0;n < elements;n++)
1400 result.vmm16u(n) = op2.vmm16u(op1.vmm16u(n) & shuffle_control_mask);
1401
1402 if (i->opmask()) {
1403 avx512_write_regw_masked(i, &result, len, BX_READ_32BIT_OPMASK(i->opmask()));
1404 }
1405 else {
1406 BX_WRITE_AVX_REGZ(i->dst(), result, len);
1407 }
1408
1409 BX_NEXT_INSTR(i);
1410 }
1411
1412 void BX_CPP_AttrRegparmN(1) BX_CPU_C::VPERMPS_MASK_VpsHpsWpsR(bxInstruction_c *i)
1413 {
1414 BxPackedAvxRegister op1 = BX_READ_AVX_REG(i->src1());
1415 BxPackedAvxRegister op2 = BX_READ_AVX_REG(i->src2()), result;
1416 unsigned len = i->getVL(), elements = DWORD_ELEMENTS(len);
1417 unsigned shuffle_control_mask = elements - 1;
1418
1419 for (unsigned n=0;n < elements;n++)
1420 result.vmm32u(n) = op2.vmm32u(op1.vmm32u(n) & shuffle_control_mask);
1421
1422 if (i->opmask()) {
1423 avx512_write_regd_masked(i, &result, len, BX_READ_16BIT_OPMASK(i->opmask()));
1424 }
1425 else {
1426 BX_WRITE_AVX_REGZ(i->dst(), result, len);
1427 }
1428
1429 BX_NEXT_INSTR(i);
1430 }
1431
1432 void BX_CPP_AttrRegparmN(1) BX_CPU_C::VPERMPD_MASK_VpdHpdWpdR(bxInstruction_c *i)
1433 {
1434 BxPackedAvxRegister op1 = BX_READ_AVX_REG(i->src1());
1435 BxPackedAvxRegister op2 = BX_READ_AVX_REG(i->src2()), result;
1436 unsigned len = i->getVL(), elements = QWORD_ELEMENTS(len);
1437 unsigned shuffle_control_mask = elements - 1;
1438
1439 for (unsigned n=0;n < elements;n++)
1440 result.vmm64u(n) = op2.vmm64u(op1.vmm64u(n) & shuffle_control_mask);
1441
1442 if (i->opmask()) {
1443 avx512_write_regq_masked(i, &result, len, BX_READ_8BIT_OPMASK(i->opmask()));
1444 }
1445 else {
1446 BX_WRITE_AVX_REGZ(i->dst(), result, len);
1447 }
1448
1449 BX_NEXT_INSTR(i);
1450 }
1451
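// The VINSERTFxxx handlers replace the selected 128-bit or 256-bit chunk of
// src1 with src2 and write the result back under the opmask; the VEXTRACTFxxx
// handlers copy the selected chunk out, either blending it into the low part
// of the destination register element by element or storing it to memory
// through a masked store.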
1452 void BX_CPP_AttrRegparmN(1) BX_CPU_C::VINSERTF32x4_MASK_VpsHpsWpsIbR(bxInstruction_c *i)
1453 {
1454 BxPackedAvxRegister op = BX_READ_AVX_REG(i->src1());
1455 unsigned len = i->getVL();
1456 unsigned offset = i->Ib() & (len-1);
1457
1458 op.vmm128(offset) = BX_READ_XMM_REG(i->src2());
1459 avx512_write_regd_masked(i, &op, len, BX_READ_16BIT_OPMASK(i->opmask()));
1460 BX_NEXT_INSTR(i);
1461 }
1462
1463 void BX_CPP_AttrRegparmN(1) BX_CPU_C::VINSERTF64x2_MASK_VpdHpdWpdIbR(bxInstruction_c *i)
1464 {
1465 BxPackedAvxRegister op = BX_READ_AVX_REG(i->src1());
1466 unsigned len = i->getVL();
1467 unsigned offset = i->Ib() & (len-1);
1468
1469 op.vmm128(offset) = BX_READ_XMM_REG(i->src2());
1470 avx512_write_regq_masked(i, &op, len, BX_READ_8BIT_OPMASK(i->opmask()));
1471
1472 BX_NEXT_INSTR(i);
1473 }
1474
1475 void BX_CPP_AttrRegparmN(1) BX_CPU_C::VINSERTF64x4_VpdHpdWpdIbR(bxInstruction_c *i)
1476 {
1477 BxPackedAvxRegister op = BX_READ_AVX_REG(i->src1());
1478 op.vmm256(i->Ib() & 0x1) = BX_READ_YMM_REG(i->src2());
1479 BX_WRITE_AVX_REGZ(i->dst(), op, BX_VL512);
1480
1481 BX_NEXT_INSTR(i);
1482 }
1483
1484 void BX_CPP_AttrRegparmN(1) BX_CPU_C::VINSERTF64x4_MASK_VpdHpdWpdIbR(bxInstruction_c *i)
1485 {
1486 BxPackedAvxRegister op = BX_READ_AVX_REG(i->src1());
1487 op.vmm256(i->Ib() & 0x1) = BX_READ_YMM_REG(i->src2());
1488 avx512_write_regq_masked(i, &op, BX_VL512, BX_READ_8BIT_OPMASK(i->opmask()));
1489
1490 BX_NEXT_INSTR(i);
1491 }
1492
1493 void BX_CPP_AttrRegparmN(1) BX_CPU_C::VINSERTF32x8_MASK_VpsHpsWpsIbR(bxInstruction_c *i)
1494 {
1495 BxPackedAvxRegister op = BX_READ_AVX_REG(i->src1());
1496 op.vmm256(i->Ib() & 0x1) = BX_READ_YMM_REG(i->src2());
1497 avx512_write_regd_masked(i, &op, BX_VL512, BX_READ_16BIT_OPMASK(i->opmask()));
1498
1499 BX_NEXT_INSTR(i);
1500 }
1501
1502 void BX_CPP_AttrRegparmN(1) BX_CPU_C::VEXTRACTF32x4_MASK_WpsVpsIbR(bxInstruction_c *i)
1503 {
1504 unsigned len = i->getVL(), offset = i->Ib() & (len - 1);
1505 BxPackedXmmRegister op = BX_READ_AVX_REG_LANE(i->src(), offset);
1506
1507 Bit32u mask = BX_READ_8BIT_OPMASK(i->opmask());
1508
1509 if (i->isZeroMasking())
1510 xmm_zero_blendps(&BX_READ_XMM_REG(i->dst()), &op, mask);
1511 else
1512 xmm_blendps(&BX_READ_XMM_REG(i->dst()), &op, mask);
1513
1514 BX_CLEAR_AVX_HIGH128(i->dst());
1515 BX_NEXT_INSTR(i);
1516 }
1517
1518 void BX_CPP_AttrRegparmN(1) BX_CPU_C::VEXTRACTF32x4_MASK_WpsVpsIbM(bxInstruction_c *i)
1519 {
1520 unsigned len = i->getVL(), offset = i->Ib() & (len - 1);
1521 BxPackedAvxRegister op;
1522 op.vmm128(0) = BX_READ_AVX_REG_LANE(i->src(), offset);
1523
1524 Bit32u opmask = BX_READ_8BIT_OPMASK(i->opmask()) & 0xf;
1525 bx_address eaddr = BX_CPU_RESOLVE_ADDR(i);
1526 avx_masked_store32(i, eaddr, &op, opmask);
1527
1528 BX_NEXT_INSTR(i);
1529 }
1530
VEXTRACTF64x2_MASK_WpdVpdIbR(bxInstruction_c * i)1531 void BX_CPP_AttrRegparmN(1) BX_CPU_C::VEXTRACTF64x2_MASK_WpdVpdIbR(bxInstruction_c *i)
1532 {
1533 unsigned len = i->getVL(), offset = i->Ib() & (len - 1);
1534 BxPackedXmmRegister op = BX_READ_AVX_REG_LANE(i->src(), offset);
1535
1536 Bit32u mask = BX_READ_8BIT_OPMASK(i->opmask());
1537
1538 if (i->isZeroMasking())
1539 xmm_zero_blendpd(&BX_READ_XMM_REG(i->dst()), &op, mask);
1540 else
1541 xmm_blendpd(&BX_READ_XMM_REG(i->dst()), &op, mask);
1542
1543 BX_CLEAR_AVX_HIGH128(i->dst());
1544 BX_NEXT_INSTR(i);
1545 }
1546
VEXTRACTF64x2_MASK_WpdVpdIbM(bxInstruction_c * i)1547 void BX_CPP_AttrRegparmN(1) BX_CPU_C::VEXTRACTF64x2_MASK_WpdVpdIbM(bxInstruction_c *i)
1548 {
1549 unsigned len = i->getVL(), offset = i->Ib() & (len - 1);
1550 BxPackedAvxRegister op;
1551 op.vmm128(0) = BX_READ_AVX_REG_LANE(i->src(), offset);
1552
1553 Bit32u opmask = BX_READ_8BIT_OPMASK(i->opmask()) & 0x3;
1554 bx_address eaddr = BX_CPU_RESOLVE_ADDR(i);
1555 avx_masked_store64(i, eaddr, &op, opmask);
1556
1557 BX_NEXT_INSTR(i);
1558 }
1559
VEXTRACTF64x4_WpdVpdIbR(bxInstruction_c * i)1560 void BX_CPP_AttrRegparmN(1) BX_CPU_C::VEXTRACTF64x4_WpdVpdIbR(bxInstruction_c *i)
1561 {
1562 BxPackedAvxRegister op = BX_READ_AVX_REG(i->src());
1563 BX_WRITE_YMM_REGZ(i->dst(), op.vmm256(i->Ib() & 0x1));
1564 BX_NEXT_INSTR(i);
1565 }
1566
VEXTRACTF64x4_MASK_WpdVpdIbR(bxInstruction_c * i)1567 void BX_CPP_AttrRegparmN(1) BX_CPU_C::VEXTRACTF64x4_MASK_WpdVpdIbR(bxInstruction_c *i)
1568 {
1569 BxPackedAvxRegister op = BX_READ_AVX_REG(i->src());
1570 if (i->Ib() & 0x1)
1571 op.vmm256(0) = op.vmm256(1);
1572
1573 avx512_write_regq_masked(i, &op, BX_VL256, BX_READ_8BIT_OPMASK(i->opmask()));
1574 BX_NEXT_INSTR(i);
1575 }
1576
VEXTRACTF64x4_WpdVpdIbM(bxInstruction_c * i)1577 void BX_CPP_AttrRegparmN(1) BX_CPU_C::VEXTRACTF64x4_WpdVpdIbM(bxInstruction_c *i)
1578 {
1579 BxPackedAvxRegister op = BX_READ_AVX_REG(i->src());
1580 bx_address eaddr = BX_CPU_RESOLVE_ADDR(i);
1581 write_virtual_ymmword(i->seg(), eaddr, &op.vmm256(i->Ib() & 0x1));
1582 BX_NEXT_INSTR(i);
1583 }
1584
VEXTRACTF64x4_MASK_WpdVpdIbM(bxInstruction_c * i)1585 void BX_CPP_AttrRegparmN(1) BX_CPU_C::VEXTRACTF64x4_MASK_WpdVpdIbM(bxInstruction_c *i)
1586 {
1587 BxPackedAvxRegister op = BX_READ_AVX_REG(i->src());
1588 if (i->Ib() & 0x1)
1589 op.vmm256(0) = op.vmm256(1);
1590
1591 Bit32u opmask = BX_READ_8BIT_OPMASK(i->opmask()) & 0xf;
1592
1593 bx_address eaddr = BX_CPU_RESOLVE_ADDR(i);
1594 avx_masked_store64(i, eaddr, &op, opmask);
1595 BX_NEXT_INSTR(i);
1596 }
1597
VEXTRACTF32x8_MASK_WpsVpsIbR(bxInstruction_c * i)1598 void BX_CPP_AttrRegparmN(1) BX_CPU_C::VEXTRACTF32x8_MASK_WpsVpsIbR(bxInstruction_c *i)
1599 {
1600 BxPackedAvxRegister op = BX_READ_AVX_REG(i->src());
1601 if (i->Ib() & 0x1)
1602 op.vmm256(0) = op.vmm256(1);
1603
1604 avx512_write_regd_masked(i, &op, BX_VL256, BX_READ_8BIT_OPMASK(i->opmask()));
1605 BX_NEXT_INSTR(i);
1606 }
1607
VEXTRACTF32x8_MASK_WpsVpsIbM(bxInstruction_c * i)1608 void BX_CPP_AttrRegparmN(1) BX_CPU_C::VEXTRACTF32x8_MASK_WpsVpsIbM(bxInstruction_c *i)
1609 {
1610 BxPackedAvxRegister op = BX_READ_AVX_REG(i->src());
1611 if (i->Ib() & 0x1)
1612 op.vmm256(0) = op.vmm256(1);
1613
1614 Bit32u opmask = BX_READ_8BIT_OPMASK(i->opmask());
1615 bx_address eaddr = BX_CPU_RESOLVE_ADDR(i);
1616 avx_masked_store32(i, eaddr, &op, opmask);
1617 BX_NEXT_INSTR(i);
1618 }
1619
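// VMOVDDUP duplicates the even-indexed qword of each pair; VMOVSLDUP/VMOVSHDUP
// duplicate the even/odd dword of each pair, written through the masked helpers.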
void BX_CPP_AttrRegparmN(1) BX_CPU_C::VMOVDDUP_MASK_VpdWpdR(bxInstruction_c *i)
{
  BxPackedAvxRegister op = BX_READ_AVX_REG(i->src());
  unsigned len = i->getVL();

  for (unsigned n=0; n < QWORD_ELEMENTS(len); n+=2) {
    op.vmm64u(n+1) = op.vmm64u(n);
  }

  avx512_write_regq_masked(i, &op, len, BX_READ_8BIT_OPMASK(i->opmask()));

  BX_NEXT_INSTR(i);
}

void BX_CPP_AttrRegparmN(1) BX_CPU_C::VMOVSLDUP_MASK_VpsWpsR(bxInstruction_c *i)
{
  BxPackedAvxRegister op = BX_READ_AVX_REG(i->src());
  unsigned len = i->getVL();

  for (unsigned n=0; n < DWORD_ELEMENTS(len); n+=2) {
    op.vmm32u(n+1) = op.vmm32u(n);
  }

  avx512_write_regd_masked(i, &op, len, BX_READ_16BIT_OPMASK(i->opmask()));

  BX_NEXT_INSTR(i);
}

void BX_CPP_AttrRegparmN(1) BX_CPU_C::VMOVSHDUP_MASK_VpsWpsR(bxInstruction_c *i)
{
  BxPackedAvxRegister op = BX_READ_AVX_REG(i->src());
  unsigned len = i->getVL();

  for (unsigned n=0; n < DWORD_ELEMENTS(len); n+=2) {
    op.vmm32u(n) = op.vmm32u(n+1);
  }

  avx512_write_regd_masked(i, &op, len, BX_READ_16BIT_OPMASK(i->opmask()));

  BX_NEXT_INSTR(i);
}

// special bit operations

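// For every bit position, the three source bits (op1:op2:op3) form a 3-bit index
// into the imm8 truth table; the selected imm8 bit becomes the result bit.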
BX_CPP_INLINE Bit32u ternlogd_scalar(Bit32u op1, Bit32u op2, Bit32u op3, unsigned imm8)
{
  Bit32u result = 0;

  for (unsigned bit = 0; bit < 32; bit++) {
    unsigned tmp = (op1 >> bit) & 0x1;
    tmp <<= 1;
    tmp |= (op2 >> bit) & 0x1;
    tmp <<= 1;
    tmp |= (op3 >> bit) & 0x1;

    result |= ((Bit32u)((imm8 >> tmp) & 0x1)) << bit;
  }

  return result;
}

BX_CPP_INLINE Bit64u ternlogq_scalar(Bit64u op1, Bit64u op2, Bit64u op3, unsigned imm8)
{
  Bit64u result = 0;

  for (unsigned bit = 0; bit < 64; bit++) {
    unsigned tmp = (op1 >> bit) & 0x1;
    tmp <<= 1;
    tmp |= (op2 >> bit) & 0x1;
    tmp <<= 1;
    tmp |= (op3 >> bit) & 0x1;

    result |= ((Bit64u)((imm8 >> tmp) & 0x1)) << bit;
  }

  return result;
}

void BX_CPP_AttrRegparmN(1) BX_CPU_C::VPTERNLOGD_VdqHdqWdqIbR(bxInstruction_c *i)
{
  BxPackedAvxRegister op1 = BX_READ_AVX_REG(i->dst()),
                      op2 = BX_READ_AVX_REG(i->src1()),
                      op3 = BX_READ_AVX_REG(i->src2());

  unsigned len = i->getVL(), num_elements = DWORD_ELEMENTS(len);
  Bit8u imm8 = i->Ib();

  for (unsigned n=0; n < num_elements; n++) {
    op1.vmm32u(n) = ternlogd_scalar(op1.vmm32u(n), op2.vmm32u(n), op3.vmm32u(n), imm8);
  }

  BX_WRITE_AVX_REGZ(i->dst(), op1, len);
  BX_NEXT_INSTR(i);
}

void BX_CPP_AttrRegparmN(1) BX_CPU_C::VPTERNLOGD_MASK_VdqHdqWdqIbR(bxInstruction_c *i)
{
  BxPackedAvxRegister op1 = BX_READ_AVX_REG(i->dst()),
                      op2 = BX_READ_AVX_REG(i->src1()),
                      op3 = BX_READ_AVX_REG(i->src2());

  unsigned len = i->getVL(), num_elements = DWORD_ELEMENTS(len);
  Bit8u imm8 = i->Ib();

  Bit32u opmask = BX_READ_16BIT_OPMASK(i->opmask());

  for (unsigned n=0; n < num_elements; n++, opmask >>= 1) {
    if (opmask & 0x1)
      op1.vmm32u(n) = ternlogd_scalar(op1.vmm32u(n), op2.vmm32u(n), op3.vmm32u(n), imm8);
    else
      if (i->isZeroMasking()) op1.vmm32u(n) = 0;
  }

  BX_WRITE_AVX_REGZ(i->dst(), op1, len);
  BX_NEXT_INSTR(i);
}

void BX_CPP_AttrRegparmN(1) BX_CPU_C::VPTERNLOGQ_VdqHdqWdqIbR(bxInstruction_c *i)
{
  BxPackedAvxRegister op1 = BX_READ_AVX_REG(i->dst()),
                      op2 = BX_READ_AVX_REG(i->src1()),
                      op3 = BX_READ_AVX_REG(i->src2());

  unsigned len = i->getVL(), num_elements = QWORD_ELEMENTS(len);
  Bit8u imm8 = i->Ib();

  for (unsigned n=0; n < num_elements; n++) {
    op1.vmm64u(n) = ternlogq_scalar(op1.vmm64u(n), op2.vmm64u(n), op3.vmm64u(n), imm8);
  }

  BX_WRITE_AVX_REGZ(i->dst(), op1, len);
  BX_NEXT_INSTR(i);
}

void BX_CPP_AttrRegparmN(1) BX_CPU_C::VPTERNLOGQ_MASK_VdqHdqWdqIbR(bxInstruction_c *i)
{
  BxPackedAvxRegister op1 = BX_READ_AVX_REG(i->dst()),
                      op2 = BX_READ_AVX_REG(i->src1()),
                      op3 = BX_READ_AVX_REG(i->src2());

  unsigned len = i->getVL(), num_elements = QWORD_ELEMENTS(len);
  Bit8u imm8 = i->Ib();

  Bit32u opmask = BX_READ_8BIT_OPMASK(i->opmask());

  for (unsigned n=0; n < num_elements; n++, opmask >>= 1) {
    if (opmask & 0x1)
      op1.vmm64u(n) = ternlogq_scalar(op1.vmm64u(n), op2.vmm64u(n), op3.vmm64u(n), imm8);
    else
      if (i->isZeroMasking()) op1.vmm64u(n) = 0;
  }

  BX_WRITE_AVX_REGZ(i->dst(), op1, len);
  BX_NEXT_INSTR(i);
}

// blend

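// Blend-with-mask: each destination element takes src2 where the opmask bit is
// set and src1 otherwise; with zero-masking the unselected elements are cleared.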
void BX_CPP_AttrRegparmN(1) BX_CPU_C::VPBLENDMB_MASK_VdqHdqWdqR(bxInstruction_c *i)
{
  BxPackedAvxRegister op1 = BX_READ_AVX_REG(i->src1()), op2 = BX_READ_AVX_REG(i->src2());
  unsigned len = i->getVL();

  Bit64u opmask = (i->opmask() != 0) ? BX_READ_OPMASK(i->opmask()) : BX_CONST64(0xffffffffffffffff);

  if (i->isZeroMasking()) {
    for (unsigned n=0; n < len; n++, opmask >>= 16)
      xmm_zero_pblendb(&op1.vmm128(n), &op2.vmm128(n), opmask);
  }
  else {
    for (unsigned n=0; n < len; n++, opmask >>= 16)
      xmm_pblendb(&op1.vmm128(n), &op2.vmm128(n), opmask);
  }

  BX_WRITE_AVX_REGZ(i->dst(), op1, len);
  BX_NEXT_INSTR(i);
}

void BX_CPP_AttrRegparmN(1) BX_CPU_C::VPBLENDMW_MASK_VdqHdqWdqR(bxInstruction_c *i)
{
  BxPackedAvxRegister op1 = BX_READ_AVX_REG(i->src1()), op2 = BX_READ_AVX_REG(i->src2());
  unsigned len = i->getVL();

  Bit32u opmask = (i->opmask() != 0) ? BX_READ_32BIT_OPMASK(i->opmask()) : 0xffffffff;

  if (i->isZeroMasking()) {
    for (unsigned n=0; n < len; n++, opmask >>= 8)
      xmm_zero_pblendw(&op1.vmm128(n), &op2.vmm128(n), opmask);
  }
  else {
    for (unsigned n=0; n < len; n++, opmask >>= 8)
      xmm_pblendw(&op1.vmm128(n), &op2.vmm128(n), opmask);
  }

  BX_WRITE_AVX_REGZ(i->dst(), op1, len);
  BX_NEXT_INSTR(i);
}

void BX_CPP_AttrRegparmN(1) BX_CPU_C::VBLENDMPS_MASK_VpsHpsWpsR(bxInstruction_c *i)
{
  BxPackedAvxRegister op1 = BX_READ_AVX_REG(i->src1()), op2 = BX_READ_AVX_REG(i->src2());
  unsigned len = i->getVL();

  Bit32u opmask = (i->opmask() != 0) ? BX_READ_16BIT_OPMASK(i->opmask()) : 0xffff;

  if (i->isZeroMasking()) {
    for (unsigned n=0; n < len; n++, opmask >>= 4)
      xmm_zero_blendps(&op1.vmm128(n), &op2.vmm128(n), opmask);
  }
  else {
    for (unsigned n=0; n < len; n++, opmask >>= 4)
      xmm_blendps(&op1.vmm128(n), &op2.vmm128(n), opmask);
  }

  BX_WRITE_AVX_REGZ(i->dst(), op1, len);
  BX_NEXT_INSTR(i);
}

void BX_CPP_AttrRegparmN(1) BX_CPU_C::VBLENDMPD_MASK_VpdHpdWpdR(bxInstruction_c *i)
{
  BxPackedAvxRegister op1 = BX_READ_AVX_REG(i->src1()), op2 = BX_READ_AVX_REG(i->src2());
  unsigned len = i->getVL();

  Bit32u opmask = (i->opmask() != 0) ? BX_READ_8BIT_OPMASK(i->opmask()) : 0xff;

  if (i->isZeroMasking()) {
    for (unsigned n=0; n < len; n++, opmask >>= 2)
      xmm_zero_blendpd(&op1.vmm128(n), &op2.vmm128(n), opmask);
  }
  else {
    for (unsigned n=0; n < len; n++, opmask >>= 2)
      xmm_blendpd(&op1.vmm128(n), &op2.vmm128(n), opmask);
  }

  BX_WRITE_AVX_REGZ(i->dst(), op1, len);
  BX_NEXT_INSTR(i);
}

// compress, expand

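// Expand reads contiguous source elements and scatters them to the destination
// positions whose opmask bit is set; compress gathers the opmask-selected
// elements and packs them contiguously at the bottom of the destination.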
void BX_CPP_AttrRegparmN(1) BX_CPU_C::VPEXPANDB_MASK_VdqWdqR(bxInstruction_c *i)
{
  BxPackedAvxRegister op = BX_READ_AVX_REG(i->src()), result;

  Bit64u opmask = BX_READ_OPMASK(i->opmask()), mask = opmask;
  unsigned len = i->getVL(), n = 0, k = 0;

  for (; n < len*16; n++, mask >>= 1) {
    if (mask & 0x1) {
      result.vmmubyte(n) = op.vmmubyte(k);
      k++;
    }
    else {
      result.vmmubyte(n) = 0;
    }
  }

  if (i->isZeroMasking()) {
    BX_WRITE_AVX_REGZ(i->dst(), result, len);
  }
  else {
    for (unsigned n=0; n < len; n++, opmask >>= 16)
      xmm_pblendb(&BX_READ_AVX_REG_LANE(i->dst(), n), &result.vmm128(n), opmask);

    BX_CLEAR_AVX_REGZ(i->dst(), len);
  }

  BX_NEXT_INSTR(i);
}

void BX_CPP_AttrRegparmN(1) BX_CPU_C::VPEXPANDW_MASK_VdqWdqR(bxInstruction_c *i)
{
  BxPackedAvxRegister op = BX_READ_AVX_REG(i->src()), result;

  Bit32u opmask = BX_READ_32BIT_OPMASK(i->opmask()), mask = opmask;
  unsigned len = i->getVL(), n = 0, k = 0;

  for (; n < len*8; n++, mask >>= 1) {
    if (mask & 0x1) {
      result.vmm16u(n) = op.vmm16u(k);
      k++;
    }
    else {
      result.vmm16u(n) = 0;
    }
  }

  if (i->isZeroMasking()) {
    BX_WRITE_AVX_REGZ(i->dst(), result, len);
  }
  else {
    for (unsigned n=0; n < len; n++, opmask >>= 8)
      xmm_pblendw(&BX_READ_AVX_REG_LANE(i->dst(), n), &result.vmm128(n), opmask);

    BX_CLEAR_AVX_REGZ(i->dst(), len);
  }

  BX_NEXT_INSTR(i);
}

void BX_CPP_AttrRegparmN(1) BX_CPU_C::VEXPANDPS_MASK_VpsWpsR(bxInstruction_c *i)
{
  BxPackedAvxRegister op = BX_READ_AVX_REG(i->src()), result;

  Bit32u opmask = BX_READ_16BIT_OPMASK(i->opmask()), mask = opmask;
  unsigned len = i->getVL(), n = 0, k = 0;

  for (; n < len*4; n++, mask >>= 1) {
    if (mask & 0x1) {
      result.vmm32u(n) = op.vmm32u(k);
      k++;
    }
    else {
      result.vmm32u(n) = 0;
    }
  }

  if (i->isZeroMasking()) {
    BX_WRITE_AVX_REGZ(i->dst(), result, len);
  }
  else {
    for (unsigned n=0; n < len; n++, opmask >>= 4)
      xmm_blendps(&BX_READ_AVX_REG_LANE(i->dst(), n), &result.vmm128(n), opmask);

    BX_CLEAR_AVX_REGZ(i->dst(), len);
  }

  BX_NEXT_INSTR(i);
}

void BX_CPP_AttrRegparmN(1) BX_CPU_C::VEXPANDPD_MASK_VpdWpdR(bxInstruction_c *i)
{
  BxPackedAvxRegister op = BX_READ_AVX_REG(i->src()), result;

  Bit32u opmask = BX_READ_8BIT_OPMASK(i->opmask()), mask = opmask;
  unsigned len = i->getVL(), n = 0, k = 0;

  for (; n < len*2; n++, mask >>= 1) {
    if (mask & 0x1) {
      result.vmm64u(n) = op.vmm64u(k);
      k++;
    }
    else {
      result.vmm64u(n) = 0;
    }
  }

  if (i->isZeroMasking()) {
    BX_WRITE_AVX_REGZ(i->dst(), result, len);
  }
  else {
    for (unsigned n=0; n < len; n++, opmask >>= 2)
      xmm_blendpd(&BX_READ_AVX_REG_LANE(i->dst(), n), &result.vmm128(n), opmask);

    BX_CLEAR_AVX_REGZ(i->dst(), len);
  }

  BX_NEXT_INSTR(i);
}

void BX_CPP_AttrRegparmN(1) BX_CPU_C::VPCOMPRESSB_MASK_WdqVdq(bxInstruction_c *i)
{
  BxPackedAvxRegister op = BX_READ_AVX_REG(i->src()), result;

  Bit64u opmask = BX_READ_OPMASK(i->opmask());
  unsigned len = i->getVL(), n = 0, k = 0;

  for (; n < len*16; n++, opmask >>= 1) {
    if (opmask & 0x1) {
      result.vmmubyte(k) = op.vmmubyte(n);
      k++;
    }
    if (! opmask) break;
  }

  Bit64u writemask = (BX_CONST64(1) << k) - 1;

  if (i->modC0()) {
    avx512_write_regb_masked(i, &result, len, writemask);
  }
  else {
    bx_address eaddr = BX_CPU_RESOLVE_ADDR(i);
    avx_masked_store8(i, eaddr, &result, writemask);
  }

  BX_NEXT_INSTR(i);
}

void BX_CPP_AttrRegparmN(1) BX_CPU_C::VPCOMPRESSW_MASK_WdqVdq(bxInstruction_c *i)
{
  BxPackedAvxRegister op = BX_READ_AVX_REG(i->src()), result;

  Bit32u opmask = BX_READ_32BIT_OPMASK(i->opmask());
  unsigned len = i->getVL(), n = 0, k = 0;

  for (; n < len*8; n++, opmask >>= 1) {
    if (opmask & 0x1) {
      result.vmm16u(k) = op.vmm16u(n);
      k++;
    }
    if (! opmask) break;
  }

  Bit32u writemask = (1 << k) - 1;

  if (i->modC0()) {
    avx512_write_regw_masked(i, &result, len, writemask);
  }
  else {
    bx_address eaddr = BX_CPU_RESOLVE_ADDR(i);
    avx_masked_store16(i, eaddr, &result, writemask);
  }

  BX_NEXT_INSTR(i);
}

void BX_CPP_AttrRegparmN(1) BX_CPU_C::VCOMPRESSPS_MASK_WpsVps(bxInstruction_c *i)
{
  BxPackedAvxRegister op = BX_READ_AVX_REG(i->src()), result;

  Bit32u opmask = BX_READ_16BIT_OPMASK(i->opmask());
  unsigned len = i->getVL(), n = 0, k = 0;

  for (; n < len*4; n++, opmask >>= 1) {
    if (opmask & 0x1) {
      result.vmm32u(k) = op.vmm32u(n);
      k++;
    }
    if (! opmask) break;
  }

  Bit32u writemask = (1 << k) - 1;

  if (i->modC0()) {
    avx512_write_regd_masked(i, &result, len, writemask);
  }
  else {
    bx_address eaddr = BX_CPU_RESOLVE_ADDR(i);
    avx_masked_store32(i, eaddr, &result, writemask);
  }

  BX_NEXT_INSTR(i);
}

void BX_CPP_AttrRegparmN(1) BX_CPU_C::VCOMPRESSPD_MASK_WpdVpd(bxInstruction_c *i)
{
  BxPackedAvxRegister op = BX_READ_AVX_REG(i->src()), result;

  Bit32u opmask = BX_READ_8BIT_OPMASK(i->opmask());
  unsigned len = i->getVL(), n = 0, k = 0;

  for (; n < len*2; n++, opmask >>= 1) {
    if (opmask & 0x1) {
      result.vmm64u(k) = op.vmm64u(n);
      k++;
    }
    if (! opmask) break;
  }

  Bit32u writemask = (1 << k) - 1;

  if (i->modC0()) {
    avx512_write_regq_masked(i, &result, len, writemask);
  }
  else {
    bx_address eaddr = BX_CPU_RESOLVE_ADDR(i);
    avx_masked_store64(i, eaddr, &result, writemask);
  }

  BX_NEXT_INSTR(i);
}

// convert mask

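// VPMOVM2{B,W,D,Q} broadcast each opmask bit into an all-ones or all-zeros
// element; VPMOV{B,W,D,Q}2M collect the element sign bits back into an opmask.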
void BX_CPP_AttrRegparmN(1) BX_CPU_C::VPMOVM2B_VdqKEqR(bxInstruction_c *i)
{
  Bit64u opmask = BX_READ_OPMASK(i->src());
  unsigned len = i->getVL();

  for (unsigned n=0; n<len; n++) {
    xmm_pmovm2b(&BX_READ_AVX_REG_LANE(i->dst(), n), (Bit32u) opmask);
    opmask >>= 16;
  }

  BX_CLEAR_AVX_REGZ(i->dst(), len);
  BX_NEXT_INSTR(i);
}

void BX_CPP_AttrRegparmN(1) BX_CPU_C::VPMOVM2W_VdqKEdR(bxInstruction_c *i)
{
  Bit32u opmask = BX_READ_32BIT_OPMASK(i->src());
  unsigned len = i->getVL();

  for (unsigned n=0; n<len; n++) {
    xmm_pmovm2w(&BX_READ_AVX_REG_LANE(i->dst(), n), opmask);
    opmask >>= 8;
  }

  BX_CLEAR_AVX_REGZ(i->dst(), len);
  BX_NEXT_INSTR(i);
}

void BX_CPP_AttrRegparmN(1) BX_CPU_C::VPMOVM2D_VdqKEwR(bxInstruction_c *i)
{
  Bit32u opmask = (Bit32u) BX_READ_16BIT_OPMASK(i->src());
  unsigned len = i->getVL();

  for (unsigned n=0; n<len; n++) {
    xmm_pmovm2d(&BX_READ_AVX_REG_LANE(i->dst(), n), opmask);
    opmask >>= 4;
  }

  BX_CLEAR_AVX_REGZ(i->dst(), len);
  BX_NEXT_INSTR(i);
}

void BX_CPP_AttrRegparmN(1) BX_CPU_C::VPMOVM2Q_VdqKEbR(bxInstruction_c *i)
{
  Bit32u opmask = (Bit32u) BX_READ_8BIT_OPMASK(i->src());
  unsigned len = i->getVL();

  for (unsigned n=0; n<len; n++) {
    xmm_pmovm2q(&BX_READ_AVX_REG_LANE(i->dst(), n), opmask);
    opmask >>= 2;
  }

  BX_CLEAR_AVX_REGZ(i->dst(), len);
  BX_NEXT_INSTR(i);
}

void BX_CPP_AttrRegparmN(1) BX_CPU_C::VPMOVB2M_KGqWdqR(bxInstruction_c *i)
{
  BxPackedAvxRegister op = BX_READ_AVX_REG(i->src());
  unsigned len = i->getVL();
  Bit64u mask = 0;

  for (unsigned n=0; n<len; n++) {
    mask |= ((Bit64u) xmm_pmovmskb(&op.vmm128(n))) << (16*n);
  }

  BX_WRITE_OPMASK(i->dst(), mask);
  BX_NEXT_INSTR(i);
}

void BX_CPP_AttrRegparmN(1) BX_CPU_C::VPMOVW2M_KGdWdqR(bxInstruction_c *i)
{
  BxPackedAvxRegister op = BX_READ_AVX_REG(i->src());
  unsigned len = i->getVL();
  Bit32u mask = 0;

  for (unsigned n=0; n<len; n++) {
    mask |= xmm_pmovmskw(&op.vmm128(n)) << (8*n);
  }

  BX_WRITE_OPMASK(i->dst(), mask);
  BX_NEXT_INSTR(i);
}

void BX_CPP_AttrRegparmN(1) BX_CPU_C::VPMOVD2M_KGwWdqR(bxInstruction_c *i)
{
  BxPackedAvxRegister op = BX_READ_AVX_REG(i->src());
  unsigned len = i->getVL();
  Bit32u mask = 0;

  for (unsigned n=0; n<len; n++) {
    mask |= xmm_pmovmskd(&op.vmm128(n)) << (4*n);
  }

  BX_WRITE_OPMASK(i->dst(), mask);
  BX_NEXT_INSTR(i);
}

void BX_CPP_AttrRegparmN(1) BX_CPU_C::VPMOVQ2M_KGbWdqR(bxInstruction_c *i)
{
  BxPackedAvxRegister op = BX_READ_AVX_REG(i->src());
  unsigned len = i->getVL();
  Bit32u mask = 0;

  for (unsigned n=0; n<len; n++) {
    mask |= xmm_pmovmskq(&op.vmm128(n)) << (2*n);
  }

  BX_WRITE_OPMASK(i->dst(), mask);
  BX_NEXT_INSTR(i);
}

// sad (sum of absolute differences)

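// VDBPSADBW: the dwords of src2 are first shuffled by imm8, then each word of
// the result accumulates sums of absolute byte differences against src1.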
void BX_CPP_AttrRegparmN(1) BX_CPU_C::VDBPSADBW_MASK_VdqHdqWdqIbR(bxInstruction_c *i)
{
  BxPackedAvxRegister op1 = BX_READ_AVX_REG(i->src1()), op2 = BX_READ_AVX_REG(i->src2()), dst;

  Bit32u opmask = i->opmask() ? BX_READ_32BIT_OPMASK(i->opmask()) : (Bit32u) -1;
  unsigned len = i->getVL();

  for (unsigned n=0; n < len; n++) {
    BxPackedXmmRegister tmp;
    xmm_shufps(&tmp, &op2.vmm128(n), &op2.vmm128(n), i->Ib());
    xmm_dbpsadbw(&dst.vmm128(n), &op1.vmm128(n), &tmp);
  }

  avx512_write_regw_masked(i, &dst, len, opmask);

  BX_NEXT_INSTR(i);
}

// multishift (VBMI)

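// Each control byte selects a bit offset (0..63); the corresponding result byte
// is the low byte of the source qword rotated right by that many bits.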
BX_CPP_INLINE Bit64u pmultishiftqb_scalar(Bit64u val_64, Bit64u control)
{
  // use packed register as 64-bit value with convenient accessors
  BxPackedRegister result;

  for (unsigned n=0; n < 8; n++, control >>= 8) {
    unsigned ctrl = (control & 0x3f);
    Bit64u tmp = val_64;
    if (ctrl != 0)
      tmp = (val_64 << (64 - ctrl)) | (val_64 >> ctrl);
    result.ubyte(n) = tmp & 0xff;
  }

  return MMXUQ(result);
}

void BX_CPP_AttrRegparmN(1) BX_CPU_C::VPMULTISHIFTQB_VdqHdqWdqR(bxInstruction_c *i)
{
  BxPackedAvxRegister op1 = BX_READ_AVX_REG(i->src1()), op2 = BX_READ_AVX_REG(i->src2());
  unsigned len = i->getVL();

  for (unsigned n=0; n < QWORD_ELEMENTS(len); n++) {
    op1.vmm64u(n) = pmultishiftqb_scalar(op2.vmm64u(n), op1.vmm64u(n));
  }

  BX_WRITE_AVX_REGZ(i->dst(), op1, len);
  BX_NEXT_INSTR(i);
}

void BX_CPP_AttrRegparmN(1) BX_CPU_C::VPMULTISHIFTQB_MASK_VdqHdqWdqR(bxInstruction_c *i)
{
  BxPackedAvxRegister op1 = BX_READ_AVX_REG(i->src1()), op2 = BX_READ_AVX_REG(i->src2()), dst = BX_READ_AVX_REG(i->dst());
  Bit32u mask = BX_READ_8BIT_OPMASK(i->opmask());
  unsigned len = i->getVL();

  for (unsigned n=0, tmp_mask = mask; n < QWORD_ELEMENTS(len); n++, tmp_mask >>= 1) {
    if (tmp_mask & 0x1)
      dst.vmm64u(n) = pmultishiftqb_scalar(op2.vmm64u(n), op1.vmm64u(n));
    else if (i->isZeroMasking())
      dst.vmm64u(n) = 0;
  }

  BX_WRITE_AVX_REGZ(i->dst(), dst, len);
  BX_NEXT_INSTR(i);
}

// 52-bit integer FMA

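// VPMADD52LUQ/VPMADD52HUQ multiply the low 52 bits of each source qword pair and
// accumulate the low/high 52 bits of the 104-bit product into the destination.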
BX_CPP_INLINE Bit64u pmadd52luq_scalar(Bit64u dst, Bit64u op1, Bit64u op2)
{
  op1 &= BX_CONST64(0x000fffffffffffff);
  op2 &= BX_CONST64(0x000fffffffffffff);

  return dst + ((op1 * op2) & BX_CONST64(0x000fffffffffffff));
}

BX_CPP_INLINE Bit64u pmadd52huq_scalar(Bit64u dst, Bit64u op1, Bit64u op2)
{
  op1 &= BX_CONST64(0x000fffffffffffff);
  op2 &= BX_CONST64(0x000fffffffffffff);

  Bit128u product_128;
  long_mul(&product_128, op1, op2);

  Bit64u temp = (product_128.lo >> 52) | ((product_128.hi & BX_CONST64(0x000000ffffffffff)) << 12);

  return dst + temp;
}

void BX_CPP_AttrRegparmN(1) BX_CPU_C::VPMADD52LUQ_VdqHdqWdqR(bxInstruction_c *i)
{
  BxPackedAvxRegister op1 = BX_READ_AVX_REG(i->src1()), op2 = BX_READ_AVX_REG(i->src2()), dst = BX_READ_AVX_REG(i->dst());
  unsigned len = i->getVL();

  for (unsigned n=0; n < QWORD_ELEMENTS(len); n++) {
    dst.vmm64u(n) = pmadd52luq_scalar(dst.vmm64u(n), op1.vmm64u(n), op2.vmm64u(n));
  }

  BX_WRITE_AVX_REGZ(i->dst(), dst, len);
  BX_NEXT_INSTR(i);
}

void BX_CPP_AttrRegparmN(1) BX_CPU_C::VPMADD52LUQ_MASK_VdqHdqWdqR(bxInstruction_c *i)
{
  BxPackedAvxRegister op1 = BX_READ_AVX_REG(i->src1()), op2 = BX_READ_AVX_REG(i->src2()), dst = BX_READ_AVX_REG(i->dst());
  Bit32u mask = BX_READ_8BIT_OPMASK(i->opmask());
  unsigned len = i->getVL();

  for (unsigned n=0, tmp_mask = mask; n < QWORD_ELEMENTS(len); n++, tmp_mask >>= 1) {
    if (tmp_mask & 0x1)
      dst.vmm64u(n) = pmadd52luq_scalar(dst.vmm64u(n), op1.vmm64u(n), op2.vmm64u(n));
    else if (i->isZeroMasking())
      dst.vmm64u(n) = 0;
  }

  BX_WRITE_AVX_REGZ(i->dst(), dst, len);
  BX_NEXT_INSTR(i);
}

void BX_CPP_AttrRegparmN(1) BX_CPU_C::VPMADD52HUQ_VdqHdqWdqR(bxInstruction_c *i)
{
  BxPackedAvxRegister op1 = BX_READ_AVX_REG(i->src1()), op2 = BX_READ_AVX_REG(i->src2()), dst = BX_READ_AVX_REG(i->dst());
  unsigned len = i->getVL();

  for (unsigned n=0; n < QWORD_ELEMENTS(len); n++) {
    dst.vmm64u(n) = pmadd52huq_scalar(dst.vmm64u(n), op1.vmm64u(n), op2.vmm64u(n));
  }

  BX_WRITE_AVX_REGZ(i->dst(), dst, len);
  BX_NEXT_INSTR(i);
}

void BX_CPP_AttrRegparmN(1) BX_CPU_C::VPMADD52HUQ_MASK_VdqHdqWdqR(bxInstruction_c *i)
{
  BxPackedAvxRegister op1 = BX_READ_AVX_REG(i->src1()), op2 = BX_READ_AVX_REG(i->src2()), dst = BX_READ_AVX_REG(i->dst());
  Bit32u mask = BX_READ_8BIT_OPMASK(i->opmask());
  unsigned len = i->getVL();

  for (unsigned n=0, tmp_mask = mask; n < QWORD_ELEMENTS(len); n++, tmp_mask >>= 1) {
    if (tmp_mask & 0x1)
      dst.vmm64u(n) = pmadd52huq_scalar(dst.vmm64u(n), op1.vmm64u(n), op2.vmm64u(n));
    else if (i->isZeroMasking())
      dst.vmm64u(n) = 0;
  }

  BX_WRITE_AVX_REGZ(i->dst(), dst, len);
  BX_NEXT_INSTR(i);
}

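// VP2INTERSECT writes an even/odd opmask register pair: whenever an element of
// src1 equals an element of src2, the bit for the src1 position is set in the
// first mask and the bit for the src2 position is set in the second mask.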
void BX_CPP_AttrRegparmN(1) BX_CPU_C::VP2INTERSECTD_KGqHdqWdqR(bxInstruction_c *i)
{
  BxPackedAvxRegister op1 = BX_READ_AVX_REG(i->src1()), op2 = BX_READ_AVX_REG(i->src2());
  Bit64u mask1 = 0, mask2 = 0;
  unsigned len = i->getVL();

  for (unsigned n=0;n < DWORD_ELEMENTS(len); n++) {
    for (unsigned m=0;m < DWORD_ELEMENTS(len); m++) {
      if (op1.vmm32u(n) == op2.vmm32u(m)) {
        mask1 |= 1<<n;
        mask2 |= 1<<m;
      }
    }
  }

  unsigned mask_base = i->dst() & ~1;
  BX_WRITE_OPMASK(mask_base, mask1);
  BX_WRITE_OPMASK(mask_base+1, mask2);

  BX_NEXT_INSTR(i);
}

void BX_CPP_AttrRegparmN(1) BX_CPU_C::VP2INTERSECTQ_KGqHdqWdqR(bxInstruction_c *i)
{
  BxPackedAvxRegister op1 = BX_READ_AVX_REG(i->src1()), op2 = BX_READ_AVX_REG(i->src2());
  Bit64u mask1 = 0, mask2 = 0;
  unsigned len = i->getVL();

  for (unsigned n=0;n < QWORD_ELEMENTS(len); n++) {
    for (unsigned m=0;m < QWORD_ELEMENTS(len); m++) {
      if (op1.vmm64u(n) == op2.vmm64u(m)) {
        mask1 |= 1<<n;
        mask2 |= 1<<m;
      }
    }
  }

  unsigned mask_base = i->dst() & ~1;
  BX_WRITE_OPMASK(mask_base, mask1);
  BX_WRITE_OPMASK(mask_base+1, mask2);

  BX_NEXT_INSTR(i);
}

#endif