/////////////////////////////////////////////////////////////////////////
// $Id: avx512.cc 13853 2020-05-19 16:01:23Z sshwarts $
/////////////////////////////////////////////////////////////////////////
//
//   Copyright (c) 2013-2019 Stanislav Shwartsman
//          Written by Stanislav Shwartsman [sshwarts at sourceforge net]
//
//  This library is free software; you can redistribute it and/or
//  modify it under the terms of the GNU Lesser General Public
//  License as published by the Free Software Foundation; either
//  version 2 of the License, or (at your option) any later version.
//
//  This library is distributed in the hope that it will be useful,
//  but WITHOUT ANY WARRANTY; without even the implied warranty of
//  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
//  Lesser General Public License for more details.
//
//  You should have received a copy of the GNU Lesser General Public
//  License along with this library; if not, write to the Free Software
//  Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
//
/////////////////////////////////////////////////////////////////////////
23 
24 #define NEED_CPU_REG_SHORTCUTS 1
25 #include "bochs.h"
26 #include "cpu.h"
27 #define LOG_THIS BX_CPU_THIS_PTR
28 
29 #if BX_SUPPORT_EVEX
30 
31 #include "simd_int.h"
32 #include "simd_compare.h"
33 #include "wide_int.h"
34 
35 // compare
36 
37 typedef Bit32u (*avx512_compare_method)(const BxPackedXmmRegister *op1, const BxPackedXmmRegister *op2);
38 
39 static avx512_compare_method avx512_compare8[8] = {
40   xmm_pcmpeqb_mask,     // equal
41   xmm_pcmpltb_mask,     // less than
42   xmm_pcmpleb_mask,     // less or equal than
43   xmm_pcmpfalse_mask,   // false
44   xmm_pcmpneb_mask,     // not equal
45   xmm_pcmpgeb_mask,     // not less than => greater or equal than
46   xmm_pcmpgtb_mask,     // not less or equal than => greater than
47   xmm_pcmptrueb_mask    // true
48 };
49 
50 static avx512_compare_method avx512_compare16[8] = {
51   xmm_pcmpeqw_mask,     // equal
52   xmm_pcmpltw_mask,     // less than
53   xmm_pcmplew_mask,     // less or equal than
54   xmm_pcmpfalse_mask,   // false
55   xmm_pcmpnew_mask,     // not equal
56   xmm_pcmpgew_mask,     // not less than => greater or equal than
57   xmm_pcmpgtw_mask,     // not less or equal than => greater than
58   xmm_pcmptruew_mask    // true
59 };
60 
61 static avx512_compare_method avx512_compare32[8] = {
62   xmm_pcmpeqd_mask,     // equal
63   xmm_pcmpltd_mask,     // less than
64   xmm_pcmpled_mask,     // less or equal than
65   xmm_pcmpfalse_mask,   // false
66   xmm_pcmpned_mask,     // not equal
67   xmm_pcmpged_mask,     // not less than => greater or equal than
68   xmm_pcmpgtd_mask,     // not less or equal than => greater than
69   xmm_pcmptrued_mask    // true
70 };
71 
72 static avx512_compare_method avx512_compare64[8] = {
73   xmm_pcmpeqq_mask,     // equal
74   xmm_pcmpltq_mask,     // less than
75   xmm_pcmpleq_mask,     // less or equal than
76   xmm_pcmpfalse_mask,   // false
77   xmm_pcmpneq_mask,     // not equal
78   xmm_pcmpgeq_mask,     // not less than => greater or equal than
79   xmm_pcmpgtq_mask,     // not less or equal than => greater than
80   xmm_pcmptrueq_mask    // true
81 };
82 
83 static avx512_compare_method avx512_compare8u[8] = {
84   xmm_pcmpeqb_mask,     // equal
85   xmm_pcmpltub_mask,    // less than
86   xmm_pcmpleub_mask,    // less or equal than
87   xmm_pcmpfalse_mask,   // false
88   xmm_pcmpneb_mask,     // not equal
89   xmm_pcmpgeub_mask,    // not less than => greater or equal than
90   xmm_pcmpgtub_mask,    // not less or equal than => greater than
91   xmm_pcmptrueb_mask    // true
92 };
93 
94 static avx512_compare_method avx512_compare16u[8] = {
95   xmm_pcmpeqw_mask,     // equal
96   xmm_pcmpltuw_mask,    // less than
97   xmm_pcmpleuw_mask,    // less or equal than
98   xmm_pcmpfalse_mask,   // false
99   xmm_pcmpnew_mask,     // not equal
100   xmm_pcmpgeuw_mask,    // not less than => greater or equal than
101   xmm_pcmpgtuw_mask,    // not less or equal than => greater than
102   xmm_pcmptruew_mask    // true
103 };
104 
105 static avx512_compare_method avx512_compare32u[8] = {
106   xmm_pcmpeqd_mask,     // equal
107   xmm_pcmpltud_mask,    // less than
108   xmm_pcmpleud_mask,    // less or equal than
109   xmm_pcmpfalse_mask,   // false
110   xmm_pcmpned_mask,     // not equal
111   xmm_pcmpgeud_mask,    // not less than => greater or equal than
112   xmm_pcmpgtud_mask,    // not less or equal than => greater than
113   xmm_pcmptrued_mask    // true
114 };
115 
116 static avx512_compare_method avx512_compare64u[8] = {
117   xmm_pcmpeqq_mask,     // equal
118   xmm_pcmpltuq_mask,    // less than
119   xmm_pcmpleuq_mask,    // less or equal than
120   xmm_pcmpfalse_mask,   // false
121   xmm_pcmpneq_mask,     // not equal
122   xmm_pcmpgeuq_mask,    // not less than => greater or equal than
123   xmm_pcmpgtuq_mask,    // not less or equal than => greater than
124   xmm_pcmptrueq_mask    // true
125 };
126 
VPCMPB_MASK_KGqHdqWdqIbR(bxInstruction_c * i)127 void BX_CPP_AttrRegparmN(1) BX_CPU_C::VPCMPB_MASK_KGqHdqWdqIbR(bxInstruction_c *i)
128 {
129   BxPackedAvxRegister op1 = BX_READ_AVX_REG(i->src1()), op2 = BX_READ_AVX_REG(i->src2());
130   unsigned len = i->getVL();
131   unsigned ib = i->Ib() & 7;
132 
133   Bit64u result = 0;
134   for (int n=len-1; n >= 0; n--) {
135     result <<= 16;
136     result |= avx512_compare8[ib](&op1.vmm128(n), &op2.vmm128(n));
137   }
138 
139   if (i->opmask())
140     result &= BX_READ_OPMASK(i->opmask());
141 
142   BX_WRITE_OPMASK(i->dst(), result);
143   BX_NEXT_INSTR(i);
144 }
145 
VPCMPUB_MASK_KGqHdqWdqIbR(bxInstruction_c * i)146 void BX_CPP_AttrRegparmN(1) BX_CPU_C::VPCMPUB_MASK_KGqHdqWdqIbR(bxInstruction_c *i)
147 {
148   BxPackedAvxRegister op1 = BX_READ_AVX_REG(i->src1()), op2 = BX_READ_AVX_REG(i->src2());
149   unsigned len = i->getVL();
150   unsigned ib = i->Ib() & 7;
151 
152   Bit64u result = 0;
153   for (int n=len-1; n >= 0; n--) {
154     result <<= 16;
155     result |= avx512_compare8u[ib](&op1.vmm128(n), &op2.vmm128(n));
156   }
157 
158   if (i->opmask())
159     result &= BX_READ_OPMASK(i->opmask());
160 
161   BX_WRITE_OPMASK(i->dst(), result);
162   BX_NEXT_INSTR(i);
163 }
164 
VPCMPW_MASK_KGdHdqWdqIbR(bxInstruction_c * i)165 void BX_CPP_AttrRegparmN(1) BX_CPU_C::VPCMPW_MASK_KGdHdqWdqIbR(bxInstruction_c *i)
166 {
167   BxPackedAvxRegister op1 = BX_READ_AVX_REG(i->src1()), op2 = BX_READ_AVX_REG(i->src2());
168   unsigned len = i->getVL();
169   unsigned ib = i->Ib() & 7;
170 
171   Bit32u result = 0;
172   for (int n=len-1; n >= 0; n--) {
173     result <<= 8;
174     result |= avx512_compare16[ib](&op1.vmm128(n), &op2.vmm128(n));
175   }
176 
177   if (i->opmask())
178     result &= (Bit32u) BX_READ_32BIT_OPMASK(i->opmask());
179 
180   BX_WRITE_OPMASK(i->dst(), result);
181   BX_NEXT_INSTR(i);
182 }
183 
VPCMPUW_MASK_KGdHdqWdqIbR(bxInstruction_c * i)184 void BX_CPP_AttrRegparmN(1) BX_CPU_C::VPCMPUW_MASK_KGdHdqWdqIbR(bxInstruction_c *i)
185 {
186   BxPackedAvxRegister op1 = BX_READ_AVX_REG(i->src1()), op2 = BX_READ_AVX_REG(i->src2());
187   unsigned len = i->getVL();
188   unsigned ib = i->Ib() & 7;
189 
190   Bit32u result = 0;
191   for (int n=len-1; n >= 0; n--) {
192     result <<= 8;
193     result |= avx512_compare16u[ib](&op1.vmm128(n), &op2.vmm128(n));
194   }
195 
196   if (i->opmask())
197     result &= (Bit32u) BX_READ_32BIT_OPMASK(i->opmask());
198 
199   BX_WRITE_OPMASK(i->dst(), result);
200   BX_NEXT_INSTR(i);
201 }
202 
VPCMPD_MASK_KGwHdqWdqIbR(bxInstruction_c * i)203 void BX_CPP_AttrRegparmN(1) BX_CPU_C::VPCMPD_MASK_KGwHdqWdqIbR(bxInstruction_c *i)
204 {
205   BxPackedAvxRegister op1 = BX_READ_AVX_REG(i->src1()), op2 = BX_READ_AVX_REG(i->src2());
206   unsigned len = i->getVL();
207   unsigned ib = i->Ib() & 7;
208 
209   Bit32u result = 0;
210   for (int n=len-1; n >= 0; n--) {
211     result <<= 4;
212     result |= avx512_compare32[ib](&op1.vmm128(n), &op2.vmm128(n));
213   }
214 
215   if (i->opmask())
216     result &= (Bit32u) BX_READ_16BIT_OPMASK(i->opmask());
217 
218   BX_WRITE_OPMASK(i->dst(), result);
219   BX_NEXT_INSTR(i);
220 }
221 
VPCMPUD_MASK_KGwHdqWdqIbR(bxInstruction_c * i)222 void BX_CPP_AttrRegparmN(1) BX_CPU_C::VPCMPUD_MASK_KGwHdqWdqIbR(bxInstruction_c *i)
223 {
224   BxPackedAvxRegister op1 = BX_READ_AVX_REG(i->src1()), op2 = BX_READ_AVX_REG(i->src2());
225   unsigned len = i->getVL();
226   unsigned ib = i->Ib() & 7;
227 
228   Bit32u result = 0;
229   for (int n=len-1; n >= 0; n--) {
230     result <<= 4;
231     result |= avx512_compare32u[ib](&op1.vmm128(n), &op2.vmm128(n));
232   }
233 
234   if (i->opmask())
235     result &= (Bit32u) BX_READ_16BIT_OPMASK(i->opmask());
236 
237   BX_WRITE_OPMASK(i->dst(), result);
238   BX_NEXT_INSTR(i);
239 }
240 
VPCMPQ_MASK_KGbHdqWdqIbR(bxInstruction_c * i)241 void BX_CPP_AttrRegparmN(1) BX_CPU_C::VPCMPQ_MASK_KGbHdqWdqIbR(bxInstruction_c *i)
242 {
243   BxPackedAvxRegister op1 = BX_READ_AVX_REG(i->src1()), op2 = BX_READ_AVX_REG(i->src2());
244   unsigned len = i->getVL();
245   unsigned ib = i->Ib() & 7;
246 
247   Bit32u result = 0;
248   for (int n=len-1; n >= 0; n--) {
249     result <<= 2;
250     result |= avx512_compare64[ib](&op1.vmm128(n), &op2.vmm128(n));
251   }
252 
253   if (i->opmask())
254     result &= (Bit32u) BX_READ_8BIT_OPMASK(i->opmask());
255 
256   BX_WRITE_OPMASK(i->dst(), result);
257   BX_NEXT_INSTR(i);
258 }
259 
VPCMPUQ_MASK_KGbHdqWdqIbR(bxInstruction_c * i)260 void BX_CPP_AttrRegparmN(1) BX_CPU_C::VPCMPUQ_MASK_KGbHdqWdqIbR(bxInstruction_c *i)
261 {
262   BxPackedAvxRegister op1 = BX_READ_AVX_REG(i->src1()), op2 = BX_READ_AVX_REG(i->src2());
263   unsigned len = i->getVL();
264   unsigned ib = i->Ib() & 7;
265 
266   Bit32u result = 0;
267   for (int n=len-1; n >= 0; n--) {
268     result <<= 2;
269     result |= avx512_compare64u[ib](&op1.vmm128(n), &op2.vmm128(n));
270   }
271 
272   if (i->opmask())
273     result &= (Bit32u) BX_READ_8BIT_OPMASK(i->opmask());
274 
275   BX_WRITE_OPMASK(i->dst(), result);
276   BX_NEXT_INSTR(i);
277 }
278 
279 ///////////////////////////////////////////////////////////////////////////////////////////
280 
281 #define AVX512_COMPARE_BYTE_EL(HANDLER, func)                                               \
282   /* AVX-512 compare instruction with two src operands working on BYTE elements */          \
283   void BX_CPP_AttrRegparmN(1) BX_CPU_C :: HANDLER (bxInstruction_c *i)              \
284   {                                                                                         \
285     BxPackedAvxRegister op1 = BX_READ_AVX_REG(i->src1()), op2 = BX_READ_AVX_REG(i->src2()); \
286     unsigned len = i->getVL();                                                              \
287                                                                                             \
288     Bit64u result = 0;                                                                      \
289     for (int n=len-1; n >= 0; n--) {                                                        \
290       result <<= 16;                                                                        \
291       result |= (func)(&op1.vmm128(n), &op2.vmm128(n));                                     \
292     }                                                                                       \
293                                                                                             \
294     if (i->opmask())                                                                        \
295       result &= BX_READ_OPMASK(i->opmask());                                                \
296                                                                                             \
297     BX_WRITE_OPMASK(i->dst(), result);                                                      \
298     BX_NEXT_INSTR(i);                                                                       \
299   }
300 
301 AVX512_COMPARE_BYTE_EL(VPCMPGTB_MASK_KGqHdqWdqR, xmm_pcmpgtb_mask)
302 AVX512_COMPARE_BYTE_EL(VPCMPEQB_MASK_KGqHdqWdqR, xmm_pcmpeqb_mask)
303 AVX512_COMPARE_BYTE_EL(VPTESTMB_MASK_KGqHdqWdqR, xmm_ptestmb_mask)
304 AVX512_COMPARE_BYTE_EL(VPTESTNMB_MASK_KGqHdqWdqR, xmm_ptestnmb_mask)
305 
306 #define AVX512_COMPARE_WORD_EL(HANDLER, func)                                               \
307   /* AVX-512 compare instruction with two src operands working on WORD elements */          \
308   void BX_CPP_AttrRegparmN(1) BX_CPU_C :: HANDLER (bxInstruction_c *i)              \
309   {                                                                                         \
310     BxPackedAvxRegister op1 = BX_READ_AVX_REG(i->src1()), op2 = BX_READ_AVX_REG(i->src2()); \
311     unsigned len = i->getVL();                                                              \
312                                                                                             \
313     Bit32u result = 0;                                                                      \
314     for (int n=len-1; n >= 0; n--) {                                                        \
315       result <<= 8;                                                                         \
316       result |= (func)(&op1.vmm128(n), &op2.vmm128(n));                                     \
317     }                                                                                       \
318                                                                                             \
319     if (i->opmask())                                                                        \
320       result &= (Bit32u) BX_READ_32BIT_OPMASK(i->opmask());                                 \
321                                                                                             \
322     BX_WRITE_OPMASK(i->dst(), result);                                                      \
323     BX_NEXT_INSTR(i);                                                                       \
324   }
325 
326 AVX512_COMPARE_WORD_EL(VPCMPGTW_MASK_KGdHdqWdqR, xmm_pcmpgtw_mask)
327 AVX512_COMPARE_WORD_EL(VPCMPEQW_MASK_KGdHdqWdqR, xmm_pcmpeqw_mask)
328 AVX512_COMPARE_WORD_EL(VPTESTMW_MASK_KGdHdqWdqR, xmm_ptestmw_mask)
329 AVX512_COMPARE_WORD_EL(VPTESTNMW_MASK_KGdHdqWdqR, xmm_ptestnmw_mask)
330 
331 #define AVX512_COMPARE_DWORD_EL(HANDLER, func)                                              \
332   /* AVX-512 compare instruction with two src operands working on DWORD elements */         \
333   void BX_CPP_AttrRegparmN(1) BX_CPU_C :: HANDLER (bxInstruction_c *i)              \
334   {                                                                                         \
335     BxPackedAvxRegister op1 = BX_READ_AVX_REG(i->src1()), op2 = BX_READ_AVX_REG(i->src2()); \
336     unsigned len = i->getVL();                                                              \
337                                                                                             \
338     Bit32u result = 0;                                                                      \
339     for (int n=len-1; n >= 0; n--) {                                                        \
340       result <<= 4;                                                                         \
341       result |= (func)(&op1.vmm128(n), &op2.vmm128(n));                                     \
342     }                                                                                       \
343                                                                                             \
344     if (i->opmask())                                                                        \
345       result &= (Bit32u) BX_READ_16BIT_OPMASK(i->opmask());                                 \
346                                                                                             \
347     BX_WRITE_OPMASK(i->dst(), result);                                                      \
348     BX_NEXT_INSTR(i);                                                                       \
349   }
350 
351 AVX512_COMPARE_DWORD_EL(VPCMPGTD_MASK_KGwHdqWdqR, xmm_pcmpgtd_mask)
352 AVX512_COMPARE_DWORD_EL(VPCMPEQD_MASK_KGwHdqWdqR, xmm_pcmpeqd_mask)
353 AVX512_COMPARE_DWORD_EL(VPTESTMD_MASK_KGwHdqWdqR, xmm_ptestmd_mask)
354 AVX512_COMPARE_DWORD_EL(VPTESTNMD_MASK_KGwHdqWdqR, xmm_ptestnmd_mask)
355 
356 #define AVX512_COMPARE_QWORD_EL(HANDLER, func)                                              \
357   /* AVX-512 compare instruction with two src operands working on QWORD elements */         \
358   void BX_CPP_AttrRegparmN(1) BX_CPU_C :: HANDLER (bxInstruction_c *i)              \
359   {                                                                                         \
360     BxPackedAvxRegister op1 = BX_READ_AVX_REG(i->src1()), op2 = BX_READ_AVX_REG(i->src2()); \
361     unsigned len = i->getVL();                                                              \
362                                                                                             \
363     Bit32u result = 0;                                                                      \
364     for (int n=len-1; n >= 0; n--) {                                                        \
365       result <<= 2;                                                                         \
366       result |= (func)(&op1.vmm128(n), &op2.vmm128(n));                                     \
367     }                                                                                       \
368                                                                                             \
369     if (i->opmask())                                                                        \
370       result &= (Bit32u) BX_READ_8BIT_OPMASK(i->opmask());                                  \
371                                                                                             \
372     BX_WRITE_OPMASK(i->dst(), result);                                                      \
373     BX_NEXT_INSTR(i);                                                                       \
374   }
375 
376 AVX512_COMPARE_QWORD_EL(VPCMPGTQ_MASK_KGbHdqWdqR, xmm_pcmpgtq_mask)
377 AVX512_COMPARE_QWORD_EL(VPCMPEQQ_MASK_KGbHdqWdqR, xmm_pcmpeqq_mask)
378 AVX512_COMPARE_QWORD_EL(VPTESTMQ_MASK_KGbHdqWdqR, xmm_ptestmq_mask)
379 AVX512_COMPARE_QWORD_EL(VPTESTNMQ_MASK_KGbHdqWdqR, xmm_ptestnmq_mask)
380 
381 // compute, shift and rotate
382 
383 #define AVX512_2OP_QWORD_EL(HANDLER, func)                                                  \
384   /* AVX-512 instruction with two src operands working on QWORD elements */                 \
385   void BX_CPP_AttrRegparmN(1) BX_CPU_C :: HANDLER (bxInstruction_c *i)              \
386   {                                                                                         \
387     BxPackedAvxRegister op1 = BX_READ_AVX_REG(i->src1()), op2 = BX_READ_AVX_REG(i->src2()); \
388     unsigned len = i->getVL();                                                              \
389                                                                                             \
390     for (unsigned n=0; n < len; n++)                                                        \
391       (func)(&op1.vmm128(n), &op2.vmm128(n));                                               \
392                                                                                             \
393     avx512_write_regq_masked(i, &op1, len, BX_READ_8BIT_OPMASK(i->opmask()));               \
394                                                                                             \
395     BX_NEXT_INSTR(i);                                                                       \
396   }
397 
398 AVX512_2OP_QWORD_EL(VPMULLQ_MASK_VdqHdqWdqR, xmm_pmullq)
399 AVX512_2OP_QWORD_EL(VPADDQ_MASK_VdqHdqWdqR, xmm_paddq)
400 AVX512_2OP_QWORD_EL(VPSUBQ_MASK_VdqHdqWdqR, xmm_psubq)
401 AVX512_2OP_QWORD_EL(VPANDQ_MASK_VdqHdqWdqR, xmm_andps)
402 AVX512_2OP_QWORD_EL(VPANDNQ_MASK_VdqHdqWdqR, xmm_andnps)
403 AVX512_2OP_QWORD_EL(VPORQ_MASK_VdqHdqWdqR, xmm_orps)
404 AVX512_2OP_QWORD_EL(VPXORQ_MASK_VdqHdqWdqR, xmm_xorps)
405 AVX512_2OP_QWORD_EL(VPMAXSQ_MASK_VdqHdqWdqR, xmm_pmaxsq)
406 AVX512_2OP_QWORD_EL(VPMAXUQ_MASK_VdqHdqWdqR, xmm_pmaxuq)
407 AVX512_2OP_QWORD_EL(VPMINSQ_MASK_VdqHdqWdqR, xmm_pminsq)
408 AVX512_2OP_QWORD_EL(VPMINUQ_MASK_VdqHdqWdqR, xmm_pminuq)
409 AVX512_2OP_QWORD_EL(VUNPCKLPD_MASK_VpdHpdWpdR, xmm_unpcklpd)
410 AVX512_2OP_QWORD_EL(VUNPCKHPD_MASK_VpdHpdWpdR, xmm_unpckhpd)
411 AVX512_2OP_QWORD_EL(VPMULDQ_MASK_VdqHdqWdqR, xmm_pmuldq)
412 AVX512_2OP_QWORD_EL(VPMULUDQ_MASK_VdqHdqWdqR, xmm_pmuludq)
413 AVX512_2OP_QWORD_EL(VPSRAVQ_MASK_VdqHdqWdqR, xmm_psravq)
414 AVX512_2OP_QWORD_EL(VPSRLVQ_MASK_VdqHdqWdqR, xmm_psrlvq)
415 AVX512_2OP_QWORD_EL(VPSLLVQ_MASK_VdqHdqWdqR, xmm_psllvq)
416 AVX512_2OP_QWORD_EL(VPRORVQ_MASK_VdqHdqWdqR, xmm_prorvq)
417 AVX512_2OP_QWORD_EL(VPROLVQ_MASK_VdqHdqWdqR, xmm_prolvq)
418 
419 #define AVX512_2OP_DWORD_EL(HANDLER, func)                                                  \
420   /* AVX-512 instruction with two src operands working on DWORD elements */                 \
421   void BX_CPP_AttrRegparmN(1) BX_CPU_C :: HANDLER (bxInstruction_c *i)              \
422   {                                                                                         \
423     BxPackedAvxRegister op1 = BX_READ_AVX_REG(i->src1()), op2 = BX_READ_AVX_REG(i->src2()); \
424     unsigned len = i->getVL();                                                              \
425                                                                                             \
426     for (unsigned n=0; n < len; n++)                                                        \
427       (func)(&op1.vmm128(n), &op2.vmm128(n));                                               \
428                                                                                             \
429     avx512_write_regd_masked(i, &op1, len, BX_READ_16BIT_OPMASK(i->opmask()));              \
430                                                                                             \
431     BX_NEXT_INSTR(i);                                                                       \
432   }
433 
434 AVX512_2OP_DWORD_EL(VPADDD_MASK_VdqHdqWdqR, xmm_paddd)
435 AVX512_2OP_DWORD_EL(VPSUBD_MASK_VdqHdqWdqR, xmm_psubd)
436 AVX512_2OP_DWORD_EL(VPANDD_MASK_VdqHdqWdqR, xmm_andps)
437 AVX512_2OP_DWORD_EL(VPANDND_MASK_VdqHdqWdqR, xmm_andnps)
438 AVX512_2OP_DWORD_EL(VPORD_MASK_VdqHdqWdqR, xmm_orps)
439 AVX512_2OP_DWORD_EL(VPXORD_MASK_VdqHdqWdqR, xmm_xorps)
440 AVX512_2OP_DWORD_EL(VPMAXSD_MASK_VdqHdqWdqR, xmm_pmaxsd)
441 AVX512_2OP_DWORD_EL(VPMAXUD_MASK_VdqHdqWdqR, xmm_pmaxud)
442 AVX512_2OP_DWORD_EL(VPMINSD_MASK_VdqHdqWdqR, xmm_pminsd)
443 AVX512_2OP_DWORD_EL(VPMINUD_MASK_VdqHdqWdqR, xmm_pminud)
444 AVX512_2OP_DWORD_EL(VUNPCKLPS_MASK_VpsHpsWpsR, xmm_unpcklps)
445 AVX512_2OP_DWORD_EL(VUNPCKHPS_MASK_VpsHpsWpsR, xmm_unpckhps)
446 AVX512_2OP_DWORD_EL(VPMULLD_MASK_VdqHdqWdqR, xmm_pmulld)
447 AVX512_2OP_DWORD_EL(VPSRAVD_MASK_VdqHdqWdqR, xmm_psravd)
448 AVX512_2OP_DWORD_EL(VPSRLVD_MASK_VdqHdqWdqR, xmm_psrlvd)
449 AVX512_2OP_DWORD_EL(VPSLLVD_MASK_VdqHdqWdqR, xmm_psllvd)
450 AVX512_2OP_DWORD_EL(VPRORVD_MASK_VdqHdqWdqR, xmm_prorvd)
451 AVX512_2OP_DWORD_EL(VPROLVD_MASK_VdqHdqWdqR, xmm_prolvd)
452 AVX512_2OP_DWORD_EL(VPMADDWD_MASK_VdqHdqWdqR, xmm_pmaddwd)
453 
454 #define AVX512_2OP_WORD_EL(HANDLER, func)                                                   \
455   /* AVX-512 instruction with two src operands working on WORD elements */                  \
456   void BX_CPP_AttrRegparmN(1) BX_CPU_C :: HANDLER (bxInstruction_c *i)              \
457   {                                                                                         \
458     BxPackedAvxRegister op1 = BX_READ_AVX_REG(i->src1()), op2 = BX_READ_AVX_REG(i->src2()); \
459     unsigned len = i->getVL();                                                              \
460                                                                                             \
461     for (unsigned n=0; n < len; n++)                                                        \
462       (func)(&op1.vmm128(n), &op2.vmm128(n));                                               \
463                                                                                             \
464     avx512_write_regw_masked(i, &op1, len, BX_READ_32BIT_OPMASK(i->opmask()));              \
465                                                                                             \
466     BX_NEXT_INSTR(i);                                                                       \
467   }
468 
469 AVX512_2OP_WORD_EL(VPADDW_MASK_VdqHdqWdqR, xmm_paddw)
470 AVX512_2OP_WORD_EL(VPADDSW_MASK_VdqHdqWdqR, xmm_paddsw)
471 AVX512_2OP_WORD_EL(VPADDUSW_MASK_VdqHdqWdqR, xmm_paddusw)
472 AVX512_2OP_WORD_EL(VPSUBW_MASK_VdqHdqWdqR, xmm_psubw)
473 AVX512_2OP_WORD_EL(VPSUBSW_MASK_VdqHdqWdqR, xmm_psubsw)
474 AVX512_2OP_WORD_EL(VPSUBUSW_MASK_VdqHdqWdqR, xmm_psubusw)
475 AVX512_2OP_WORD_EL(VPMINSW_MASK_VdqHdqWdqR, xmm_pminsw)
476 AVX512_2OP_WORD_EL(VPMINUW_MASK_VdqHdqWdqR, xmm_pminuw)
477 AVX512_2OP_WORD_EL(VPMAXSW_MASK_VdqHdqWdqR, xmm_pmaxsw)
478 AVX512_2OP_WORD_EL(VPMAXUW_MASK_VdqHdqWdqR, xmm_pmaxuw)
479 AVX512_2OP_WORD_EL(VPMADDUBSW_MASK_VdqHdqWdqR, xmm_pmaddubsw)
480 AVX512_2OP_WORD_EL(VPAVGW_MASK_VdqHdqWdqR, xmm_pavgw)
481 AVX512_2OP_WORD_EL(VPMULLW_MASK_VdqHdqWdqR, xmm_pmullw)
482 AVX512_2OP_WORD_EL(VPMULHW_MASK_VdqHdqWdqR, xmm_pmulhw)
483 AVX512_2OP_WORD_EL(VPMULHUW_MASK_VdqHdqWdqR, xmm_pmulhuw)
484 AVX512_2OP_WORD_EL(VPMULHRSW_MASK_VdqHdqWdqR, xmm_pmulhrsw)
485 AVX512_2OP_WORD_EL(VPACKSSDW_MASK_VdqHdqWdqR, xmm_packssdw)
486 AVX512_2OP_WORD_EL(VPACKUSDW_MASK_VdqHdqWdqR, xmm_packusdw)
487 AVX512_2OP_WORD_EL(VPUNPCKLWD_MASK_VdqHdqWdqR, xmm_punpcklwd)
488 AVX512_2OP_WORD_EL(VPUNPCKHWD_MASK_VdqHdqWdqR, xmm_punpckhwd)
489 AVX512_2OP_WORD_EL(VPSRAVW_MASK_VdqHdqWdqR, xmm_psravw)
490 AVX512_2OP_WORD_EL(VPSRLVW_MASK_VdqHdqWdqR, xmm_psrlvw)
491 AVX512_2OP_WORD_EL(VPSLLVW_MASK_VdqHdqWdqR, xmm_psllvw)
492 
493 #define AVX512_2OP_BYTE_EL(HANDLER, func)                                                   \
494   /* AVX-512 instruction with two src operands working on BYTE elements */                  \
495   void BX_CPP_AttrRegparmN(1) BX_CPU_C :: HANDLER (bxInstruction_c *i)              \
496   {                                                                                         \
497     BxPackedAvxRegister op1 = BX_READ_AVX_REG(i->src1()), op2 = BX_READ_AVX_REG(i->src2()); \
498     unsigned len = i->getVL();                                                              \
499                                                                                             \
500     for (unsigned n=0; n < len; n++)                                                        \
501       (func)(&op1.vmm128(n), &op2.vmm128(n));                                               \
502                                                                                             \
503     avx512_write_regb_masked(i, &op1, len, BX_READ_OPMASK(i->opmask()));                    \
504                                                                                             \
505     BX_NEXT_INSTR(i);                                                                       \
506   }
507 
508 AVX512_2OP_BYTE_EL(VPADDB_MASK_VdqHdqWdqR, xmm_paddb)
509 AVX512_2OP_BYTE_EL(VPADDSB_MASK_VdqHdqWdqR, xmm_paddsb)
510 AVX512_2OP_BYTE_EL(VPADDUSB_MASK_VdqHdqWdqR, xmm_paddusb)
511 AVX512_2OP_BYTE_EL(VPSUBB_MASK_VdqHdqWdqR, xmm_psubb)
512 AVX512_2OP_BYTE_EL(VPSUBSB_MASK_VdqHdqWdqR, xmm_psubsb)
513 AVX512_2OP_BYTE_EL(VPSUBUSB_MASK_VdqHdqWdqR, xmm_psubusb)
514 AVX512_2OP_BYTE_EL(VPMINSB_MASK_VdqHdqWdqR, xmm_pminsb)
515 AVX512_2OP_BYTE_EL(VPMINUB_MASK_VdqHdqWdqR, xmm_pminub)
516 AVX512_2OP_BYTE_EL(VPMAXUB_MASK_VdqHdqWdqR, xmm_pmaxub)
517 AVX512_2OP_BYTE_EL(VPMAXSB_MASK_VdqHdqWdqR, xmm_pmaxsb)
518 AVX512_2OP_BYTE_EL(VPAVGB_MASK_VdqHdqWdqR, xmm_pavgb)
519 AVX512_2OP_BYTE_EL(VPACKSSWB_MASK_VdqHdqWdqR, xmm_packsswb)
520 AVX512_2OP_BYTE_EL(VPACKUSWB_MASK_VdqHdqWdqR, xmm_packuswb)
521 AVX512_2OP_BYTE_EL(VPUNPCKLBW_MASK_VdqHdqWdqR, xmm_punpcklbw)
522 AVX512_2OP_BYTE_EL(VPUNPCKHBW_MASK_VdqHdqWdqR, xmm_punpckhbw)
523 
524 #define AVX512_PSHIFT_WORD_EL(HANDLER, func)                                  \
525   void BX_CPP_AttrRegparmN(1) BX_CPU_C:: HANDLER (bxInstruction_c *i) \
526   {                                                                           \
527     BxPackedAvxRegister op  = BX_READ_AVX_REG(i->src1());                     \
528     Bit64u count = BX_READ_XMM_REG_LO_QWORD(i->src2());                       \
529     unsigned len = i->getVL();                                                \
530                                                                               \
531     for (unsigned n=0; n < len; n++)                                          \
532       (func)(&op.vmm128(n), count);                                           \
533                                                                               \
534     avx512_write_regw_masked(i, &op, len, BX_READ_32BIT_OPMASK(i->opmask())); \
535                                                                               \
536     BX_NEXT_INSTR(i);                                                         \
537   }
538 
539 AVX512_PSHIFT_WORD_EL(VPSRLW_MASK_VdqHdqWdqR, xmm_psrlw);
540 AVX512_PSHIFT_WORD_EL(VPSRAW_MASK_VdqHdqWdqR, xmm_psraw);
541 AVX512_PSHIFT_WORD_EL(VPSLLW_MASK_VdqHdqWdqR, xmm_psllw);
542 
543 #define AVX512_PSHIFT_DWORD_EL(HANDLER, func)                                 \
544   void BX_CPP_AttrRegparmN(1) BX_CPU_C:: HANDLER (bxInstruction_c *i) \
545   {                                                                           \
546     BxPackedAvxRegister op  = BX_READ_AVX_REG(i->src1());                     \
547     Bit64u count = BX_READ_XMM_REG_LO_QWORD(i->src2());                       \
548     unsigned len = i->getVL();                                                \
549                                                                               \
550     for (unsigned n=0; n < len; n++)                                          \
551       (func)(&op.vmm128(n), count);                                           \
552                                                                               \
553     avx512_write_regd_masked(i, &op, len, BX_READ_16BIT_OPMASK(i->opmask())); \
554                                                                               \
555     BX_NEXT_INSTR(i);                                                         \
556   }
557 
558 AVX512_PSHIFT_DWORD_EL(VPSRLD_MASK_VdqHdqWdqR, xmm_psrld);
559 AVX512_PSHIFT_DWORD_EL(VPSRAD_MASK_VdqHdqWdqR, xmm_psrad);
560 AVX512_PSHIFT_DWORD_EL(VPSLLD_MASK_VdqHdqWdqR, xmm_pslld);
561 
// Emits a masked AVX-512 qword shift handler: 'func' shifts each 128-bit lane
// of src1 by the 64-bit count taken from the low qword of src2 (an xmm reg),
// then the result is written back with qword-granular merging/zeroing under
// the low 8 bits of the opmask register.
// NOTE: no comments inside the macro body — '//' would swallow the
// line-continuation backslashes.
#define AVX512_PSHIFT_QWORD_EL(HANDLER, func)                                 \
  void BX_CPP_AttrRegparmN(1) BX_CPU_C:: HANDLER (bxInstruction_c *i) \
  {                                                                           \
    BxPackedAvxRegister op  = BX_READ_AVX_REG(i->src1());                     \
    Bit64u count = BX_READ_XMM_REG_LO_QWORD(i->src2());                       \
    unsigned len = i->getVL();                                                \
                                                                              \
    for (unsigned n=0; n < len; n++)                                          \
      (func)(&op.vmm128(n), count);                                           \
                                                                              \
    avx512_write_regq_masked(i, &op, len, BX_READ_8BIT_OPMASK(i->opmask()));  \
                                                                              \
    BX_NEXT_INSTR(i);                                                         \
  }

// Qword-element variants: logical right, arithmetic right and logical left.
AVX512_PSHIFT_QWORD_EL(VPSRLQ_MASK_VdqHdqWdqR, xmm_psrlq);
AVX512_PSHIFT_QWORD_EL(VPSRAQ_MASK_VdqHdqWdqR, xmm_psraq);
AVX512_PSHIFT_QWORD_EL(VPSLLQ_MASK_VdqHdqWdqR, xmm_psllq);
580 
// Emits a masked AVX-512 word shift-by-immediate handler: 'func' shifts each
// 128-bit lane of src by imm8, then the result is written back with
// word-granular masking under the low 32 bits of the opmask register.
// NOTE: no comments inside the macro body — '//' would swallow the
// line-continuation backslashes.
#define AVX512_PSHIFT_IMM_WORD_EL(HANDLER, func)                              \
  /* AVX packed shift with imm8 instruction */                                \
  void BX_CPP_AttrRegparmN(1) BX_CPU_C:: HANDLER (bxInstruction_c *i) \
  {                                                                           \
    BxPackedAvxRegister op  = BX_READ_AVX_REG(i->src());                      \
    unsigned len = i->getVL();                                                \
                                                                              \
    for (unsigned n=0; n < len; n++)                                          \
      (func)(&op.vmm128(n), i->Ib());                                         \
                                                                              \
    avx512_write_regw_masked(i, &op, len, BX_READ_32BIT_OPMASK(i->opmask())); \
                                                                              \
    BX_NEXT_INSTR(i);                                                         \
  }

// Word-element imm8 variants: logical right, arithmetic right, logical left.
AVX512_PSHIFT_IMM_WORD_EL(VPSRLW_MASK_UdqIb, xmm_psrlw);
AVX512_PSHIFT_IMM_WORD_EL(VPSRAW_MASK_UdqIb, xmm_psraw);
AVX512_PSHIFT_IMM_WORD_EL(VPSLLW_MASK_UdqIb, xmm_psllw);
599 
// Emits a masked AVX-512 dword shift/rotate-by-immediate handler: 'func'
// operates on each 128-bit lane of src with imm8, result written back with
// dword-granular masking under the low 16 bits of the opmask register.
// NOTE: no comments inside the macro body — '//' would swallow the
// line-continuation backslashes.
#define AVX512_PSHIFT_IMM_DWORD_EL(HANDLER, func)                             \
  /* AVX packed shift with imm8 instruction */                                \
  void BX_CPP_AttrRegparmN(1) BX_CPU_C:: HANDLER (bxInstruction_c *i) \
  {                                                                           \
    BxPackedAvxRegister op  = BX_READ_AVX_REG(i->src());                      \
    unsigned len = i->getVL();                                                \
                                                                              \
    for (unsigned n=0; n < len; n++)                                          \
      (func)(&op.vmm128(n), i->Ib());                                         \
                                                                              \
    avx512_write_regd_masked(i, &op, len, BX_READ_16BIT_OPMASK(i->opmask())); \
                                                                              \
    BX_NEXT_INSTR(i);                                                         \
  }

// Dword-element imm8 variants: shifts plus the AVX-512 rotate instructions.
AVX512_PSHIFT_IMM_DWORD_EL(VPSRLD_MASK_UdqIb, xmm_psrld);
AVX512_PSHIFT_IMM_DWORD_EL(VPSRAD_MASK_UdqIb, xmm_psrad);
AVX512_PSHIFT_IMM_DWORD_EL(VPSLLD_MASK_UdqIb, xmm_pslld);
AVX512_PSHIFT_IMM_DWORD_EL(VPRORD_MASK_UdqIb, xmm_prord);
AVX512_PSHIFT_IMM_DWORD_EL(VPROLD_MASK_UdqIb, xmm_prold);
620 
// Emits a masked AVX-512 qword shift/rotate-by-immediate handler: 'func'
// operates on each 128-bit lane of src with imm8, result written back with
// qword-granular masking under the low 8 bits of the opmask register.
// NOTE: no comments inside the macro body — '//' would swallow the
// line-continuation backslashes.
#define AVX512_PSHIFT_IMM_QWORD_EL(HANDLER, func)                             \
  /* AVX packed shift with imm8 instruction */                                \
  void BX_CPP_AttrRegparmN(1) BX_CPU_C:: HANDLER (bxInstruction_c *i) \
  {                                                                           \
    BxPackedAvxRegister op  = BX_READ_AVX_REG(i->src());                      \
    unsigned len = i->getVL();                                                \
                                                                              \
    for (unsigned n=0; n < len; n++)                                          \
      (func)(&op.vmm128(n), i->Ib());                                         \
                                                                              \
    avx512_write_regq_masked(i, &op, len, BX_READ_8BIT_OPMASK(i->opmask()));  \
                                                                              \
    BX_NEXT_INSTR(i);                                                         \
  }

// Qword-element imm8 variants: shifts plus the AVX-512 rotate instructions.
AVX512_PSHIFT_IMM_QWORD_EL(VPSRLQ_MASK_UdqIb, xmm_psrlq);
AVX512_PSHIFT_IMM_QWORD_EL(VPSRAQ_MASK_UdqIb, xmm_psraq);
AVX512_PSHIFT_IMM_QWORD_EL(VPSLLQ_MASK_UdqIb, xmm_psllq);
AVX512_PSHIFT_IMM_QWORD_EL(VPRORQ_MASK_UdqIb, xmm_prorq);
AVX512_PSHIFT_IMM_QWORD_EL(VPROLQ_MASK_UdqIb, xmm_prolq);
641 
642 // concatenate and shift
643 
VPSHLDW_MASK_VdqHdqWdqIbR(bxInstruction_c * i)644 void BX_CPP_AttrRegparmN(1) BX_CPU_C::VPSHLDW_MASK_VdqHdqWdqIbR(bxInstruction_c *i)
645 {
646   BxPackedAvxRegister op1 = BX_READ_AVX_REG(i->src1()), op2 = BX_READ_AVX_REG(i->src2());
647   unsigned len = i->getVL();
648   unsigned count = i->Ib() & 15;
649 
650   if (count) {
651     for (unsigned n=0; n < WORD_ELEMENTS(len); n++) {
652       op1.vmm16u(n) = (op1.vmm16u(n) << count) | (op2.vmm16u(n) >> (16 - count));
653     }
654   }
655 
656   if (i->opmask())
657     avx512_write_regw_masked(i, &op1, len, BX_READ_32BIT_OPMASK(i->opmask()));
658   else
659     BX_WRITE_AVX_REGZ(i->dst(), op1, len);
660 
661   BX_NEXT_INSTR(i);
662 }
663 
VPSHLDVW_MASK_VdqHdqWdqR(bxInstruction_c * i)664 void BX_CPP_AttrRegparmN(1) BX_CPU_C::VPSHLDVW_MASK_VdqHdqWdqR(bxInstruction_c *i)
665 {
666   BxPackedAvxRegister dst = BX_READ_AVX_REG(i->dst()), op1 = BX_READ_AVX_REG(i->src1()), op2 = BX_READ_AVX_REG(i->src2());
667   unsigned len = i->getVL();
668 
669   for (unsigned n=0; n < WORD_ELEMENTS(len); n++) {
670     unsigned count = op2.vmm16u(n) & 15;
671     if (count) {
672       dst.vmm16u(n) = (dst.vmm16u(n) << count) | (op1.vmm16u(n) >> (16 - count));
673     }
674   }
675 
676   if (i->opmask())
677     avx512_write_regw_masked(i, &dst, len, BX_READ_32BIT_OPMASK(i->opmask()));
678   else
679     BX_WRITE_AVX_REGZ(i->dst(), dst, len);
680 
681   BX_NEXT_INSTR(i);
682 }
683 
VPSHLDD_MASK_VdqHdqWdqIbR(bxInstruction_c * i)684 void BX_CPP_AttrRegparmN(1) BX_CPU_C::VPSHLDD_MASK_VdqHdqWdqIbR(bxInstruction_c *i)
685 {
686   BxPackedAvxRegister op1 = BX_READ_AVX_REG(i->src1()), op2 = BX_READ_AVX_REG(i->src2());
687   unsigned len = i->getVL();
688   unsigned count = i->Ib() & 31;
689 
690   if (count) {
691     for (unsigned n=0; n < DWORD_ELEMENTS(len); n++) {
692       op1.vmm32u(n) = (op1.vmm32u(n) << count) | (op2.vmm32u(n) >> (32 - count));
693     }
694   }
695 
696   if (i->opmask())
697     avx512_write_regd_masked(i, &op1, len, BX_READ_16BIT_OPMASK(i->opmask()));
698   else
699     BX_WRITE_AVX_REGZ(i->dst(), op1, len);
700 
701   BX_NEXT_INSTR(i);
702 }
703 
VPSHLDVD_MASK_VdqHdqWdqR(bxInstruction_c * i)704 void BX_CPP_AttrRegparmN(1) BX_CPU_C::VPSHLDVD_MASK_VdqHdqWdqR(bxInstruction_c *i)
705 {
706   BxPackedAvxRegister dst = BX_READ_AVX_REG(i->dst()), op1 = BX_READ_AVX_REG(i->src1()), op2 = BX_READ_AVX_REG(i->src2());
707   unsigned len = i->getVL();
708 
709   for (unsigned n=0; n < DWORD_ELEMENTS(len); n++) {
710     unsigned count = op2.vmm32u(n) & 31;
711     if (count) {
712       dst.vmm32u(n) = (dst.vmm32u(n) << count) | (op1.vmm32u(n) >> (32 - count));
713     }
714   }
715 
716   if (i->opmask())
717     avx512_write_regd_masked(i, &dst, len, BX_READ_16BIT_OPMASK(i->opmask()));
718   else
719     BX_WRITE_AVX_REGZ(i->dst(), dst, len);
720 
721   BX_NEXT_INSTR(i);
722 }
723 
VPSHLDQ_MASK_VdqHdqWdqIbR(bxInstruction_c * i)724 void BX_CPP_AttrRegparmN(1) BX_CPU_C::VPSHLDQ_MASK_VdqHdqWdqIbR(bxInstruction_c *i)
725 {
726   BxPackedAvxRegister op1 = BX_READ_AVX_REG(i->src1()), op2 = BX_READ_AVX_REG(i->src2());
727   unsigned len = i->getVL();
728   unsigned count = i->Ib() & 63;
729 
730   if (count) {
731     for (unsigned n=0; n < QWORD_ELEMENTS(len); n++) {
732       op1.vmm64u(n) = (op1.vmm64u(n) << count) | (op2.vmm64u(n) >> (64 - count));
733     }
734   }
735 
736   if (i->opmask())
737     avx512_write_regq_masked(i, &op1, len, BX_READ_8BIT_OPMASK(i->opmask()));
738   else
739     BX_WRITE_AVX_REGZ(i->dst(), op1, len);
740 
741   BX_NEXT_INSTR(i);
742 }
743 
VPSHLDVQ_MASK_VdqHdqWdqR(bxInstruction_c * i)744 void BX_CPP_AttrRegparmN(1) BX_CPU_C::VPSHLDVQ_MASK_VdqHdqWdqR(bxInstruction_c *i)
745 {
746   BxPackedAvxRegister dst = BX_READ_AVX_REG(i->dst()), op1 = BX_READ_AVX_REG(i->src1()), op2 = BX_READ_AVX_REG(i->src2());
747   unsigned len = i->getVL();
748 
749   for (unsigned n=0; n < QWORD_ELEMENTS(len); n++) {
750     unsigned count = op2.vmm64u(n) & 63;
751     if (count) {
752       dst.vmm64u(n) = (dst.vmm64u(n) << count) | (op1.vmm64u(n) >> (64 - count));
753     }
754   }
755 
756   if (i->opmask())
757     avx512_write_regq_masked(i, &dst, len, BX_READ_8BIT_OPMASK(i->opmask()));
758   else
759     BX_WRITE_AVX_REGZ(i->dst(), dst, len);
760 
761   BX_NEXT_INSTR(i);
762 }
763 
VPSHRDW_MASK_VdqHdqWdqIbR(bxInstruction_c * i)764 void BX_CPP_AttrRegparmN(1) BX_CPU_C::VPSHRDW_MASK_VdqHdqWdqIbR(bxInstruction_c *i)
765 {
766   BxPackedAvxRegister op1 = BX_READ_AVX_REG(i->src1()), op2 = BX_READ_AVX_REG(i->src2());
767   unsigned len = i->getVL();
768   unsigned count = i->Ib() & 15;
769 
770   if (count) {
771     for (unsigned n=0; n < WORD_ELEMENTS(len); n++) {
772       op1.vmm16u(n) = (op1.vmm16u(n) >> count) | (op2.vmm16u(n) << (16 - count));
773     }
774   }
775 
776   if (i->opmask())
777     avx512_write_regw_masked(i, &op1, len, BX_READ_32BIT_OPMASK(i->opmask()));
778   else
779     BX_WRITE_AVX_REGZ(i->dst(), op1, len);
780 
781   BX_NEXT_INSTR(i);
782 }
783 
VPSHRDVW_MASK_VdqHdqWdqR(bxInstruction_c * i)784 void BX_CPP_AttrRegparmN(1) BX_CPU_C::VPSHRDVW_MASK_VdqHdqWdqR(bxInstruction_c *i)
785 {
786   BxPackedAvxRegister dst = BX_READ_AVX_REG(i->dst()), op1 = BX_READ_AVX_REG(i->src1()), op2 = BX_READ_AVX_REG(i->src2());
787   unsigned len = i->getVL();
788 
789   for (unsigned n=0; n < WORD_ELEMENTS(len); n++) {
790     unsigned count = op2.vmm16u(n) & 15;
791     if (count) {
792       dst.vmm16u(n) = (dst.vmm16u(n) >> count) | (op1.vmm16u(n) << (16 - count));
793     }
794   }
795 
796   if (i->opmask())
797     avx512_write_regw_masked(i, &dst, len, BX_READ_32BIT_OPMASK(i->opmask()));
798   else
799     BX_WRITE_AVX_REGZ(i->dst(), dst, len);
800 
801   BX_NEXT_INSTR(i);
802 }
803 
VPSHRDD_MASK_VdqHdqWdqIbR(bxInstruction_c * i)804 void BX_CPP_AttrRegparmN(1) BX_CPU_C::VPSHRDD_MASK_VdqHdqWdqIbR(bxInstruction_c *i)
805 {
806   BxPackedAvxRegister op1 = BX_READ_AVX_REG(i->src1()), op2 = BX_READ_AVX_REG(i->src2());
807   unsigned len = i->getVL();
808   unsigned count = i->Ib() & 31;
809 
810   if (count) {
811     for (unsigned n=0; n < DWORD_ELEMENTS(len); n++) {
812       op1.vmm32u(n) = (op1.vmm32u(n) >> count) | (op2.vmm32u(n) << (32 - count));
813     }
814   }
815 
816   if (i->opmask())
817     avx512_write_regd_masked(i, &op1, len, BX_READ_16BIT_OPMASK(i->opmask()));
818   else
819     BX_WRITE_AVX_REGZ(i->dst(), op1, len);
820 
821   BX_NEXT_INSTR(i);
822 }
823 
VPSHRDVD_MASK_VdqHdqWdqR(bxInstruction_c * i)824 void BX_CPP_AttrRegparmN(1) BX_CPU_C::VPSHRDVD_MASK_VdqHdqWdqR(bxInstruction_c *i)
825 {
826   BxPackedAvxRegister dst = BX_READ_AVX_REG(i->dst()), op1 = BX_READ_AVX_REG(i->src1()), op2 = BX_READ_AVX_REG(i->src2());
827   unsigned len = i->getVL();
828 
829   for (unsigned n=0; n < DWORD_ELEMENTS(len); n++) {
830     unsigned count = op2.vmm32u(n) & 31;
831     if (count) {
832       dst.vmm32u(n) = (dst.vmm32u(n) >> count) | (op1.vmm32u(n) << (32 - count));
833     }
834   }
835 
836   if (i->opmask())
837     avx512_write_regd_masked(i, &dst, len, BX_READ_16BIT_OPMASK(i->opmask()));
838   else
839     BX_WRITE_AVX_REGZ(i->dst(), dst, len);
840 
841   BX_NEXT_INSTR(i);
842 }
843 
VPSHRDQ_MASK_VdqHdqWdqIbR(bxInstruction_c * i)844 void BX_CPP_AttrRegparmN(1) BX_CPU_C::VPSHRDQ_MASK_VdqHdqWdqIbR(bxInstruction_c *i)
845 {
846   BxPackedAvxRegister op1 = BX_READ_AVX_REG(i->src1()), op2 = BX_READ_AVX_REG(i->src2());
847   unsigned len = i->getVL();
848   unsigned count = i->Ib() & 63;
849 
850   if (count) {
851     for (unsigned n=0; n < QWORD_ELEMENTS(len); n++) {
852       op1.vmm64u(n) = (op1.vmm64u(n) >> count) | (op2.vmm64u(n) << (64 - count));
853     }
854   }
855 
856   if (i->opmask())
857     avx512_write_regq_masked(i, &op1, len, BX_READ_8BIT_OPMASK(i->opmask()));
858   else
859     BX_WRITE_AVX_REGZ(i->dst(), op1, len);
860 
861   BX_NEXT_INSTR(i);
862 }
863 
VPSHRDVQ_MASK_VdqHdqWdqR(bxInstruction_c * i)864 void BX_CPP_AttrRegparmN(1) BX_CPU_C::VPSHRDVQ_MASK_VdqHdqWdqR(bxInstruction_c *i)
865 {
866   BxPackedAvxRegister dst = BX_READ_AVX_REG(i->dst()), op1 = BX_READ_AVX_REG(i->src1()), op2 = BX_READ_AVX_REG(i->src2());
867   unsigned len = i->getVL();
868 
869   for (unsigned n=0; n < QWORD_ELEMENTS(len); n++) {
870     unsigned count = op2.vmm64u(n) & 63;
871     if (count) {
872       dst.vmm64u(n) = (dst.vmm64u(n) >> count) | (op1.vmm64u(n) << (64 - count));
873     }
874   }
875 
876   if (i->opmask())
877     avx512_write_regq_masked(i, &dst, len, BX_READ_8BIT_OPMASK(i->opmask()));
878   else
879     BX_WRITE_AVX_REGZ(i->dst(), dst, len);
880 
881   BX_NEXT_INSTR(i);
882 }
883 
884 
885 // absolute value
886 
VPABSB_MASK_VdqWdqR(bxInstruction_c * i)887 void BX_CPP_AttrRegparmN(1) BX_CPU_C::VPABSB_MASK_VdqWdqR(bxInstruction_c *i)
888 {
889   BxPackedAvxRegister op = BX_READ_AVX_REG(i->src());
890   unsigned len = i->getVL();
891 
892   for (unsigned n=0; n < len; n++)
893     xmm_pabsb(&op.vmm128(n));
894 
895   avx512_write_regb_masked(i, &op, len, BX_READ_OPMASK(i->opmask()));
896   BX_NEXT_INSTR(i);
897 }
898 
VPABSW_MASK_VdqWdqR(bxInstruction_c * i)899 void BX_CPP_AttrRegparmN(1) BX_CPU_C::VPABSW_MASK_VdqWdqR(bxInstruction_c *i)
900 {
901   BxPackedAvxRegister op = BX_READ_AVX_REG(i->src());
902   unsigned len = i->getVL();
903 
904   for (unsigned n=0; n < len; n++)
905     xmm_pabsw(&op.vmm128(n));
906 
907   avx512_write_regw_masked(i, &op, len, BX_READ_32BIT_OPMASK(i->opmask()));
908   BX_NEXT_INSTR(i);
909 }
910 
VPABSD_MASK_VdqWdqR(bxInstruction_c * i)911 void BX_CPP_AttrRegparmN(1) BX_CPU_C::VPABSD_MASK_VdqWdqR(bxInstruction_c *i)
912 {
913   BxPackedAvxRegister op = BX_READ_AVX_REG(i->src());
914   unsigned len = i->getVL();
915 
916   for (unsigned n=0; n < len; n++)
917     xmm_pabsd(&op.vmm128(n));
918 
919   avx512_write_regd_masked(i, &op, len, BX_READ_16BIT_OPMASK(i->opmask()));
920   BX_NEXT_INSTR(i);
921 }
922 
VPABSQ_MASK_VdqWdqR(bxInstruction_c * i)923 void BX_CPP_AttrRegparmN(1) BX_CPU_C::VPABSQ_MASK_VdqWdqR(bxInstruction_c *i)
924 {
925   BxPackedAvxRegister op = BX_READ_AVX_REG(i->src());
926   unsigned len = i->getVL();
927 
928   for (unsigned n=0; n < len; n++)
929     xmm_pabsq(&op.vmm128(n));
930 
931   avx512_write_regq_masked(i, &op, len, BX_READ_8BIT_OPMASK(i->opmask()));
932   BX_NEXT_INSTR(i);
933 }
934 
935 // shuffle and permute
936 
VPSHUFHW_MASK_VdqWdqIbR(bxInstruction_c * i)937 void BX_CPP_AttrRegparmN(1) BX_CPU_C::VPSHUFHW_MASK_VdqWdqIbR(bxInstruction_c *i)
938 {
939   BxPackedAvxRegister op = BX_READ_AVX_REG(i->src()), result;
940   Bit8u order = i->Ib();
941   unsigned len = i->getVL();
942 
943   for (unsigned n=0; n < len; n++)
944     xmm_pshufhw(&result.vmm128(n), &op.vmm128(n), order);
945 
946   avx512_write_regw_masked(i, &result, len, BX_READ_32BIT_OPMASK(i->opmask()));
947   BX_NEXT_INSTR(i);
948 }
949 
VPSHUFLW_MASK_VdqWdqIbR(bxInstruction_c * i)950 void BX_CPP_AttrRegparmN(1) BX_CPU_C::VPSHUFLW_MASK_VdqWdqIbR(bxInstruction_c *i)
951 {
952   BxPackedAvxRegister op = BX_READ_AVX_REG(i->src()), result;
953   Bit8u order = i->Ib();
954   unsigned len = i->getVL();
955 
956   for (unsigned n=0; n < len; n++)
957     xmm_pshuflw(&result.vmm128(n), &op.vmm128(n), order);
958 
959   avx512_write_regw_masked(i, &result, len, BX_READ_32BIT_OPMASK(i->opmask()));
960   BX_NEXT_INSTR(i);
961 }
962 
VPSHUFB_MASK_VdqHdqWdqR(bxInstruction_c * i)963 void BX_CPP_AttrRegparmN(1) BX_CPU_C::VPSHUFB_MASK_VdqHdqWdqR(bxInstruction_c *i)
964 {
965   BxPackedAvxRegister op1 = BX_READ_AVX_REG(i->src1());
966   BxPackedAvxRegister op2 = BX_READ_AVX_REG(i->src2()), result;
967   unsigned len = i->getVL();
968 
969   for (unsigned n=0; n < len; n++)
970     xmm_pshufb(&result.vmm128(n), &op1.vmm128(n), &op2.vmm128(n));
971 
972   avx512_write_regb_masked(i, &result, len, BX_READ_OPMASK(i->opmask()));
973   BX_NEXT_INSTR(i);
974 }
975 
VSHUFPS_MASK_VpsHpsWpsIbR(bxInstruction_c * i)976 void BX_CPP_AttrRegparmN(1) BX_CPU_C::VSHUFPS_MASK_VpsHpsWpsIbR(bxInstruction_c *i)
977 {
978   BxPackedAvxRegister op1 = BX_READ_AVX_REG(i->src1());
979   BxPackedAvxRegister op2 = BX_READ_AVX_REG(i->src2()), result;
980   unsigned len = i->getVL();
981 
982   for (unsigned n=0; n < len; n++)
983     xmm_shufps(&result.vmm128(n), &op1.vmm128(n), &op2.vmm128(n), i->Ib());
984 
985   avx512_write_regd_masked(i, &result, len, BX_READ_16BIT_OPMASK(i->opmask()));
986   BX_NEXT_INSTR(i);
987 }
988 
VSHUFPD_MASK_VpdHpdWpdIbR(bxInstruction_c * i)989 void BX_CPP_AttrRegparmN(1) BX_CPU_C::VSHUFPD_MASK_VpdHpdWpdIbR(bxInstruction_c *i)
990 {
991   BxPackedAvxRegister op1 = BX_READ_AVX_REG(i->src1());
992   BxPackedAvxRegister op2 = BX_READ_AVX_REG(i->src2()), result;
993 
994   unsigned len = i->getVL();
995   Bit8u order = i->Ib();
996 
997   for (unsigned n=0; n < len; n++) {
998     xmm_shufpd(&result.vmm128(n), &op1.vmm128(n), &op2.vmm128(n), order);
999     order >>= 2;
1000   }
1001 
1002   avx512_write_regq_masked(i, &result, len, BX_READ_8BIT_OPMASK(i->opmask()));
1003   BX_NEXT_INSTR(i);
1004 }
1005 
VPERMILPS_MASK_VpsHpsWpsR(bxInstruction_c * i)1006 void BX_CPP_AttrRegparmN(1) BX_CPU_C::VPERMILPS_MASK_VpsHpsWpsR(bxInstruction_c *i)
1007 {
1008   BxPackedAvxRegister op1 = BX_READ_AVX_REG(i->src1());
1009   BxPackedAvxRegister op2 = BX_READ_AVX_REG(i->src2()), result;
1010   unsigned len = i->getVL();
1011 
1012   for (unsigned n=0; n < len; n++)
1013     xmm_permilps(&result.vmm128(n), &op1.vmm128(n), &op2.vmm128(n));
1014 
1015   avx512_write_regd_masked(i, &result, len, BX_READ_16BIT_OPMASK(i->opmask()));
1016   BX_NEXT_INSTR(i);
1017 }
1018 
VPERMILPD_MASK_VpdHpdWpdR(bxInstruction_c * i)1019 void BX_CPP_AttrRegparmN(1) BX_CPU_C::VPERMILPD_MASK_VpdHpdWpdR(bxInstruction_c *i)
1020 {
1021   BxPackedAvxRegister op1 = BX_READ_AVX_REG(i->src1());
1022   BxPackedAvxRegister op2 = BX_READ_AVX_REG(i->src2()), result;
1023   unsigned len = i->getVL();
1024 
1025   for (unsigned n=0; n < len; n++)
1026     xmm_permilpd(&result.vmm128(n), &op1.vmm128(n), &op2.vmm128(n));
1027 
1028   avx512_write_regq_masked(i, &result, len, BX_READ_8BIT_OPMASK(i->opmask()));
1029   BX_NEXT_INSTR(i);
1030 }
1031 
VPERMILPS_MASK_VpsWpsIbR(bxInstruction_c * i)1032 void BX_CPP_AttrRegparmN(1) BX_CPU_C::VPERMILPS_MASK_VpsWpsIbR(bxInstruction_c *i)
1033 {
1034   BxPackedAvxRegister op1 = BX_READ_AVX_REG(i->src()), result;
1035   unsigned len = i->getVL();
1036 
1037   for (unsigned n=0; n < len; n++)
1038     xmm_shufps(&result.vmm128(n), &op1.vmm128(n), &op1.vmm128(n), i->Ib());
1039 
1040   avx512_write_regd_masked(i, &result, len, BX_READ_16BIT_OPMASK(i->opmask()));
1041   BX_NEXT_INSTR(i);
1042 }
1043 
VPERMILPD_MASK_VpdWpdIbR(bxInstruction_c * i)1044 void BX_CPP_AttrRegparmN(1) BX_CPU_C::VPERMILPD_MASK_VpdWpdIbR(bxInstruction_c *i)
1045 {
1046   BxPackedAvxRegister op1 = BX_READ_AVX_REG(i->src()), result;
1047   unsigned len = i->getVL();
1048   Bit8u order = i->Ib();
1049 
1050   for (unsigned n=0; n < len; n++) {
1051     xmm_shufpd(&result.vmm128(n), &op1.vmm128(n), &op1.vmm128(n), order);
1052     order >>= 2;
1053   }
1054 
1055   avx512_write_regq_masked(i, &result, len, BX_READ_8BIT_OPMASK(i->opmask()));
1056   BX_NEXT_INSTR(i);
1057 }
1058 
VSHUFF32x4_MASK_VpsHpsWpsIbR(bxInstruction_c * i)1059 void BX_CPP_AttrRegparmN(1) BX_CPU_C::VSHUFF32x4_MASK_VpsHpsWpsIbR(bxInstruction_c *i)
1060 {
1061   BxPackedAvxRegister op1 = BX_READ_AVX_REG(i->src1()), op2 = BX_READ_AVX_REG(i->src2()), result;
1062   unsigned len = i->getVL();
1063   Bit8u order = i->Ib();
1064 
1065   if (len == BX_VL256) {
1066     result.vmm128(0) = op1.vmm128(order & 0x1);
1067     result.vmm128(1) = op2.vmm128((order>>1) & 0x1);
1068   }
1069   else {
1070     result.vmm128(0) = op1.vmm128(order & 0x3);
1071     result.vmm128(1) = op1.vmm128((order>>2) & 0x3);
1072     result.vmm128(2) = op2.vmm128((order>>4) & 0x3);
1073     result.vmm128(3) = op2.vmm128((order>>6) & 0x3);
1074   }
1075 
1076   if (i->opmask()) {
1077     avx512_write_regd_masked(i, &result, len, BX_READ_16BIT_OPMASK(i->opmask()));
1078   }
1079   else {
1080     BX_WRITE_AVX_REGZ(i->dst(), result, len);
1081   }
1082 
1083   BX_NEXT_INSTR(i);
1084 }
1085 
VSHUFF64x2_MASK_VpdHpdWpdIbR(bxInstruction_c * i)1086 void BX_CPP_AttrRegparmN(1) BX_CPU_C::VSHUFF64x2_MASK_VpdHpdWpdIbR(bxInstruction_c *i)
1087 {
1088   BxPackedAvxRegister op1 = BX_READ_AVX_REG(i->src1()), op2 = BX_READ_AVX_REG(i->src2()), result;
1089   unsigned len = i->getVL();
1090   Bit8u order = i->Ib();
1091 
1092   if (len == BX_VL256) {
1093     result.vmm128(0) = op1.vmm128(order & 0x1);
1094     result.vmm128(1) = op2.vmm128((order>>1) & 0x1);
1095   }
1096   else {
1097     result.vmm128(0) = op1.vmm128(order & 0x3);
1098     result.vmm128(1) = op1.vmm128((order>>2) & 0x3);
1099     result.vmm128(2) = op2.vmm128((order>>4) & 0x3);
1100     result.vmm128(3) = op2.vmm128((order>>6) & 0x3);
1101   }
1102 
1103   if (i->opmask()) {
1104     avx512_write_regq_masked(i, &result, len, BX_READ_8BIT_OPMASK(i->opmask()));
1105   }
1106   else {
1107     BX_WRITE_AVX_REGZ(i->dst(), result, len);
1108   }
1109 
1110   BX_NEXT_INSTR(i);
1111 }
1112 
VPALIGNR_MASK_VdqHdqWdqIbR(bxInstruction_c * i)1113 void BX_CPP_AttrRegparmN(1) BX_CPU_C::VPALIGNR_MASK_VdqHdqWdqIbR(bxInstruction_c *i)
1114 {
1115   BxPackedAvxRegister op1 = BX_READ_AVX_REG(i->src1()), op2 = BX_READ_AVX_REG(i->src2());
1116   unsigned len = i->getVL();
1117 
1118   for (unsigned n=0; n<len; n++)
1119     xmm_palignr(&op2.vmm128(n), &op1.vmm128(n), i->Ib());
1120 
1121   avx512_write_regb_masked(i, &op2, len, BX_READ_OPMASK(i->opmask()));
1122 
1123   BX_NEXT_INSTR(i);
1124 }
1125 
VALIGND_MASK_VdqHdqWdqIbR(bxInstruction_c * i)1126 void BX_CPP_AttrRegparmN(1) BX_CPU_C::VALIGND_MASK_VdqHdqWdqIbR(bxInstruction_c *i)
1127 {
1128   BxPackedAvxRegister op1 = BX_READ_AVX_REG(i->src1()), op2 = BX_READ_AVX_REG(i->src2()), result;
1129   unsigned len = i->getVL(), elements_mask = DWORD_ELEMENTS(len) - 1;
1130   unsigned shift = i->Ib() & elements_mask;
1131 
1132   for (unsigned n=0; n <= elements_mask; n++) {
1133     unsigned index = (shift + n) & elements_mask;
1134     result.vmm32u(n) = ((n + shift) <= elements_mask) ? op2.vmm32u(index) : op1.vmm32u(index);
1135   }
1136 
1137   if (i->opmask()) {
1138     avx512_write_regd_masked(i, &result, len, BX_READ_16BIT_OPMASK(i->opmask()));
1139   }
1140   else {
1141     BX_WRITE_AVX_REGZ(i->dst(), result, len);
1142   }
1143 
1144   BX_NEXT_INSTR(i);
1145 }
1146 
VALIGNQ_MASK_VdqHdqWdqIbR(bxInstruction_c * i)1147 void BX_CPP_AttrRegparmN(1) BX_CPU_C::VALIGNQ_MASK_VdqHdqWdqIbR(bxInstruction_c *i)
1148 {
1149   BxPackedAvxRegister op1 = BX_READ_AVX_REG(i->src1()), op2 = BX_READ_AVX_REG(i->src2()), result;
1150   unsigned len = i->getVL(), elements_mask = QWORD_ELEMENTS(len) - 1;
1151   unsigned shift = i->Ib() & elements_mask;
1152 
1153   for (unsigned n=0; n <= elements_mask; n++) {
1154     unsigned index = (shift + n) & elements_mask;
1155     result.vmm64u(n) = ((n + shift) <= elements_mask) ? op2.vmm64u(index) : op1.vmm64u(index);
1156   }
1157 
1158   if (i->opmask()) {
1159     avx512_write_regq_masked(i, &result, len, BX_READ_8BIT_OPMASK(i->opmask()));
1160   }
1161   else {
1162     BX_WRITE_AVX_REGZ(i->dst(), result, len);
1163   }
1164 
1165   BX_NEXT_INSTR(i);
1166 }
1167 
VPERMQ_MASK_VdqWdqIbR(bxInstruction_c * i)1168 void BX_CPP_AttrRegparmN(1) BX_CPU_C::VPERMQ_MASK_VdqWdqIbR(bxInstruction_c *i)
1169 {
1170   BxPackedAvxRegister op = BX_READ_AVX_REG(i->src()), result;
1171   Bit8u control = i->Ib();
1172   unsigned len = i->getVL();
1173 
1174   ymm_vpermq(&result.vmm256(0), &op.vmm256(0), control);
1175   if (len == BX_VL512)
1176     ymm_vpermq(&result.vmm256(1), &op.vmm256(1), control);
1177 
1178   if (i->opmask()) {
1179     avx512_write_regq_masked(i, &result, len, BX_READ_8BIT_OPMASK(i->opmask()));
1180   }
1181   else {
1182     BX_WRITE_AVX_REGZ(i->dst(), result, len);
1183   }
1184 
1185   BX_NEXT_INSTR(i);
1186 }
1187 
VPERMT2B_MASK_VdqHdqWdqR(bxInstruction_c * i)1188 void BX_CPP_AttrRegparmN(1) BX_CPU_C::VPERMT2B_MASK_VdqHdqWdqR(bxInstruction_c *i)
1189 {
1190   BxPackedAvxRegister op1 = BX_READ_AVX_REG(i->src1());
1191   BxPackedAvxRegister op2 = BX_READ_AVX_REG(i->src2());
1192   BxPackedAvxRegister dst = BX_READ_AVX_REG(i->dst()), result;
1193   unsigned len = i->getVL(), elements = BYTE_ELEMENTS(len);
1194   unsigned shuffle_control_mask = elements - 1;
1195 
1196   for (unsigned n=0; n < elements; n++) {
1197     unsigned shuffle_control = (unsigned) (op1.vmmubyte(n) & shuffle_control_mask);
1198     result.vmmubyte(n) = (op1.vmmubyte(n) & elements) ? op2.vmmubyte(shuffle_control) : dst.vmmubyte(shuffle_control);
1199   }
1200 
1201   if (i->opmask()) {
1202     avx512_write_regb_masked(i, &result, len, BX_READ_OPMASK(i->opmask()));
1203   }
1204   else {
1205     BX_WRITE_AVX_REGZ(i->dst(), result, len);
1206   }
1207 
1208   BX_NEXT_INSTR(i);
1209 }
1210 
VPERMT2W_MASK_VdqHdqWdqR(bxInstruction_c * i)1211 void BX_CPP_AttrRegparmN(1) BX_CPU_C::VPERMT2W_MASK_VdqHdqWdqR(bxInstruction_c *i)
1212 {
1213   BxPackedAvxRegister op1 = BX_READ_AVX_REG(i->src1());
1214   BxPackedAvxRegister op2 = BX_READ_AVX_REG(i->src2());
1215   BxPackedAvxRegister dst = BX_READ_AVX_REG(i->dst()), result;
1216   unsigned len = i->getVL(), elements = WORD_ELEMENTS(len);
1217   unsigned shuffle_control_mask = elements - 1;
1218 
1219   for (unsigned n=0; n < elements; n++) {
1220     unsigned shuffle_control = (unsigned) (op1.vmm16u(n) & shuffle_control_mask);
1221     result.vmm16u(n) = (op1.vmm16u(n) & elements) ? op2.vmm16u(shuffle_control) : dst.vmm16u(shuffle_control);
1222   }
1223 
1224   if (i->opmask()) {
1225     avx512_write_regw_masked(i, &result, len, BX_READ_32BIT_OPMASK(i->opmask()));
1226   }
1227   else {
1228     BX_WRITE_AVX_REGZ(i->dst(), result, len);
1229   }
1230 
1231   BX_NEXT_INSTR(i);
1232 }
1233 
VPERMT2PS_MASK_VpsHpsWpsR(bxInstruction_c * i)1234 void BX_CPP_AttrRegparmN(1) BX_CPU_C::VPERMT2PS_MASK_VpsHpsWpsR(bxInstruction_c *i)
1235 {
1236   BxPackedAvxRegister op1 = BX_READ_AVX_REG(i->src1());
1237   BxPackedAvxRegister op2 = BX_READ_AVX_REG(i->src2());
1238   BxPackedAvxRegister dst = BX_READ_AVX_REG(i->dst()), result;
1239   unsigned len = i->getVL(), elements = DWORD_ELEMENTS(len);
1240   unsigned shuffle_control_mask = elements - 1;
1241 
1242   for (unsigned n=0; n < elements; n++) {
1243     unsigned shuffle_control = (unsigned) (op1.vmm32u(n) & shuffle_control_mask);
1244     result.vmm32u(n) = (op1.vmm32u(n) & elements) ? op2.vmm32u(shuffle_control) : dst.vmm32u(shuffle_control);
1245   }
1246 
1247   if (i->opmask()) {
1248     avx512_write_regd_masked(i, &result, len, BX_READ_16BIT_OPMASK(i->opmask()));
1249   }
1250   else {
1251     BX_WRITE_AVX_REGZ(i->dst(), result, len);
1252   }
1253 
1254   BX_NEXT_INSTR(i);
1255 }
1256 
VPERMT2PD_MASK_VpdHpdWpdR(bxInstruction_c * i)1257 void BX_CPP_AttrRegparmN(1) BX_CPU_C::VPERMT2PD_MASK_VpdHpdWpdR(bxInstruction_c *i)
1258 {
1259   BxPackedAvxRegister op1 = BX_READ_AVX_REG(i->src1());
1260   BxPackedAvxRegister op2 = BX_READ_AVX_REG(i->src2());
1261   BxPackedAvxRegister dst = BX_READ_AVX_REG(i->dst()), result;
1262   unsigned len = i->getVL(), elements = QWORD_ELEMENTS(len);
1263   unsigned shuffle_control_mask = elements - 1;
1264 
1265   for (unsigned n=0; n < elements; n++) {
1266     unsigned shuffle_control = (unsigned) (op1.vmm64u(n) & shuffle_control_mask);
1267     result.vmm64u(n) = (op1.vmm64u(n) & elements) ? op2.vmm64u(shuffle_control) : dst.vmm64u(shuffle_control);
1268   }
1269 
1270   if (i->opmask()) {
1271     avx512_write_regq_masked(i, &result, len, BX_READ_8BIT_OPMASK(i->opmask()));
1272   }
1273   else {
1274     BX_WRITE_AVX_REGZ(i->dst(), result, len);
1275   }
1276 
1277   BX_NEXT_INSTR(i);
1278 }
1279 
VPERMI2B_MASK_VdqHdqWdqR(bxInstruction_c * i)1280 void BX_CPP_AttrRegparmN(1) BX_CPU_C::VPERMI2B_MASK_VdqHdqWdqR(bxInstruction_c *i)
1281 {
1282   BxPackedAvxRegister op1 = BX_READ_AVX_REG(i->src1());
1283   BxPackedAvxRegister op2 = BX_READ_AVX_REG(i->src2());
1284   BxPackedAvxRegister dst = BX_READ_AVX_REG(i->dst()), result;
1285   unsigned len = i->getVL(), elements = BYTE_ELEMENTS(len);
1286   unsigned shuffle_control_mask = elements - 1;
1287 
1288   for (unsigned n=0; n < elements; n++) {
1289     unsigned shuffle_control = (unsigned) (dst.vmmubyte(n) & shuffle_control_mask);
1290     result.vmmubyte(n) = (dst.vmmubyte(n) & elements) ? op2.vmmubyte(shuffle_control) : op1.vmmubyte(shuffle_control);
1291   }
1292 
1293   if (i->opmask()) {
1294     avx512_write_regb_masked(i, &result, len, BX_READ_OPMASK(i->opmask()));
1295   }
1296   else {
1297     BX_WRITE_AVX_REGZ(i->dst(), result, len);
1298   }
1299 
1300   BX_NEXT_INSTR(i);
1301 }
1302 
VPERMI2W_MASK_VdqHdqWdqR(bxInstruction_c * i)1303 void BX_CPP_AttrRegparmN(1) BX_CPU_C::VPERMI2W_MASK_VdqHdqWdqR(bxInstruction_c *i)
1304 {
1305   BxPackedAvxRegister op1 = BX_READ_AVX_REG(i->src1());
1306   BxPackedAvxRegister op2 = BX_READ_AVX_REG(i->src2());
1307   BxPackedAvxRegister dst = BX_READ_AVX_REG(i->dst()), result;
1308   unsigned len = i->getVL(), elements = WORD_ELEMENTS(len);
1309   unsigned shuffle_control_mask = elements - 1;
1310 
1311   for (unsigned n=0; n < elements; n++) {
1312     unsigned shuffle_control = (unsigned) (dst.vmm16u(n) & shuffle_control_mask);
1313     result.vmm16u(n) = (dst.vmm16u(n) & elements) ? op2.vmm16u(shuffle_control) : op1.vmm16u(shuffle_control);
1314   }
1315 
1316   if (i->opmask()) {
1317     avx512_write_regw_masked(i, &result, len, BX_READ_32BIT_OPMASK(i->opmask()));
1318   }
1319   else {
1320     BX_WRITE_AVX_REGZ(i->dst(), result, len);
1321   }
1322 
1323   BX_NEXT_INSTR(i);
1324 }
1325 
VPERMI2PS_MASK_VpsHpsWpsR(bxInstruction_c * i)1326 void BX_CPP_AttrRegparmN(1) BX_CPU_C::VPERMI2PS_MASK_VpsHpsWpsR(bxInstruction_c *i)
1327 {
1328   BxPackedAvxRegister op1 = BX_READ_AVX_REG(i->src1());
1329   BxPackedAvxRegister op2 = BX_READ_AVX_REG(i->src2());
1330   BxPackedAvxRegister dst = BX_READ_AVX_REG(i->dst()), result;
1331   unsigned len = i->getVL(), elements = DWORD_ELEMENTS(len);
1332   unsigned shuffle_control_mask = elements - 1;
1333 
1334   for (unsigned n=0; n < elements; n++) {
1335     unsigned shuffle_control = (unsigned) (dst.vmm32u(n) & shuffle_control_mask);
1336     result.vmm32u(n) = (dst.vmm32u(n) & elements) ? op2.vmm32u(shuffle_control) : op1.vmm32u(shuffle_control);
1337   }
1338 
1339   if (i->opmask()) {
1340     avx512_write_regd_masked(i, &result, len, BX_READ_16BIT_OPMASK(i->opmask()));
1341   }
1342   else {
1343     BX_WRITE_AVX_REGZ(i->dst(), result, len);
1344   }
1345 
1346   BX_NEXT_INSTR(i);
1347 }
1348 
// VPERMI2PD: two-table full permute of double-precision elements; the
// destination register supplies the per-element indices (see VPERMI2PS above).
void BX_CPP_AttrRegparmN(1) BX_CPU_C::VPERMI2PD_MASK_VpdHpdWpdR(bxInstruction_c *i)
{
  BxPackedAvxRegister op1 = BX_READ_AVX_REG(i->src1());
  BxPackedAvxRegister op2 = BX_READ_AVX_REG(i->src2());
  BxPackedAvxRegister dst = BX_READ_AVX_REG(i->dst()), result;
  unsigned len = i->getVL(), elements = QWORD_ELEMENTS(len);
  unsigned shuffle_control_mask = elements - 1; // low bits of each index select the element

  for (unsigned n=0; n < elements; n++) {
    unsigned shuffle_control = (unsigned) (dst.vmm64u(n) & shuffle_control_mask);
    // bit 'elements' of the index chooses between the two source tables
    result.vmm64u(n) = (dst.vmm64u(n) & elements) ? op2.vmm64u(shuffle_control) : op1.vmm64u(shuffle_control);
  }

  if (i->opmask()) {
    avx512_write_regq_masked(i, &result, len, BX_READ_8BIT_OPMASK(i->opmask()));
  }
  else {
    BX_WRITE_AVX_REGZ(i->dst(), result, len);
  }

  BX_NEXT_INSTR(i);
}
1371 
// VPERMB (AVX512-VBMI): permute bytes of src2 using the byte indices held in
// src1; indices are taken modulo the number of byte elements for this VL.
void BX_CPP_AttrRegparmN(1) BX_CPU_C::VPERMB_MASK_VdqHdqWdqR(bxInstruction_c *i)
{
  BxPackedAvxRegister op1 = BX_READ_AVX_REG(i->src1());
  BxPackedAvxRegister op2 = BX_READ_AVX_REG(i->src2()), result;
  unsigned len = i->getVL(), elements = BYTE_ELEMENTS(len);
  unsigned shuffle_control_mask = elements - 1;

  for (unsigned n=0;n < elements;n++)
    result.vmmubyte(n) = op2.vmmubyte(op1.vmmubyte(n) & shuffle_control_mask);

  if (i->opmask()) {
    // write result bytes under the 64-bit opmask
    avx512_write_regb_masked(i, &result, len, BX_READ_OPMASK(i->opmask()));
  }
  else {
    BX_WRITE_AVX_REGZ(i->dst(), result, len);
  }

  BX_NEXT_INSTR(i);
}
1391 
// VPERMW: permute words of src2 using the word indices held in src1;
// indices are taken modulo the number of word elements for this VL.
void BX_CPP_AttrRegparmN(1) BX_CPU_C::VPERMW_MASK_VdqHdqWdqR(bxInstruction_c *i)
{
  BxPackedAvxRegister op1 = BX_READ_AVX_REG(i->src1());
  BxPackedAvxRegister op2 = BX_READ_AVX_REG(i->src2()), result;
  unsigned len = i->getVL(), elements = WORD_ELEMENTS(len);
  unsigned shuffle_control_mask = elements - 1;

  for (unsigned n=0;n < elements;n++)
    result.vmm16u(n) = op2.vmm16u(op1.vmm16u(n) & shuffle_control_mask);

  if (i->opmask()) {
    avx512_write_regw_masked(i, &result, len, BX_READ_32BIT_OPMASK(i->opmask()));
  }
  else {
    BX_WRITE_AVX_REGZ(i->dst(), result, len);
  }

  BX_NEXT_INSTR(i);
}
1411 
// VPERMPS: permute dwords of src2 using the dword indices held in src1;
// indices are taken modulo the number of dword elements for this VL.
void BX_CPP_AttrRegparmN(1) BX_CPU_C::VPERMPS_MASK_VpsHpsWpsR(bxInstruction_c *i)
{
  BxPackedAvxRegister op1 = BX_READ_AVX_REG(i->src1());
  BxPackedAvxRegister op2 = BX_READ_AVX_REG(i->src2()), result;
  unsigned len = i->getVL(), elements = DWORD_ELEMENTS(len);
  unsigned shuffle_control_mask = elements - 1;

  for (unsigned n=0;n < elements;n++)
    result.vmm32u(n) = op2.vmm32u(op1.vmm32u(n) & shuffle_control_mask);

  if (i->opmask()) {
    avx512_write_regd_masked(i, &result, len, BX_READ_16BIT_OPMASK(i->opmask()));
  }
  else {
    BX_WRITE_AVX_REGZ(i->dst(), result, len);
  }

  BX_NEXT_INSTR(i);
}
1431 
// VPERMPD: permute qwords of src2 using the qword indices held in src1;
// indices are taken modulo the number of qword elements for this VL.
void BX_CPP_AttrRegparmN(1) BX_CPU_C::VPERMPD_MASK_VpdHpdWpdR(bxInstruction_c *i)
{
  BxPackedAvxRegister op1 = BX_READ_AVX_REG(i->src1());
  BxPackedAvxRegister op2 = BX_READ_AVX_REG(i->src2()), result;
  unsigned len = i->getVL(), elements = QWORD_ELEMENTS(len);
  unsigned shuffle_control_mask = elements - 1;

  for (unsigned n=0;n < elements;n++)
    result.vmm64u(n) = op2.vmm64u(op1.vmm64u(n) & shuffle_control_mask);

  if (i->opmask()) {
    avx512_write_regq_masked(i, &result, len, BX_READ_8BIT_OPMASK(i->opmask()));
  }
  else {
    BX_WRITE_AVX_REGZ(i->dst(), result, len);
  }

  BX_NEXT_INSTR(i);
}
1451 
// VINSERTF32x4 (masked): replace one 128-bit lane of src1 with the xmm of
// src2; the lane number is the immediate, wrapped to the vector length.
void BX_CPP_AttrRegparmN(1) BX_CPU_C::VINSERTF32x4_MASK_VpsHpsWpsIbR(bxInstruction_c *i)
{
  BxPackedAvxRegister op = BX_READ_AVX_REG(i->src1());
  unsigned len = i->getVL();
  unsigned offset = i->Ib() & (len-1); // lane index, modulo number of 128-bit lanes

  op.vmm128(offset) = BX_READ_XMM_REG(i->src2());
  // whole result is then written through the dword opmask
  avx512_write_regd_masked(i, &op, len, BX_READ_16BIT_OPMASK(i->opmask()));
  BX_NEXT_INSTR(i);
}
1462 
// VINSERTF64x2 (masked): same lane insertion as VINSERTF32x4 but the final
// write is masked at qword granularity.
void BX_CPP_AttrRegparmN(1) BX_CPU_C::VINSERTF64x2_MASK_VpdHpdWpdIbR(bxInstruction_c *i)
{
  BxPackedAvxRegister op = BX_READ_AVX_REG(i->src1());
  unsigned len = i->getVL();
  unsigned offset = i->Ib() & (len-1); // lane index, modulo number of 128-bit lanes

  op.vmm128(offset) = BX_READ_XMM_REG(i->src2());
  avx512_write_regq_masked(i, &op, len, BX_READ_8BIT_OPMASK(i->opmask()));

  BX_NEXT_INSTR(i);
}
1474 
// VINSERTF64x4 (unmasked, 512-bit only): replace the low or high 256-bit
// half of src1 with the ymm of src2, selected by bit 0 of the immediate.
void BX_CPP_AttrRegparmN(1) BX_CPU_C::VINSERTF64x4_VpdHpdWpdIbR(bxInstruction_c *i)
{
  BxPackedAvxRegister op = BX_READ_AVX_REG(i->src1());
  op.vmm256(i->Ib() & 0x1) = BX_READ_YMM_REG(i->src2());
  BX_WRITE_AVX_REGZ(i->dst(), op, BX_VL512);

  BX_NEXT_INSTR(i);
}
1483 
// VINSERTF64x4 (masked): 256-bit half insertion followed by a qword-masked
// write of the full 512-bit result.
void BX_CPP_AttrRegparmN(1) BX_CPU_C::VINSERTF64x4_MASK_VpdHpdWpdIbR(bxInstruction_c *i)
{
  BxPackedAvxRegister op = BX_READ_AVX_REG(i->src1());
  op.vmm256(i->Ib() & 0x1) = BX_READ_YMM_REG(i->src2());
  avx512_write_regq_masked(i, &op, BX_VL512, BX_READ_8BIT_OPMASK(i->opmask()));

  BX_NEXT_INSTR(i);
}
1492 
// VINSERTF32x8 (masked): 256-bit half insertion followed by a dword-masked
// write of the full 512-bit result.
void BX_CPP_AttrRegparmN(1) BX_CPU_C::VINSERTF32x8_MASK_VpsHpsWpsIbR(bxInstruction_c *i)
{
  BxPackedAvxRegister op = BX_READ_AVX_REG(i->src1());
  op.vmm256(i->Ib() & 0x1) = BX_READ_YMM_REG(i->src2());
  avx512_write_regd_masked(i, &op, BX_VL512, BX_READ_16BIT_OPMASK(i->opmask()));

  BX_NEXT_INSTR(i);
}
1501 
// VEXTRACTF32x4 (register destination, masked): extract one 128-bit lane of
// the source, then blend it into the destination xmm per the dword opmask.
void BX_CPP_AttrRegparmN(1) BX_CPU_C::VEXTRACTF32x4_MASK_WpsVpsIbR(bxInstruction_c *i)
{
  unsigned len = i->getVL(), offset = i->Ib() & (len - 1); // lane index mod lane count
  BxPackedXmmRegister op = BX_READ_AVX_REG_LANE(i->src(), offset);

  Bit32u mask = BX_READ_8BIT_OPMASK(i->opmask());

  if (i->isZeroMasking())
     xmm_zero_blendps(&BX_READ_XMM_REG(i->dst()), &op, mask); // masked-off dwords become 0
  else
     xmm_blendps(&BX_READ_XMM_REG(i->dst()), &op, mask);      // masked-off dwords keep dst

  BX_CLEAR_AVX_HIGH128(i->dst()); // upper bits of the destination are zeroed
  BX_NEXT_INSTR(i);
}
1517 
// VEXTRACTF32x4 (memory destination, masked): extract one 128-bit lane and
// store its four dwords to memory under the opmask.
void BX_CPP_AttrRegparmN(1) BX_CPU_C::VEXTRACTF32x4_MASK_WpsVpsIbM(bxInstruction_c *i)
{
  unsigned len = i->getVL(), offset = i->Ib() & (len - 1); // lane index mod lane count
  BxPackedAvxRegister op;
  op.vmm128(0) = BX_READ_AVX_REG_LANE(i->src(), offset);

  Bit32u opmask = BX_READ_8BIT_OPMASK(i->opmask()) & 0xf; // only 4 dwords are stored
  bx_address eaddr = BX_CPU_RESOLVE_ADDR(i);
  avx_masked_store32(i, eaddr, &op, opmask);

  BX_NEXT_INSTR(i);
}
1530 
// VEXTRACTF64x2 (register destination, masked): extract one 128-bit lane of
// the source, then blend it into the destination xmm per the qword opmask.
void BX_CPP_AttrRegparmN(1) BX_CPU_C::VEXTRACTF64x2_MASK_WpdVpdIbR(bxInstruction_c *i)
{
  unsigned len = i->getVL(), offset = i->Ib() & (len - 1); // lane index mod lane count
  BxPackedXmmRegister op = BX_READ_AVX_REG_LANE(i->src(), offset);

  Bit32u mask = BX_READ_8BIT_OPMASK(i->opmask());

  if (i->isZeroMasking())
     xmm_zero_blendpd(&BX_READ_XMM_REG(i->dst()), &op, mask); // masked-off qwords become 0
  else
     xmm_blendpd(&BX_READ_XMM_REG(i->dst()), &op, mask);      // masked-off qwords keep dst

  BX_CLEAR_AVX_HIGH128(i->dst());
  BX_NEXT_INSTR(i);
}
1546 
// VEXTRACTF64x2 (memory destination, masked): extract one 128-bit lane and
// store its two qwords to memory under the opmask.
void BX_CPP_AttrRegparmN(1) BX_CPU_C::VEXTRACTF64x2_MASK_WpdVpdIbM(bxInstruction_c *i)
{
  unsigned len = i->getVL(), offset = i->Ib() & (len - 1); // lane index mod lane count
  BxPackedAvxRegister op;
  op.vmm128(0) = BX_READ_AVX_REG_LANE(i->src(), offset);

  Bit32u opmask = BX_READ_8BIT_OPMASK(i->opmask()) & 0x3; // only 2 qwords are stored
  bx_address eaddr = BX_CPU_RESOLVE_ADDR(i);
  avx_masked_store64(i, eaddr, &op, opmask);

  BX_NEXT_INSTR(i);
}
1559 
// VEXTRACTF64x4 (register destination, unmasked): copy the low or high
// 256-bit half of the source, selected by bit 0 of the immediate.
void BX_CPP_AttrRegparmN(1) BX_CPU_C::VEXTRACTF64x4_WpdVpdIbR(bxInstruction_c *i)
{
  BxPackedAvxRegister op = BX_READ_AVX_REG(i->src());
  BX_WRITE_YMM_REGZ(i->dst(), op.vmm256(i->Ib() & 0x1));
  BX_NEXT_INSTR(i);
}
1566 
// VEXTRACTF64x4 (register destination, masked): move the selected 256-bit
// half to the low half, then write it through the qword opmask.
void BX_CPP_AttrRegparmN(1) BX_CPU_C::VEXTRACTF64x4_MASK_WpdVpdIbR(bxInstruction_c *i)
{
  BxPackedAvxRegister op = BX_READ_AVX_REG(i->src());
  if (i->Ib() & 0x1)
    op.vmm256(0) = op.vmm256(1); // bring the high half down before the masked write

  avx512_write_regq_masked(i, &op, BX_VL256, BX_READ_8BIT_OPMASK(i->opmask()));
  BX_NEXT_INSTR(i);
}
1576 
// VEXTRACTF64x4 (memory destination, unmasked): store the selected 256-bit
// half of the source to memory.
void BX_CPP_AttrRegparmN(1) BX_CPU_C::VEXTRACTF64x4_WpdVpdIbM(bxInstruction_c *i)
{
  BxPackedAvxRegister op = BX_READ_AVX_REG(i->src());
  bx_address eaddr = BX_CPU_RESOLVE_ADDR(i);
  write_virtual_ymmword(i->seg(), eaddr, &op.vmm256(i->Ib() & 0x1));
  BX_NEXT_INSTR(i);
}
1584 
// VEXTRACTF64x4 (memory destination, masked): move the selected 256-bit half
// down, then store its four qwords to memory under the opmask.
void BX_CPP_AttrRegparmN(1) BX_CPU_C::VEXTRACTF64x4_MASK_WpdVpdIbM(bxInstruction_c *i)
{
  BxPackedAvxRegister op = BX_READ_AVX_REG(i->src());
  if (i->Ib() & 0x1)
    op.vmm256(0) = op.vmm256(1);

  Bit32u opmask = BX_READ_8BIT_OPMASK(i->opmask()) & 0xf; // only 4 qwords are stored

  bx_address eaddr = BX_CPU_RESOLVE_ADDR(i);
  avx_masked_store64(i, eaddr, &op, opmask);
  BX_NEXT_INSTR(i);
}
1597 
// VEXTRACTF32x8 (register destination, masked): move the selected 256-bit
// half to the low half, then write it through the dword opmask.
void BX_CPP_AttrRegparmN(1) BX_CPU_C::VEXTRACTF32x8_MASK_WpsVpsIbR(bxInstruction_c *i)
{
  BxPackedAvxRegister op = BX_READ_AVX_REG(i->src());
  if (i->Ib() & 0x1)
    op.vmm256(0) = op.vmm256(1);

  avx512_write_regd_masked(i, &op, BX_VL256, BX_READ_8BIT_OPMASK(i->opmask()));
  BX_NEXT_INSTR(i);
}
1607 
// VEXTRACTF32x8 (memory destination, masked): move the selected 256-bit half
// down, then store its eight dwords to memory under the opmask.
void BX_CPP_AttrRegparmN(1) BX_CPU_C::VEXTRACTF32x8_MASK_WpsVpsIbM(bxInstruction_c *i)
{
  BxPackedAvxRegister op = BX_READ_AVX_REG(i->src());
  if (i->Ib() & 0x1)
    op.vmm256(0) = op.vmm256(1);

  Bit32u opmask = BX_READ_8BIT_OPMASK(i->opmask()); // 8 dwords <-> 8 mask bits
  bx_address eaddr = BX_CPU_RESOLVE_ADDR(i);
  avx_masked_store32(i, eaddr, &op, opmask);
  BX_NEXT_INSTR(i);
}
1619 
// VMOVDDUP (masked): duplicate each even-indexed qword into the following
// odd position, then write the result through the qword opmask.
void BX_CPP_AttrRegparmN(1) BX_CPU_C::VMOVDDUP_MASK_VpdWpdR(bxInstruction_c *i)
{
  BxPackedAvxRegister op = BX_READ_AVX_REG(i->src());
  unsigned len = i->getVL();

  for (unsigned n=0; n < QWORD_ELEMENTS(len); n+=2) {
    op.vmm64u(n+1) = op.vmm64u(n); // qword pair = {even, even}
  }

  avx512_write_regq_masked(i, &op, len, BX_READ_8BIT_OPMASK(i->opmask()));

  BX_NEXT_INSTR(i);
}
1633 
// VMOVSLDUP (masked): duplicate each even-indexed dword into the following
// odd position, then write the result through the dword opmask.
void BX_CPP_AttrRegparmN(1) BX_CPU_C::VMOVSLDUP_MASK_VpsWpsR(bxInstruction_c *i)
{
  BxPackedAvxRegister op = BX_READ_AVX_REG(i->src());
  unsigned len = i->getVL();

  for (unsigned n=0; n < DWORD_ELEMENTS(len); n+=2) {
    op.vmm32u(n+1) = op.vmm32u(n); // dword pair = {even, even}
  }

  avx512_write_regd_masked(i, &op, len, BX_READ_16BIT_OPMASK(i->opmask()));

  BX_NEXT_INSTR(i);
}
1647 
// VMOVSHDUP (masked): duplicate each odd-indexed dword into the preceding
// even position, then write the result through the dword opmask.
void BX_CPP_AttrRegparmN(1) BX_CPU_C::VMOVSHDUP_MASK_VpsWpsR(bxInstruction_c *i)
{
  BxPackedAvxRegister op = BX_READ_AVX_REG(i->src());
  unsigned len = i->getVL();

  for (unsigned n=0; n < DWORD_ELEMENTS(len); n+=2) {
    op.vmm32u(n) = op.vmm32u(n+1); // dword pair = {odd, odd}
  }

  avx512_write_regd_masked(i, &op, len, BX_READ_16BIT_OPMASK(i->opmask()));

  BX_NEXT_INSTR(i);
}
1661 
1662 // special bit operations
1663 
ternlogd_scalar(Bit32u op1,Bit32u op2,Bit32u op3,unsigned imm8)1664 BX_CPP_INLINE Bit32u ternlogd_scalar(Bit32u op1, Bit32u op2, Bit32u op3, unsigned imm8)
1665 {
1666   Bit32u result = 0;
1667 
1668   for (unsigned bit = 0; bit < 32; bit++) {
1669     unsigned tmp  = (op1 >> bit) & 0x1;
1670              tmp <<= 1;
1671              tmp |= (op2 >> bit) & 0x1;
1672              tmp <<= 1;
1673              tmp |= (op3 >> bit) & 0x1;
1674 
1675     result |= ((Bit32u)((imm8 >> tmp) & 0x1)) << bit;
1676   }
1677 
1678   return result;
1679 }
1680 
ternlogq_scalar(Bit64u op1,Bit64u op2,Bit64u op3,unsigned imm8)1681 BX_CPP_INLINE Bit64u ternlogq_scalar(Bit64u op1, Bit64u op2, Bit64u op3, unsigned imm8)
1682 {
1683   Bit64u result = 0;
1684 
1685   for (unsigned bit = 0; bit < 64; bit++) {
1686     unsigned tmp  = (op1 >> bit) & 0x1;
1687              tmp <<= 1;
1688              tmp |= (op2 >> bit) & 0x1;
1689              tmp <<= 1;
1690              tmp |= (op3 >> bit) & 0x1;
1691 
1692     result |= ((Bit64u)((imm8 >> tmp) & 0x1)) << bit;
1693   }
1694 
1695   return result;
1696 }
1697 
// VPTERNLOGD (unmasked): per-dword ternary logic of dst, src1 and src2 with
// the imm8 truth table; the destination is both an input and the output.
void BX_CPP_AttrRegparmN(1) BX_CPU_C::VPTERNLOGD_VdqHdqWdqIbR(bxInstruction_c *i)
{
  BxPackedAvxRegister op1 = BX_READ_AVX_REG(i->dst()),
                      op2 = BX_READ_AVX_REG(i->src1()),
                      op3 = BX_READ_AVX_REG(i->src2());

  unsigned len = i->getVL(), num_elements = DWORD_ELEMENTS(len);
  Bit8u imm8 = i->Ib();

  for (unsigned n=0; n < num_elements; n++) {
    op1.vmm32u(n) = ternlogd_scalar(op1.vmm32u(n), op2.vmm32u(n), op3.vmm32u(n), imm8);
  }

  BX_WRITE_AVX_REGZ(i->dst(), op1, len);
  BX_NEXT_INSTR(i);
}
1714 
// VPTERNLOGD (masked): per-dword ternary logic; masked-off elements keep the
// old destination value (merge masking) or become zero (zero masking).
void BX_CPP_AttrRegparmN(1) BX_CPU_C::VPTERNLOGD_MASK_VdqHdqWdqIbR(bxInstruction_c *i)
{
  BxPackedAvxRegister op1 = BX_READ_AVX_REG(i->dst()),
                      op2 = BX_READ_AVX_REG(i->src1()),
                      op3 = BX_READ_AVX_REG(i->src2());

  unsigned len = i->getVL(), num_elements = DWORD_ELEMENTS(len);
  Bit8u imm8 = i->Ib();

  Bit32u opmask = BX_READ_16BIT_OPMASK(i->opmask());

  for (unsigned n=0; n < num_elements; n++, opmask >>= 1) {
    if (opmask & 0x1)
      op1.vmm32u(n) = ternlogd_scalar(op1.vmm32u(n), op2.vmm32u(n), op3.vmm32u(n), imm8);
    else
      if (i->isZeroMasking()) op1.vmm32u(n) = 0;
  }

  BX_WRITE_AVX_REGZ(i->dst(), op1, len);
  BX_NEXT_INSTR(i);
}
1736 
// VPTERNLOGQ (unmasked): per-qword ternary logic of dst, src1 and src2 with
// the imm8 truth table; the destination is both an input and the output.
void BX_CPP_AttrRegparmN(1) BX_CPU_C::VPTERNLOGQ_VdqHdqWdqIbR(bxInstruction_c *i)
{
  BxPackedAvxRegister op1 = BX_READ_AVX_REG(i->dst()),
                      op2 = BX_READ_AVX_REG(i->src1()),
                      op3 = BX_READ_AVX_REG(i->src2());

  unsigned len = i->getVL(), num_elements = QWORD_ELEMENTS(len);
  Bit8u imm8 = i->Ib();

  for (unsigned n=0; n < num_elements; n++) {
    op1.vmm64u(n) = ternlogq_scalar(op1.vmm64u(n), op2.vmm64u(n), op3.vmm64u(n), imm8);
  }

  BX_WRITE_AVX_REGZ(i->dst(), op1, len);
  BX_NEXT_INSTR(i);
}
1753 
// VPTERNLOGQ (masked): per-qword ternary logic; masked-off elements keep the
// old destination value (merge masking) or become zero (zero masking).
void BX_CPP_AttrRegparmN(1) BX_CPU_C::VPTERNLOGQ_MASK_VdqHdqWdqIbR(bxInstruction_c *i)
{
  BxPackedAvxRegister op1 = BX_READ_AVX_REG(i->dst()),
                      op2 = BX_READ_AVX_REG(i->src1()),
                      op3 = BX_READ_AVX_REG(i->src2());

  unsigned len = i->getVL(), num_elements = QWORD_ELEMENTS(len);
  Bit8u imm8 = i->Ib();

  Bit32u opmask = BX_READ_8BIT_OPMASK(i->opmask());

  for (unsigned n=0; n < num_elements; n++, opmask >>= 1) {
    if (opmask & 0x1)
      op1.vmm64u(n) = ternlogq_scalar(op1.vmm64u(n), op2.vmm64u(n), op3.vmm64u(n), imm8);
    else
      if (i->isZeroMasking()) op1.vmm64u(n) = 0;
  }

  BX_WRITE_AVX_REGZ(i->dst(), op1, len);
  BX_NEXT_INSTR(i);
}
1775 
1776 // blend
1777 
// VPBLENDMB: byte blend of src1/src2 selected by the opmask (set bit picks
// src2). A zero opmask register means "no masking", i.e. all ones.
void BX_CPP_AttrRegparmN(1) BX_CPU_C::VPBLENDMB_MASK_VdqHdqWdqR(bxInstruction_c *i)
{
  BxPackedAvxRegister op1 = BX_READ_AVX_REG(i->src1()), op2 = BX_READ_AVX_REG(i->src2());
  unsigned len = i->getVL();

  Bit64u opmask = (i->opmask() != 0) ? BX_READ_OPMASK(i->opmask()) : BX_CONST64(0xffffffffffffffff);

  if (i->isZeroMasking()) {
    // zero masking: unselected bytes become zero
    for (unsigned n=0; n < len; n++, opmask >>= 16) // 16 bytes per 128-bit lane
      xmm_zero_pblendb(&op1.vmm128(n), &op2.vmm128(n), opmask);
  }
  else {
    // merge masking: unselected bytes keep the src1 value
    for (unsigned n=0; n < len; n++, opmask >>= 16)
      xmm_pblendb(&op1.vmm128(n), &op2.vmm128(n), opmask);
  }

  BX_WRITE_AVX_REGZ(i->dst(), op1, len);
  BX_NEXT_INSTR(i);
}
1797 
// VPBLENDMW: word blend of src1/src2 selected by the opmask (set bit picks
// src2). A zero opmask register means "no masking", i.e. all ones.
void BX_CPP_AttrRegparmN(1) BX_CPU_C::VPBLENDMW_MASK_VdqHdqWdqR(bxInstruction_c *i)
{
  BxPackedAvxRegister op1 = BX_READ_AVX_REG(i->src1()), op2 = BX_READ_AVX_REG(i->src2());
  unsigned len = i->getVL();

  Bit32u opmask = (i->opmask() != 0) ? BX_READ_32BIT_OPMASK(i->opmask()) : 0xffffffff;

  if (i->isZeroMasking()) {
    for (unsigned n=0; n < len; n++, opmask >>= 8) // 8 words per 128-bit lane
      xmm_zero_pblendw(&op1.vmm128(n), &op2.vmm128(n), opmask);
  }
  else {
    for (unsigned n=0; n < len; n++, opmask >>= 8)
      xmm_pblendw(&op1.vmm128(n), &op2.vmm128(n), opmask);
  }

  BX_WRITE_AVX_REGZ(i->dst(), op1, len);
  BX_NEXT_INSTR(i);
}
1817 
// VBLENDMPS: dword blend of src1/src2 selected by the opmask (set bit picks
// src2). A zero opmask register means "no masking", i.e. all ones.
void BX_CPP_AttrRegparmN(1) BX_CPU_C::VBLENDMPS_MASK_VpsHpsWpsR(bxInstruction_c *i)
{
  BxPackedAvxRegister op1 = BX_READ_AVX_REG(i->src1()), op2 = BX_READ_AVX_REG(i->src2());
  unsigned len = i->getVL();

  Bit32u opmask = (i->opmask() != 0) ? BX_READ_16BIT_OPMASK(i->opmask()) : 0xffff;

  if (i->isZeroMasking()) {
    for (unsigned n=0; n < len; n++, opmask >>= 4) // 4 dwords per 128-bit lane
      xmm_zero_blendps(&op1.vmm128(n), &op2.vmm128(n), opmask);
  }
  else {
    for (unsigned n=0; n < len; n++, opmask >>= 4)
      xmm_blendps(&op1.vmm128(n), &op2.vmm128(n), opmask);
  }

  BX_WRITE_AVX_REGZ(i->dst(), op1, len);
  BX_NEXT_INSTR(i);
}
1837 
// VBLENDMPD: qword blend of src1/src2 selected by the opmask (set bit picks
// src2). A zero opmask register means "no masking", i.e. all ones.
void BX_CPP_AttrRegparmN(1) BX_CPU_C::VBLENDMPD_MASK_VpdHpdWpdR(bxInstruction_c *i)
{
  BxPackedAvxRegister op1 = BX_READ_AVX_REG(i->src1()), op2 = BX_READ_AVX_REG(i->src2());
  unsigned len = i->getVL();

  Bit32u opmask = (i->opmask() != 0) ? BX_READ_8BIT_OPMASK(i->opmask()) : 0xff;

  if (i->isZeroMasking()) {
    for (unsigned n=0; n < len; n++, opmask >>= 2) // 2 qwords per 128-bit lane
      xmm_zero_blendpd(&op1.vmm128(n), &op2.vmm128(n), opmask);
  }
  else {
    for (unsigned n=0; n < len; n++, opmask >>= 2)
      xmm_blendpd(&op1.vmm128(n), &op2.vmm128(n), opmask);
  }

  BX_WRITE_AVX_REGZ(i->dst(), op1, len);
  BX_NEXT_INSTR(i);
}
1857 
1858 // compress, expand
1859 
// VPEXPANDB: scatter the contiguous low bytes of the source into the
// destination positions whose opmask bit is set; other positions are zeroed
// in the temporary and then resolved by zero/merge masking.
void BX_CPP_AttrRegparmN(1) BX_CPU_C::VPEXPANDB_MASK_VdqWdqR(bxInstruction_c *i)
{
  BxPackedAvxRegister op = BX_READ_AVX_REG(i->src()), result;

  Bit64u opmask = BX_READ_OPMASK(i->opmask()), mask = opmask;
  unsigned len = i->getVL(), n = 0, k = 0;

  // k walks the packed source bytes, n walks the destination positions
  for (; n < len*16; n++, mask >>= 1) {
    if (mask & 0x1) {
      result.vmmubyte(n) = op.vmmubyte(k);
      k++;
    }
    else {
      result.vmmubyte(n) = 0;
    }
  }

  if (i->isZeroMasking()) {
    BX_WRITE_AVX_REGZ(i->dst(), result, len);
  }
  else {
    // merge masking: unselected bytes keep the old destination value
    for (unsigned n=0; n < len; n++, opmask >>= 16)
      xmm_pblendb(&BX_READ_AVX_REG_LANE(i->dst(), n), &result.vmm128(n), opmask);

    BX_CLEAR_AVX_REGZ(i->dst(), len);
  }

  BX_NEXT_INSTR(i);
}
1889 
// VPEXPANDW: scatter the contiguous low words of the source into the
// destination positions whose opmask bit is set (see VPEXPANDB above).
void BX_CPP_AttrRegparmN(1) BX_CPU_C::VPEXPANDW_MASK_VdqWdqR(bxInstruction_c *i)
{
  BxPackedAvxRegister op = BX_READ_AVX_REG(i->src()), result;

  Bit32u opmask = BX_READ_32BIT_OPMASK(i->opmask()), mask = opmask;
  unsigned len = i->getVL(), n = 0, k = 0;

  // k walks the packed source words, n walks the destination positions
  for (; n < len*8; n++, mask >>= 1) {
    if (mask & 0x1) {
      result.vmm16u(n) = op.vmm16u(k);
      k++;
    }
    else {
      result.vmm16u(n) = 0;
    }
  }

  if (i->isZeroMasking()) {
    BX_WRITE_AVX_REGZ(i->dst(), result, len);
  }
  else {
    for (unsigned n=0; n < len; n++, opmask >>= 8)
      xmm_pblendw(&BX_READ_AVX_REG_LANE(i->dst(), n), &result.vmm128(n), opmask);

    BX_CLEAR_AVX_REGZ(i->dst(), len);
  }

  BX_NEXT_INSTR(i);
}
1919 
// VEXPANDPS: scatter the contiguous low dwords of the source into the
// destination positions whose opmask bit is set (see VPEXPANDB above).
void BX_CPP_AttrRegparmN(1) BX_CPU_C::VEXPANDPS_MASK_VpsWpsR(bxInstruction_c *i)
{
  BxPackedAvxRegister op = BX_READ_AVX_REG(i->src()), result;

  Bit32u opmask = BX_READ_16BIT_OPMASK(i->opmask()), mask = opmask;
  unsigned len = i->getVL(), n = 0, k = 0;

  // k walks the packed source dwords, n walks the destination positions
  for (; n < len*4; n++, mask >>= 1) {
    if (mask & 0x1) {
      result.vmm32u(n) = op.vmm32u(k);
      k++;
    }
    else {
      result.vmm32u(n) = 0;
    }
  }

  if (i->isZeroMasking()) {
    BX_WRITE_AVX_REGZ(i->dst(), result, len);
  }
  else {
    for (unsigned n=0; n < len; n++, opmask >>= 4)
      xmm_blendps(&BX_READ_AVX_REG_LANE(i->dst(), n), &result.vmm128(n), opmask);

    BX_CLEAR_AVX_REGZ(i->dst(), len);
  }

  BX_NEXT_INSTR(i);
}
1949 
// VEXPANDPD: scatter the contiguous low qwords of the source into the
// destination positions whose opmask bit is set (see VPEXPANDB above).
void BX_CPP_AttrRegparmN(1) BX_CPU_C::VEXPANDPD_MASK_VpdWpdR(bxInstruction_c *i)
{
  BxPackedAvxRegister op = BX_READ_AVX_REG(i->src()), result;

  Bit32u opmask = BX_READ_8BIT_OPMASK(i->opmask()), mask = opmask;
  unsigned len = i->getVL(), n = 0, k = 0;

  // k walks the packed source qwords, n walks the destination positions
  for (; n < len*2; n++, mask >>= 1) {
    if (mask & 0x1) {
      result.vmm64u(n) = op.vmm64u(k);
      k++;
    }
    else {
      result.vmm64u(n) = 0;
    }
  }

  if (i->isZeroMasking()) {
    BX_WRITE_AVX_REGZ(i->dst(), result, len);
  }
  else {
    for (unsigned n=0; n < len; n++, opmask >>= 2)
      xmm_blendpd(&BX_READ_AVX_REG_LANE(i->dst(), n), &result.vmm128(n), opmask);

    BX_CLEAR_AVX_REGZ(i->dst(), len);
  }

  BX_NEXT_INSTR(i);
}
1979 
VPCOMPRESSB_MASK_WdqVdq(bxInstruction_c * i)1980 void BX_CPP_AttrRegparmN(1) BX_CPU_C::VPCOMPRESSB_MASK_WdqVdq(bxInstruction_c *i)
1981 {
1982   BxPackedAvxRegister op = BX_READ_AVX_REG(i->src()), result;
1983 
1984   Bit64u opmask = BX_READ_OPMASK(i->opmask());
1985   unsigned len = i->getVL(), n = 0, k = 0;
1986 
1987   for (; n < len*16; n++, opmask >>= 1) {
1988     if (opmask & 0x1) {
1989       result.vmmubyte(k) = op.vmmubyte(n);
1990       k++;
1991     }
1992     if (! opmask) break;
1993   }
1994 
1995   Bit64u writemask = (BX_CONST64(1) << k) - 1;
1996 
1997   if (i->modC0()) {
1998     avx512_write_regb_masked(i, &result, len, writemask);
1999   }
2000   else {
2001     bx_address eaddr = BX_CPU_RESOLVE_ADDR(i);
2002     avx_masked_store8(i, eaddr, &result, writemask);
2003   }
2004 
2005   BX_NEXT_INSTR(i);
2006 }
2007 
VPCOMPRESSW_MASK_WdqVdq(bxInstruction_c * i)2008 void BX_CPP_AttrRegparmN(1) BX_CPU_C::VPCOMPRESSW_MASK_WdqVdq(bxInstruction_c *i)
2009 {
2010   BxPackedAvxRegister op = BX_READ_AVX_REG(i->src()), result;
2011 
2012   Bit32u opmask = BX_READ_32BIT_OPMASK(i->opmask());
2013   unsigned len = i->getVL(), n = 0, k = 0;
2014 
2015   for (; n < len*8; n++, opmask >>= 1) {
2016     if (opmask & 0x1) {
2017       result.vmm16u(k) = op.vmm16u(n);
2018       k++;
2019     }
2020     if (! opmask) break;
2021   }
2022 
2023   Bit32u writemask = (1 << k) - 1;
2024 
2025   if (i->modC0()) {
2026     avx512_write_regw_masked(i, &result, len, writemask);
2027   }
2028   else {
2029     bx_address eaddr = BX_CPU_RESOLVE_ADDR(i);
2030     avx_masked_store16(i, eaddr, &result, writemask);
2031   }
2032 
2033   BX_NEXT_INSTR(i);
2034 }
2035 
// VCOMPRESSPS: gather the source dwords whose opmask bit is set into a
// contiguous run at the bottom of the result, then write the first k dwords.
void BX_CPP_AttrRegparmN(1) BX_CPU_C::VCOMPRESSPS_MASK_WpsVps(bxInstruction_c *i)
{
  BxPackedAvxRegister op = BX_READ_AVX_REG(i->src()), result;

  Bit32u opmask = BX_READ_16BIT_OPMASK(i->opmask());
  unsigned len = i->getVL(), n = 0, k = 0;

  for (; n < len*4; n++, opmask >>= 1) {
    if (opmask & 0x1) {
      result.vmm32u(k) = op.vmm32u(n);
      k++;
    }
    if (! opmask) break; // nothing left to copy
  }

  Bit32u writemask = (1 << k) - 1; // k <= 16 here, shift is safe

  if (i->modC0()) {
    avx512_write_regd_masked(i, &result, len, writemask);
  }
  else {
    bx_address eaddr = BX_CPU_RESOLVE_ADDR(i);
    avx_masked_store32(i, eaddr, &result, writemask);
  }

  BX_NEXT_INSTR(i);
}
2063 
// VCOMPRESSPD: gather the source qwords whose opmask bit is set into a
// contiguous run at the bottom of the result, then write the first k qwords.
void BX_CPP_AttrRegparmN(1) BX_CPU_C::VCOMPRESSPD_MASK_WpdVpd(bxInstruction_c *i)
{
  BxPackedAvxRegister op = BX_READ_AVX_REG(i->src()), result;

  Bit32u opmask = BX_READ_8BIT_OPMASK(i->opmask());
  unsigned len = i->getVL(), n = 0, k = 0;

  for (; n < len*2; n++, opmask >>= 1) {
    if (opmask & 0x1) {
      result.vmm64u(k) = op.vmm64u(n);
      k++;
    }
    if (! opmask) break; // nothing left to copy
  }

  Bit32u writemask = (1 << k) - 1; // k <= 8 here, shift is safe

  if (i->modC0()) {
    avx512_write_regq_masked(i, &result, len, writemask);
  }
  else {
    bx_address eaddr = BX_CPU_RESOLVE_ADDR(i);
    avx_masked_store64(i, eaddr, &result, writemask);
  }

  BX_NEXT_INSTR(i);
}
2091 
2092 // convert mask
2093 
// VPMOVM2B: broadcast each opmask bit into a full byte of the destination
// (1 -> 0xFF, 0 -> 0x00), 16 mask bits per 128-bit lane.
void BX_CPP_AttrRegparmN(1) BX_CPU_C::VPMOVM2B_VdqKEqR(bxInstruction_c *i)
{
  Bit64u opmask = BX_READ_OPMASK(i->src());
  unsigned len = i->getVL();

  for (unsigned n=0; n<len; n++) {
    xmm_pmovm2b(&BX_READ_AVX_REG_LANE(i->dst(), n), (Bit32u) opmask);
    opmask >>= 16; // consume 16 mask bits per lane
  }

  BX_CLEAR_AVX_REGZ(i->dst(), len);
  BX_NEXT_INSTR(i);
}
2107 
// VPMOVM2W: broadcast each opmask bit into a full word of the destination,
// 8 mask bits per 128-bit lane.
void BX_CPP_AttrRegparmN(1) BX_CPU_C::VPMOVM2W_VdqKEdR(bxInstruction_c *i)
{
  Bit32u opmask = BX_READ_32BIT_OPMASK(i->src());
  unsigned len = i->getVL();

  for (unsigned n=0; n<len; n++) {
    xmm_pmovm2w(&BX_READ_AVX_REG_LANE(i->dst(), n), opmask);
    opmask >>= 8; // consume 8 mask bits per lane
  }

  BX_CLEAR_AVX_REGZ(i->dst(), len);
  BX_NEXT_INSTR(i);
}
2121 
// VPMOVM2D: broadcast each opmask bit into a full dword of the destination,
// 4 mask bits per 128-bit lane.
void BX_CPP_AttrRegparmN(1) BX_CPU_C::VPMOVM2D_VdqKEwR(bxInstruction_c *i)
{
  Bit32u opmask = (Bit32u) BX_READ_16BIT_OPMASK(i->src());
  unsigned len = i->getVL();

  for (unsigned n=0; n<len; n++) {
    xmm_pmovm2d(&BX_READ_AVX_REG_LANE(i->dst(), n), opmask);
    opmask >>= 4; // consume 4 mask bits per lane
  }

  BX_CLEAR_AVX_REGZ(i->dst(), len);
  BX_NEXT_INSTR(i);
}
2135 
// VPMOVM2Q: broadcast each opmask bit into a full qword of the destination,
// 2 mask bits per 128-bit lane.
void BX_CPP_AttrRegparmN(1) BX_CPU_C::VPMOVM2Q_VdqKEbR(bxInstruction_c *i)
{
  Bit32u opmask = (Bit32u) BX_READ_8BIT_OPMASK(i->src());
  unsigned len = i->getVL();

  for (unsigned n=0; n<len; n++) {
    xmm_pmovm2q(&BX_READ_AVX_REG_LANE(i->dst(), n), opmask);
    opmask >>= 2; // consume 2 mask bits per lane
  }

  BX_CLEAR_AVX_REGZ(i->dst(), len);
  BX_NEXT_INSTR(i);
}
2149 
// VPMOVB2M: collect the sign bit of every source byte into the destination
// opmask register (16 bits per 128-bit lane).
void BX_CPP_AttrRegparmN(1) BX_CPU_C::VPMOVB2M_KGqWdqR(bxInstruction_c *i)
{
  BxPackedAvxRegister op = BX_READ_AVX_REG(i->src());
  unsigned len = i->getVL();
  Bit64u mask = 0;

  for (unsigned n=0; n<len; n++) {
    mask |= ((Bit64u) xmm_pmovmskb(&op.vmm128(n))) << (16*n);
  }

  BX_WRITE_OPMASK(i->dst(), mask);
  BX_NEXT_INSTR(i);
}
2163 
// VPMOVW2M: collect the sign bit of every source word into the destination
// opmask register (8 bits per 128-bit lane).
void BX_CPP_AttrRegparmN(1) BX_CPU_C::VPMOVW2M_KGdWdqR(bxInstruction_c *i)
{
  BxPackedAvxRegister op = BX_READ_AVX_REG(i->src());
  unsigned len = i->getVL();
  Bit32u mask = 0;

  for (unsigned n=0; n<len; n++) {
    mask |= xmm_pmovmskw(&op.vmm128(n)) << (8*n);
  }

  BX_WRITE_OPMASK(i->dst(), mask);
  BX_NEXT_INSTR(i);
}
2177 
// VPMOVD2M: collect the sign bit of every source dword into the destination
// opmask register (4 bits per 128-bit lane).
void BX_CPP_AttrRegparmN(1) BX_CPU_C::VPMOVD2M_KGwWdqR(bxInstruction_c *i)
{
  BxPackedAvxRegister op = BX_READ_AVX_REG(i->src());
  unsigned len = i->getVL();
  Bit32u mask = 0;

  for (unsigned n=0; n<len; n++) {
    mask |= xmm_pmovmskd(&op.vmm128(n)) << (4*n);
  }

  BX_WRITE_OPMASK(i->dst(), mask);
  BX_NEXT_INSTR(i);
}
2191 
// VPMOVQ2M: collect the sign bit of every source qword into the destination
// opmask register (2 bits per 128-bit lane).
void BX_CPP_AttrRegparmN(1) BX_CPU_C::VPMOVQ2M_KGbWdqR(bxInstruction_c *i)
{
  BxPackedAvxRegister op = BX_READ_AVX_REG(i->src());
  unsigned len = i->getVL();
  Bit32u mask = 0;

  for (unsigned n=0; n<len; n++) {
    mask |= xmm_pmovmskq(&op.vmm128(n)) << (2*n);
  }

  BX_WRITE_OPMASK(i->dst(), mask);
  BX_NEXT_INSTR(i);
}
2205 
2206 // sad (sum of absolute differences)
2207 
// VDBPSADBW: double-block packed sum of absolute differences. For each
// 128-bit lane, first shuffle the dwords of src2 with the immediate, then
// compute the SAD of src1 against the shuffled value; the word result is
// written through the opmask (all ones when no opmask register is given).
void BX_CPP_AttrRegparmN(1) BX_CPU_C::VDBPSADBW_MASK_VdqHdqWdqIbR(bxInstruction_c *i)
{
  BxPackedAvxRegister op1 = BX_READ_AVX_REG(i->src1()), op2 = BX_READ_AVX_REG(i->src2()), dst;

  Bit32u opmask = i->opmask() ? BX_READ_32BIT_OPMASK(i->opmask()) : (Bit32u) -1;
  unsigned len = i->getVL();

  for (unsigned n=0; n < len; n++) {
    BxPackedXmmRegister tmp;
    xmm_shufps(&tmp, &op2.vmm128(n), &op2.vmm128(n), i->Ib()); // dword select step
    xmm_dbpsadbw(&dst.vmm128(n), &op1.vmm128(n), &tmp);        // SAD step
  }

  avx512_write_regw_masked(i, &dst, len, opmask);

  BX_NEXT_INSTR(i);
}
2225 
2226 // multishift (VBMI)
2227 
pmultishiftqb_scalar(Bit64u val_64,Bit64u control)2228 BX_CPP_INLINE Bit64u pmultishiftqb_scalar(Bit64u val_64, Bit64u control)
2229 {
2230   // use packed register as 64-bit value with convinient accessors
2231   BxPackedRegister result;
2232 
2233   for (unsigned n=0; n < 8; n++, control >>= 8) {
2234     unsigned ctrl = (control & 0x3f);
2235     Bit64u tmp = val_64;
2236     if (ctrl != 0)
2237         tmp = (val_64 << (64 - ctrl)) | (val_64 >> ctrl);
2238     result.ubyte(n) = tmp & 0xff;
2239   }
2240 
2241   return MMXUQ(result);
2242 }
2243 
// VPMULTISHIFTQB (unmasked): per-qword multishift; src1 supplies the eight
// per-byte bit offsets, src2 supplies the data qword.
void BX_CPP_AttrRegparmN(1) BX_CPU_C::VPMULTISHIFTQB_VdqHdqWdqR(bxInstruction_c *i)
{
  BxPackedAvxRegister op1 = BX_READ_AVX_REG(i->src1()), op2 = BX_READ_AVX_REG(i->src2());
  unsigned len = i->getVL();

  for (unsigned n=0; n < QWORD_ELEMENTS(len); n++) {
    op1.vmm64u(n) = pmultishiftqb_scalar(op2.vmm64u(n), op1.vmm64u(n));
  }

  BX_WRITE_AVX_REGZ(i->dst(), op1, len);
  BX_NEXT_INSTR(i);
}
2256 
// VPMULTISHIFTQB (masked): per-qword multishift; masked-off qwords keep the
// old destination value (merge masking) or become zero (zero masking).
void BX_CPP_AttrRegparmN(1) BX_CPU_C::VPMULTISHIFTQB_MASK_VdqHdqWdqR(bxInstruction_c *i)
{
  BxPackedAvxRegister op1 = BX_READ_AVX_REG(i->src1()), op2 = BX_READ_AVX_REG(i->src2()), dst = BX_READ_AVX_REG(i->dst());
  Bit32u mask = BX_READ_8BIT_OPMASK(i->opmask());
  unsigned len = i->getVL();

  for (unsigned n=0, tmp_mask = mask; n < QWORD_ELEMENTS(len); n++, tmp_mask >>= 1) {
    if (tmp_mask & 0x1)
      dst.vmm64u(n) = pmultishiftqb_scalar(op2.vmm64u(n), op1.vmm64u(n));
    else if (i->isZeroMasking())
      dst.vmm64u(n) = 0;
  }

  BX_WRITE_AVX_REGZ(i->dst(), dst, len);
  BX_NEXT_INSTR(i);
}
2273 
2274 // 52-bit integer FMA
2275 
pmadd52luq_scalar(Bit64u dst,Bit64u op1,Bit64u op2)2276 BX_CPP_INLINE Bit64u pmadd52luq_scalar(Bit64u dst, Bit64u op1, Bit64u op2)
2277 {
2278   op1 &= BX_CONST64(0x000fffffffffffff);
2279   op2 &= BX_CONST64(0x000fffffffffffff);
2280 
2281   return dst + ((op1 * op2) & BX_CONST64(0x000fffffffffffff));
2282 }
2283 
pmadd52huq_scalar(Bit64u dst,Bit64u op1,Bit64u op2)2284 BX_CPP_INLINE Bit64u pmadd52huq_scalar(Bit64u dst, Bit64u op1, Bit64u op2)
2285 {
2286   op1 &= BX_CONST64(0x000fffffffffffff);
2287   op2 &= BX_CONST64(0x000fffffffffffff);
2288 
2289   Bit128u product_128;
2290   long_mul(&product_128, op1, op2);
2291 
2292   Bit64u temp = (product_128.lo >> 52) | ((product_128.hi & BX_CONST64(0x000000ffffffffff)) << 12);
2293 
2294   return dst + temp;
2295 }
2296 
VPMADD52LUQ_VdqHdqWdqR(bxInstruction_c * i)2297 void BX_CPP_AttrRegparmN(1) BX_CPU_C::VPMADD52LUQ_VdqHdqWdqR(bxInstruction_c *i)
2298 {
2299   BxPackedAvxRegister op1 = BX_READ_AVX_REG(i->src1()), op2 = BX_READ_AVX_REG(i->src2()), dst = BX_READ_AVX_REG(i->dst());
2300   unsigned len = i->getVL();
2301 
2302   for (unsigned n=0; n < QWORD_ELEMENTS(len); n++) {
2303     dst.vmm64u(n) = pmadd52luq_scalar(dst.vmm64u(n), op1.vmm64u(n), op2.vmm64u(n));
2304   }
2305 
2306   BX_WRITE_AVX_REGZ(i->dst(), dst, len);
2307   BX_NEXT_INSTR(i);
2308 }
2309 
VPMADD52LUQ_MASK_VdqHdqWdqR(bxInstruction_c * i)2310 void BX_CPP_AttrRegparmN(1) BX_CPU_C::VPMADD52LUQ_MASK_VdqHdqWdqR(bxInstruction_c *i)
2311 {
2312   BxPackedAvxRegister op1 = BX_READ_AVX_REG(i->src1()), op2 = BX_READ_AVX_REG(i->src2()), dst = BX_READ_AVX_REG(i->dst());
2313   Bit32u mask = BX_READ_8BIT_OPMASK(i->opmask());
2314   unsigned len = i->getVL();
2315 
2316   for (unsigned n=0, tmp_mask = mask; n < QWORD_ELEMENTS(len); n++, tmp_mask >>= 1) {
2317     if (tmp_mask & 0x1)
2318       dst.vmm64u(n) = pmadd52luq_scalar(dst.vmm64u(n), op1.vmm64u(n), op2.vmm64u(n));
2319     else if (i->isZeroMasking())
2320       dst.vmm64u(n) = 0;
2321   }
2322 
2323   BX_WRITE_AVX_REGZ(i->dst(), dst, len);
2324   BX_NEXT_INSTR(i);
2325 }
2326 
VPMADD52HUQ_VdqHdqWdqR(bxInstruction_c * i)2327 void BX_CPP_AttrRegparmN(1) BX_CPU_C::VPMADD52HUQ_VdqHdqWdqR(bxInstruction_c *i)
2328 {
2329   BxPackedAvxRegister op1 = BX_READ_AVX_REG(i->src1()), op2 = BX_READ_AVX_REG(i->src2()), dst = BX_READ_AVX_REG(i->dst());
2330   unsigned len = i->getVL();
2331 
2332   for (unsigned n=0; n < QWORD_ELEMENTS(len); n++) {
2333     dst.vmm64u(n) = pmadd52huq_scalar(dst.vmm64u(n), op1.vmm64u(n), op2.vmm64u(n));
2334   }
2335 
2336   BX_WRITE_AVX_REGZ(i->dst(), dst, len);
2337   BX_NEXT_INSTR(i);
2338 }
2339 
VPMADD52HUQ_MASK_VdqHdqWdqR(bxInstruction_c * i)2340 void BX_CPP_AttrRegparmN(1) BX_CPU_C::VPMADD52HUQ_MASK_VdqHdqWdqR(bxInstruction_c *i)
2341 {
2342   BxPackedAvxRegister op1 = BX_READ_AVX_REG(i->src1()), op2 = BX_READ_AVX_REG(i->src2()), dst = BX_READ_AVX_REG(i->dst());
2343   Bit32u mask = BX_READ_8BIT_OPMASK(i->opmask());
2344   unsigned len = i->getVL();
2345 
2346   for (unsigned n=0, tmp_mask = mask; n < QWORD_ELEMENTS(len); n++, tmp_mask >>= 1) {
2347     if (tmp_mask & 0x1)
2348       dst.vmm64u(n) = pmadd52huq_scalar(dst.vmm64u(n), op1.vmm64u(n), op2.vmm64u(n));
2349     else if (i->isZeroMasking())
2350       dst.vmm64u(n) = 0;
2351   }
2352 
2353   BX_WRITE_AVX_REGZ(i->dst(), dst, len);
2354   BX_NEXT_INSTR(i);
2355 }
2356 
VP2INTERSECTD_KGqHdqWdqR(bxInstruction_c * i)2357 void BX_CPP_AttrRegparmN(1) BX_CPU_C::VP2INTERSECTD_KGqHdqWdqR(bxInstruction_c *i)
2358 {
2359   BxPackedAvxRegister op1 = BX_READ_AVX_REG(i->src1()), op2 = BX_READ_AVX_REG(i->src2());
2360   Bit64u mask1 = 0, mask2 = 0;
2361   unsigned len = i->getVL();
2362 
2363   for (unsigned n=0;n < DWORD_ELEMENTS(len); n++) {
2364     for (unsigned m=0;m < DWORD_ELEMENTS(len); m++) {
2365       if (op1.vmm32u(n) == op2.vmm32u(m)) {
2366         mask1 |= 1<<n;
2367         mask2 |= 1<<m;
2368       }
2369     }
2370   }
2371 
2372   unsigned mask_base = i->dst() & ~1;
2373   BX_WRITE_OPMASK(mask_base,   mask1);
2374   BX_WRITE_OPMASK(mask_base+1, mask2);
2375 
2376   BX_NEXT_INSTR(i);
2377 }
2378 
VP2INTERSECTQ_KGqHdqWdqR(bxInstruction_c * i)2379 void BX_CPP_AttrRegparmN(1) BX_CPU_C::VP2INTERSECTQ_KGqHdqWdqR(bxInstruction_c *i)
2380 {
2381   BxPackedAvxRegister op1 = BX_READ_AVX_REG(i->src1()), op2 = BX_READ_AVX_REG(i->src2());
2382   Bit64u mask1 = 0, mask2 = 0;
2383   unsigned len = i->getVL();
2384 
2385   for (unsigned n=0;n < QWORD_ELEMENTS(len); n++) {
2386     for (unsigned m=0;m < QWORD_ELEMENTS(len); m++) {
2387       if (op1.vmm64u(n) == op2.vmm64u(m)) {
2388         mask1 |= 1<<n;
2389         mask2 |= 1<<m;
2390       }
2391     }
2392   }
2393 
2394   unsigned mask_base = i->dst() & ~1;
2395   BX_WRITE_OPMASK(mask_base,   mask1);
2396   BX_WRITE_OPMASK(mask_base+1, mask2);
2397 
2398   BX_NEXT_INSTR(i);
2399 }
2400 
2401 #endif
2402