1 /////////////////////////////////////////////////////////////////////////
2 // $Id: xop.cc 13520 2018-05-27 19:09:59Z sshwarts $
3 /////////////////////////////////////////////////////////////////////////
4 //
5 //   Copyright (c) 2011-2018 Stanislav Shwartsman
6 //          Written by Stanislav Shwartsman [sshwarts at sourceforge net]
7 //
8 //  This library is free software; you can redistribute it and/or
9 //  modify it under the terms of the GNU Lesser General Public
10 //  License as published by the Free Software Foundation; either
11 //  version 2 of the License, or (at your option) any later version.
12 //
13 //  This library is distributed in the hope that it will be useful,
14 //  but WITHOUT ANY WARRANTY; without even the implied warranty of
15 //  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
16 //  Lesser General Public License for more details.
17 //
18 //  You should have received a copy of the GNU Lesser General Public
19 //  License along with this library; if not, write to the Free Software
//  Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301 USA
21 //
22 /////////////////////////////////////////////////////////////////////////
23 
24 #define NEED_CPU_REG_SHORTCUTS 1
25 #include "bochs.h"
26 #include "cpu.h"
27 #define LOG_THIS BX_CPU_THIS_PTR
28 
29 #if BX_SUPPORT_AVX
30 
31 extern float_status_t mxcsr_to_softfloat_status_word(bx_mxcsr_t mxcsr);
32 
33 #include "simd_int.h"
34 #include "simd_compare.h"
35 
36 typedef void (*simd_compare_method)(BxPackedXmmRegister *op1, const BxPackedXmmRegister *op2);
37 
38 // comparison predicate for PCOMB
39 static simd_compare_method xop_compare8[8] = {
40   xmm_pcmpltb,
41   xmm_pcmpleb,
42   xmm_pcmpgtb,
43   xmm_pcmpgeb,
44   xmm_pcmpeqb,
45   xmm_pcmpneb,
46   xmm_pcmpfalse,
47   xmm_pcmptrue
48 };
49 
50 // comparison predicate for PCOMUB
51 static simd_compare_method xop_compare8u[8] = {
52   xmm_pcmpltub,
53   xmm_pcmpleub,
54   xmm_pcmpgtub,
55   xmm_pcmpgeub,
56   xmm_pcmpeqb,
57   xmm_pcmpneb,
58   xmm_pcmpfalse,
59   xmm_pcmptrue
60 };
61 
62 // comparison predicate for PCOMW
63 static simd_compare_method xop_compare16[8] = {
64   xmm_pcmpltw,
65   xmm_pcmplew,
66   xmm_pcmpgtw,
67   xmm_pcmpgew,
68   xmm_pcmpeqw,
69   xmm_pcmpnew,
70   xmm_pcmpfalse,
71   xmm_pcmptrue
72 };
73 
74 // comparison predicate for PCOMUW
75 static simd_compare_method xop_compare16u[8] = {
76   xmm_pcmpltuw,
77   xmm_pcmpleuw,
78   xmm_pcmpgtuw,
79   xmm_pcmpgeuw,
80   xmm_pcmpeqw,
81   xmm_pcmpnew,
82   xmm_pcmpfalse,
83   xmm_pcmptrue
84 };
85 
86 // comparison predicate for PCOMD
87 static simd_compare_method xop_compare32[8] = {
88   xmm_pcmpltd,
89   xmm_pcmpled,
90   xmm_pcmpgtd,
91   xmm_pcmpged,
92   xmm_pcmpeqd,
93   xmm_pcmpned,
94   xmm_pcmpfalse,
95   xmm_pcmptrue
96 };
97 
98 // comparison predicate for PCOMUD
99 static simd_compare_method xop_compare32u[8] = {
100   xmm_pcmpltud,
101   xmm_pcmpleud,
102   xmm_pcmpgtud,
103   xmm_pcmpgeud,
104   xmm_pcmpeqd,
105   xmm_pcmpned,
106   xmm_pcmpfalse,
107   xmm_pcmptrue
108 };
109 
110 // comparison predicate for PCOMQ
111 static simd_compare_method xop_compare64[8] = {
112   xmm_pcmpltq,
113   xmm_pcmpleq,
114   xmm_pcmpgtq,
115   xmm_pcmpgeq,
116   xmm_pcmpeqq,
117   xmm_pcmpneq,
118   xmm_pcmpfalse,
119   xmm_pcmptrue
120 };
121 
122 // comparison predicate for PCOMUQ
123 static simd_compare_method xop_compare64u[8] = {
124   xmm_pcmpltuq,
125   xmm_pcmpleuq,
126   xmm_pcmpgtuq,
127   xmm_pcmpgeuq,
128   xmm_pcmpeqq,
129   xmm_pcmpneq,
130   xmm_pcmpfalse,
131   xmm_pcmptrue
132 };
133 
134 typedef Bit8u (*vpperm_operation)(Bit8u byte);
135 
vpperm_bit_reverse(Bit8u v8)136 BX_CPP_INLINE Bit8u vpperm_bit_reverse(Bit8u v8)
137 {
138   return  (v8 >> 7) |
139          ((v8 >> 5) & 0x02) |
140          ((v8 >> 3) & 0x04) |
141          ((v8 >> 1) & 0x08) |
142          ((v8 << 1) & 0x10) |
143          ((v8 << 3) & 0x20) |
144          ((v8 << 5) & 0x40) |
145           (v8 << 7);
146 }
147 
vpperm_noop(Bit8u v8)148 BX_CPP_INLINE Bit8u vpperm_noop(Bit8u v8) { return v8; }
vpperm_invert(Bit8u v8)149 BX_CPP_INLINE Bit8u vpperm_invert(Bit8u v8) { return ~v8; }
vpperm_invert_bit_reverse(Bit8u v8)150 BX_CPP_INLINE Bit8u vpperm_invert_bit_reverse(Bit8u v8) { return vpperm_bit_reverse(~v8); }
vpperm_zeros(Bit8u v8)151 BX_CPP_INLINE Bit8u vpperm_zeros(Bit8u v8) { return 0; }
vpperm_ones(Bit8u v8)152 BX_CPP_INLINE Bit8u vpperm_ones(Bit8u v8) { return 0xff; }
vpperm_replicate_msb(Bit8u v8)153 BX_CPP_INLINE Bit8u vpperm_replicate_msb(Bit8u v8) { return (((Bit8s) v8) >> 7); }
vpperm_invert_replicate_msb(Bit8u v8)154 BX_CPP_INLINE Bit8u vpperm_invert_replicate_msb(Bit8u v8) { return vpperm_replicate_msb(~v8); }
155 
156 // logical operation for VPPERM
157 static vpperm_operation vpperm_op[8] = {
158   vpperm_noop,
159   vpperm_invert,
160   vpperm_bit_reverse,
161   vpperm_invert_bit_reverse,
162   vpperm_zeros,
163   vpperm_ones,
164   vpperm_replicate_msb,
165   vpperm_invert_replicate_msb
166 };
167 
VPCMOV_VdqHdqWdqVIb(bxInstruction_c * i)168 void BX_CPP_AttrRegparmN(1) BX_CPU_C::VPCMOV_VdqHdqWdqVIb(bxInstruction_c *i)
169 {
170   BxPackedYmmRegister op1 = BX_READ_YMM_REG(i->src1());
171   BxPackedYmmRegister op2 = BX_READ_YMM_REG(i->src2());
172   BxPackedYmmRegister op3 = BX_READ_YMM_REG(i->src3());
173   unsigned len = i->getVL();
174 
175   for (unsigned n=0; n < len; n++) {
176     xmm_pselect(&op1.ymm128(n), &op2.ymm128(n), &op3.ymm128(n));
177   }
178 
179   BX_WRITE_YMM_REGZ_VLEN(i->dst(), op1, len);
180 
181   BX_NEXT_INSTR(i);
182 }
183 
VPPERM_VdqHdqWdqVIb(bxInstruction_c * i)184 void BX_CPP_AttrRegparmN(1) BX_CPU_C::VPPERM_VdqHdqWdqVIb(bxInstruction_c *i)
185 {
186   BxPackedXmmRegister op1 = BX_READ_XMM_REG(i->src1());
187   BxPackedXmmRegister op2 = BX_READ_XMM_REG(i->src2());
188   BxPackedXmmRegister op3 = BX_READ_XMM_REG(i->src3()), dst;
189 
190   for (unsigned n=0;n<16;n++) {
191     unsigned control = op3.xmmubyte(n);
192 
193     if (control & 0x10)
194       dst.xmmubyte(n) = op1.xmmubyte(control & 0xf);
195     else
196       dst.xmmubyte(n) = op2.xmmubyte(control & 0xf);
197 
198     dst.xmmubyte(n) = vpperm_op[control >> 5](dst.xmmubyte(n));
199   }
200 
201   BX_WRITE_XMM_REG_CLEAR_HIGH(i->dst(), dst);
202 
203   BX_NEXT_INSTR(i);
204 }
205 
206 #define XOP_SHIFT_ROTATE(HANDLER, func)                                                     \
207   void BX_CPP_AttrRegparmN(1) BX_CPU_C:: HANDLER (bxInstruction_c *i)               \
208   {                                                                                         \
209     BxPackedXmmRegister op1 = BX_READ_XMM_REG(i->src1()), op2 = BX_READ_XMM_REG(i->src2()); \
210                                                                                             \
211     (func)(&op1, &op2);                                                                     \
212                                                                                             \
213     BX_WRITE_XMM_REG_CLEAR_HIGH(i->dst(), op1);                                             \
214                                                                                             \
215     BX_NEXT_INSTR(i);                                                                       \
216   }
217 
218 XOP_SHIFT_ROTATE(VPSHAB_VdqWdqHdq, xmm_pshab);
219 XOP_SHIFT_ROTATE(VPSHAW_VdqWdqHdq, xmm_pshaw);
220 XOP_SHIFT_ROTATE(VPSHAD_VdqWdqHdq, xmm_pshad);
221 XOP_SHIFT_ROTATE(VPSHAQ_VdqWdqHdq, xmm_pshaq);
222 
223 XOP_SHIFT_ROTATE(VPSHLB_VdqWdqHdq, xmm_pshlb);
224 XOP_SHIFT_ROTATE(VPSHLW_VdqWdqHdq, xmm_pshlw);
225 XOP_SHIFT_ROTATE(VPSHLD_VdqWdqHdq, xmm_pshld);
226 XOP_SHIFT_ROTATE(VPSHLQ_VdqWdqHdq, xmm_pshlq);
227 
228 XOP_SHIFT_ROTATE(VPROTB_VdqWdqHdq, xmm_protb);
229 XOP_SHIFT_ROTATE(VPROTW_VdqWdqHdq, xmm_protw);
230 XOP_SHIFT_ROTATE(VPROTD_VdqWdqHdq, xmm_protd);
231 XOP_SHIFT_ROTATE(VPROTQ_VdqWdqHdq, xmm_protq);
232 
VPMACSSWW_VdqHdqWdqVIbR(bxInstruction_c * i)233 void BX_CPP_AttrRegparmN(1) BX_CPU_C::VPMACSSWW_VdqHdqWdqVIbR(bxInstruction_c *i)
234 {
235   BxPackedXmmRegister op1 = BX_READ_XMM_REG(i->src1());
236   BxPackedXmmRegister op2 = BX_READ_XMM_REG(i->src2());
237   BxPackedXmmRegister op3 = BX_READ_XMM_REG(i->src3());
238 
239   for(unsigned n=0;n<8;n++) {
240     op1.xmm16s(n) = SaturateDwordSToWordS(((Bit32s) op1.xmm16s(n) * (Bit32s) op2.xmm16s(n)) + (Bit32s) op3.xmm16s(n));
241   }
242 
243   BX_WRITE_XMM_REG_CLEAR_HIGH(i->dst(), op1);
244 
245   BX_NEXT_INSTR(i);
246 }
247 
VPMACSSWD_VdqHdqWdqVIbR(bxInstruction_c * i)248 void BX_CPP_AttrRegparmN(1) BX_CPU_C::VPMACSSWD_VdqHdqWdqVIbR(bxInstruction_c *i)
249 {
250   BxPackedXmmRegister op1 = BX_READ_XMM_REG(i->src1());
251   BxPackedXmmRegister op2 = BX_READ_XMM_REG(i->src2());
252   BxPackedXmmRegister op3 = BX_READ_XMM_REG(i->src3());
253 
254   op1.xmm32s(0) = SaturateQwordSToDwordS(((Bit32s) op1.xmm16s(1) * (Bit32s) op2.xmm16s(1)) + (Bit64s) op3.xmm32s(0));
255   op1.xmm32s(1) = SaturateQwordSToDwordS(((Bit32s) op1.xmm16s(3) * (Bit32s) op2.xmm16s(3)) + (Bit64s) op3.xmm32s(1));
256   op1.xmm32s(2) = SaturateQwordSToDwordS(((Bit32s) op1.xmm16s(5) * (Bit32s) op2.xmm16s(5)) + (Bit64s) op3.xmm32s(2));
257   op1.xmm32s(3) = SaturateQwordSToDwordS(((Bit32s) op1.xmm16s(7) * (Bit32s) op2.xmm16s(7)) + (Bit64s) op3.xmm32s(3));
258 
259   BX_WRITE_XMM_REG_CLEAR_HIGH(i->dst(), op1);
260 
261   BX_NEXT_INSTR(i);
262 }
263 
add_saturate64(Bit64s a,Bit64s b)264 BX_CPP_INLINE Bit64s add_saturate64(Bit64s a, Bit64s b)
265 {
266   Bit64s r = a + b;
267   Bit64u overflow = GET_ADD_OVERFLOW(a, b, r, BX_CONST64(0x8000000000000000));
268   if (! overflow) return r;
269   // signed overflow detected, saturate
270   if (a > 0) overflow--;
271   return (Bit64s) overflow;
272 }
273 
VPMACSSDQL_VdqHdqWdqVIbR(bxInstruction_c * i)274 void BX_CPP_AttrRegparmN(1) BX_CPU_C::VPMACSSDQL_VdqHdqWdqVIbR(bxInstruction_c *i)
275 {
276   BxPackedXmmRegister op1 = BX_READ_XMM_REG(i->src1());
277   BxPackedXmmRegister op2 = BX_READ_XMM_REG(i->src2());
278   BxPackedXmmRegister op3 = BX_READ_XMM_REG(i->src3());
279 
280   Bit64s product1 = (Bit64s) op1.xmm32s(0) * (Bit64s) op2.xmm32s(0);
281   Bit64s product2 = (Bit64s) op1.xmm32s(2) * (Bit64s) op2.xmm32s(2);
282 
283   op1.xmm64s(0) = add_saturate64(product1, op3.xmm64s(0));
284   op1.xmm64s(1) = add_saturate64(product2, op3.xmm64s(1));
285 
286   BX_WRITE_XMM_REG_CLEAR_HIGH(i->dst(), op1);
287 
288   BX_NEXT_INSTR(i);
289 }
290 
VPMACSSDD_VdqHdqWdqVIbR(bxInstruction_c * i)291 void BX_CPP_AttrRegparmN(1) BX_CPU_C::VPMACSSDD_VdqHdqWdqVIbR(bxInstruction_c *i)
292 {
293   BxPackedXmmRegister op1 = BX_READ_XMM_REG(i->src1());
294   BxPackedXmmRegister op2 = BX_READ_XMM_REG(i->src2());
295   BxPackedXmmRegister op3 = BX_READ_XMM_REG(i->src3());
296 
297   for(unsigned n=0;n<4;n++) {
298     op1.xmm32s(n) = SaturateQwordSToDwordS(((Bit64s) op1.xmm32s(n) * (Bit64s) op2.xmm32s(n)) + (Bit64s) op3.xmm32s(n));
299   }
300 
301   BX_WRITE_XMM_REG_CLEAR_HIGH(i->dst(), op1);
302 
303   BX_NEXT_INSTR(i);
304 }
305 
VPMACSSDQH_VdqHdqWdqVIbR(bxInstruction_c * i)306 void BX_CPP_AttrRegparmN(1) BX_CPU_C::VPMACSSDQH_VdqHdqWdqVIbR(bxInstruction_c *i)
307 {
308   BxPackedXmmRegister op1 = BX_READ_XMM_REG(i->src1());
309   BxPackedXmmRegister op2 = BX_READ_XMM_REG(i->src2());
310   BxPackedXmmRegister op3 = BX_READ_XMM_REG(i->src3());
311 
312   Bit64s product1 = (Bit64s) op1.xmm32s(1) * (Bit64s) op2.xmm32s(1);
313   Bit64s product2 = (Bit64s) op1.xmm32s(3) * (Bit64s) op2.xmm32s(3);
314 
315   op1.xmm64s(0) = add_saturate64(product1, op3.xmm64s(0));
316   op1.xmm64s(1) = add_saturate64(product2, op3.xmm64s(1));
317 
318   BX_WRITE_XMM_REG_CLEAR_HIGH(i->dst(), op1);
319 
320   BX_NEXT_INSTR(i);
321 }
322 
VPMACSWW_VdqHdqWdqVIbR(bxInstruction_c * i)323 void BX_CPP_AttrRegparmN(1) BX_CPU_C::VPMACSWW_VdqHdqWdqVIbR(bxInstruction_c *i)
324 {
325   BxPackedXmmRegister op1 = BX_READ_XMM_REG(i->src1());
326   BxPackedXmmRegister op2 = BX_READ_XMM_REG(i->src2());
327   BxPackedXmmRegister op3 = BX_READ_XMM_REG(i->src3());
328 
329   for(unsigned n=0;n<8;n++) {
330     op1.xmm16s(n) = ((Bit32s) op1.xmm16s(n) * (Bit32s) op2.xmm16s(n)) + (Bit32s) op3.xmm16s(n);
331   }
332 
333   BX_WRITE_XMM_REG_CLEAR_HIGH(i->dst(), op1);
334 
335   BX_NEXT_INSTR(i);
336 }
337 
VPMACSWD_VdqHdqWdqVIbR(bxInstruction_c * i)338 void BX_CPP_AttrRegparmN(1) BX_CPU_C::VPMACSWD_VdqHdqWdqVIbR(bxInstruction_c *i)
339 {
340   BxPackedXmmRegister op1 = BX_READ_XMM_REG(i->src1());
341   BxPackedXmmRegister op2 = BX_READ_XMM_REG(i->src2());
342   BxPackedXmmRegister op3 = BX_READ_XMM_REG(i->src3());
343 
344   op1.xmm32s(0) = ((Bit32s) op1.xmm16s(1) * (Bit32s) op2.xmm16s(1)) + (Bit64s) op3.xmm32s(0);
345   op1.xmm32s(1) = ((Bit32s) op1.xmm16s(3) * (Bit32s) op2.xmm16s(3)) + (Bit64s) op3.xmm32s(1);
346   op1.xmm32s(2) = ((Bit32s) op1.xmm16s(5) * (Bit32s) op2.xmm16s(5)) + (Bit64s) op3.xmm32s(2);
347   op1.xmm32s(3) = ((Bit32s) op1.xmm16s(7) * (Bit32s) op2.xmm16s(7)) + (Bit64s) op3.xmm32s(3);
348 
349   BX_WRITE_XMM_REG_CLEAR_HIGH(i->dst(), op1);
350 
351   BX_NEXT_INSTR(i);
352 }
353 
VPMACSDQL_VdqHdqWdqVIbR(bxInstruction_c * i)354 void BX_CPP_AttrRegparmN(1) BX_CPU_C::VPMACSDQL_VdqHdqWdqVIbR(bxInstruction_c *i)
355 {
356   BxPackedXmmRegister op1 = BX_READ_XMM_REG(i->src1());
357   BxPackedXmmRegister op2 = BX_READ_XMM_REG(i->src2());
358   BxPackedXmmRegister op3 = BX_READ_XMM_REG(i->src3());
359 
360   Bit64s product1 = (Bit64s) op1.xmm32s(0) * (Bit64s) op2.xmm32s(0);
361   Bit64s product2 = (Bit64s) op1.xmm32s(2) * (Bit64s) op2.xmm32s(2);
362 
363   op1.xmm64s(0) = product1 + op3.xmm64s(0);
364   op1.xmm64s(1) = product2 + op3.xmm64s(1);
365 
366   BX_WRITE_XMM_REG_CLEAR_HIGH(i->dst(), op1);
367 
368   BX_NEXT_INSTR(i);
369 }
370 
VPMACSDD_VdqHdqWdqVIbR(bxInstruction_c * i)371 void BX_CPP_AttrRegparmN(1) BX_CPU_C::VPMACSDD_VdqHdqWdqVIbR(bxInstruction_c *i)
372 {
373   BxPackedXmmRegister op1 = BX_READ_XMM_REG(i->src1());
374   BxPackedXmmRegister op2 = BX_READ_XMM_REG(i->src2());
375   BxPackedXmmRegister op3 = BX_READ_XMM_REG(i->src3());
376 
377   for(unsigned n=0;n<4;n++) {
378     op1.xmm32s(n) = ((Bit64s) op1.xmm32s(n) * (Bit64s) op2.xmm32s(n)) + (Bit64s) op3.xmm32s(n);
379   }
380 
381   BX_WRITE_XMM_REG_CLEAR_HIGH(i->dst(), op1);
382 
383   BX_NEXT_INSTR(i);
384 }
385 
VPMACSDQH_VdqHdqWdqVIbR(bxInstruction_c * i)386 void BX_CPP_AttrRegparmN(1) BX_CPU_C::VPMACSDQH_VdqHdqWdqVIbR(bxInstruction_c *i)
387 {
388   BxPackedXmmRegister op1 = BX_READ_XMM_REG(i->src1());
389   BxPackedXmmRegister op2 = BX_READ_XMM_REG(i->src2());
390   BxPackedXmmRegister op3 = BX_READ_XMM_REG(i->src3());
391 
392   Bit64s product1 = (Bit64s) op1.xmm32s(1) * (Bit64s) op2.xmm32s(1);
393   Bit64s product2 = (Bit64s) op1.xmm32s(3) * (Bit64s) op2.xmm32s(3);
394 
395   op1.xmm64s(0) = product1 + op3.xmm64s(0);
396   op1.xmm64s(1) = product2 + op3.xmm64s(1);
397 
398   BX_WRITE_XMM_REG_CLEAR_HIGH(i->dst(), op1);
399 
400   BX_NEXT_INSTR(i);
401 }
402 
VPMADCSSWD_VdqHdqWdqVIbR(bxInstruction_c * i)403 void BX_CPP_AttrRegparmN(1) BX_CPU_C::VPMADCSSWD_VdqHdqWdqVIbR(bxInstruction_c *i)
404 {
405   BxPackedXmmRegister op1 = BX_READ_XMM_REG(i->src1());
406   BxPackedXmmRegister op2 = BX_READ_XMM_REG(i->src2());
407   BxPackedXmmRegister op3 = BX_READ_XMM_REG(i->src3());
408 
409   Bit32s product[8];
410 
411   for(unsigned n=0;n < 8;n++)
412     product[n] = (Bit32s) op1.xmm16s(n) * (Bit32s) op2.xmm16s(n);
413 
414   op1.xmm32s(0) = SaturateQwordSToDwordS((Bit64s) product[0] + (Bit64s) product[1] + (Bit64s) op3.xmm32s(0));
415   op1.xmm32s(1) = SaturateQwordSToDwordS((Bit64s) product[2] + (Bit64s) product[3] + (Bit64s) op3.xmm32s(1));
416   op1.xmm32s(2) = SaturateQwordSToDwordS((Bit64s) product[4] + (Bit64s) product[5] + (Bit64s) op3.xmm32s(2));
417   op1.xmm32s(3) = SaturateQwordSToDwordS((Bit64s) product[6] + (Bit64s) product[7] + (Bit64s) op3.xmm32s(3));
418 
419   BX_WRITE_XMM_REG_CLEAR_HIGH(i->dst(), op1);
420 
421   BX_NEXT_INSTR(i);
422 }
423 
VPMADCSWD_VdqHdqWdqVIbR(bxInstruction_c * i)424 void BX_CPP_AttrRegparmN(1) BX_CPU_C::VPMADCSWD_VdqHdqWdqVIbR(bxInstruction_c *i)
425 {
426   BxPackedXmmRegister op1 = BX_READ_XMM_REG(i->src1());
427   BxPackedXmmRegister op2 = BX_READ_XMM_REG(i->src2());
428   BxPackedXmmRegister op3 = BX_READ_XMM_REG(i->src3());
429 
430   Bit32s product[8];
431 
432   for(unsigned n=0;n < 8;n++)
433     product[n] = (Bit32s) op1.xmm16s(n) * (Bit32s) op2.xmm16s(n);
434 
435   op1.xmm32s(0) = product[0] + product[1] + op3.xmm32s(0);
436   op1.xmm32s(1) = product[2] + product[3] + op3.xmm32s(1);
437   op1.xmm32s(2) = product[4] + product[5] + op3.xmm32s(2);
438   op1.xmm32s(3) = product[6] + product[7] + op3.xmm32s(3);
439 
440   BX_WRITE_XMM_REG_CLEAR_HIGH(i->dst(), op1);
441 
442   BX_NEXT_INSTR(i);
443 }
444 
VPROTB_VdqWdqIbR(bxInstruction_c * i)445 void BX_CPP_AttrRegparmN(1) BX_CPU_C::VPROTB_VdqWdqIbR(bxInstruction_c *i)
446 {
447   BxPackedXmmRegister op = BX_READ_XMM_REG(i->src());
448   int count = i->Ib();
449 
450   if (count > 0) {
451     // rotate left
452     xmm_prolb(&op,  count);
453   }
454   else if (count < 0) {
455     // rotate right
456     xmm_prorb(&op, -count);
457   }
458 
459   BX_WRITE_XMM_REG_CLEAR_HIGH(i->dst(), op);
460 
461   BX_NEXT_INSTR(i);
462 }
463 
VPROTW_VdqWdqIbR(bxInstruction_c * i)464 void BX_CPP_AttrRegparmN(1) BX_CPU_C::VPROTW_VdqWdqIbR(bxInstruction_c *i)
465 {
466   BxPackedXmmRegister op = BX_READ_XMM_REG(i->src());
467   int count = i->Ib();
468 
469   if (count > 0) {
470     // rotate left
471     xmm_prolw(&op,  count);
472   }
473   else if (count < 0) {
474     // rotate right
475     xmm_prorw(&op, -count);
476   }
477 
478   BX_WRITE_XMM_REG_CLEAR_HIGH(i->dst(), op);
479 
480   BX_NEXT_INSTR(i);
481 }
482 
VPROTD_VdqWdqIbR(bxInstruction_c * i)483 void BX_CPP_AttrRegparmN(1) BX_CPU_C::VPROTD_VdqWdqIbR(bxInstruction_c *i)
484 {
485   BxPackedXmmRegister op = BX_READ_XMM_REG(i->src());
486   int count = i->Ib();
487 
488   if (count > 0) {
489     // rotate left
490     xmm_prold(&op,  count);
491   }
492   else if (count < 0) {
493     // rotate right
494     xmm_prord(&op, -count);
495   }
496 
497   BX_WRITE_XMM_REG_CLEAR_HIGH(i->dst(), op);
498 
499   BX_NEXT_INSTR(i);
500 }
501 
VPROTQ_VdqWdqIbR(bxInstruction_c * i)502 void BX_CPP_AttrRegparmN(1) BX_CPU_C::VPROTQ_VdqWdqIbR(bxInstruction_c *i)
503 {
504   BxPackedXmmRegister op = BX_READ_XMM_REG(i->src());
505   int count = i->Ib();
506 
507   if (count > 0) {
508     // rotate left
509     xmm_prolq(&op,  count);
510   }
511   else if (count < 0) {
512     // rotate right
513     xmm_prorq(&op, -count);
514   }
515 
516   BX_WRITE_XMM_REG_CLEAR_HIGH(i->dst(), op);
517 
518   BX_NEXT_INSTR(i);
519 }
520 
VPCOMB_VdqHdqWdqIbR(bxInstruction_c * i)521 void BX_CPP_AttrRegparmN(1) BX_CPU_C::VPCOMB_VdqHdqWdqIbR(bxInstruction_c *i)
522 {
523   BxPackedXmmRegister op1 = BX_READ_XMM_REG(i->src1()), op2 = BX_READ_XMM_REG(i->src2());
524 
525   xop_compare8[i->Ib() & 7](&op1, &op2);
526 
527   BX_WRITE_XMM_REG_CLEAR_HIGH(i->dst(), op1);
528 
529   BX_NEXT_INSTR(i);
530 }
531 
VPCOMW_VdqHdqWdqIbR(bxInstruction_c * i)532 void BX_CPP_AttrRegparmN(1) BX_CPU_C::VPCOMW_VdqHdqWdqIbR(bxInstruction_c *i)
533 {
534   BxPackedXmmRegister op1 = BX_READ_XMM_REG(i->src1()), op2 = BX_READ_XMM_REG(i->src2());
535 
536   xop_compare16[i->Ib() & 7](&op1, &op2);
537 
538   BX_WRITE_XMM_REG_CLEAR_HIGH(i->dst(), op1);
539 
540   BX_NEXT_INSTR(i);
541 }
542 
VPCOMD_VdqHdqWdqIbR(bxInstruction_c * i)543 void BX_CPP_AttrRegparmN(1) BX_CPU_C::VPCOMD_VdqHdqWdqIbR(bxInstruction_c *i)
544 {
545   BxPackedXmmRegister op1 = BX_READ_XMM_REG(i->src1()), op2 = BX_READ_XMM_REG(i->src2());
546 
547   xop_compare32[i->Ib() & 7](&op1, &op2);
548 
549   BX_WRITE_XMM_REG_CLEAR_HIGH(i->dst(), op1);
550 
551   BX_NEXT_INSTR(i);
552 }
553 
VPCOMQ_VdqHdqWdqIbR(bxInstruction_c * i)554 void BX_CPP_AttrRegparmN(1) BX_CPU_C::VPCOMQ_VdqHdqWdqIbR(bxInstruction_c *i)
555 {
556   BxPackedXmmRegister op1 = BX_READ_XMM_REG(i->src1()), op2 = BX_READ_XMM_REG(i->src2());
557 
558   xop_compare64[i->Ib() & 7](&op1, &op2);
559 
560   BX_WRITE_XMM_REG_CLEAR_HIGH(i->dst(), op1);
561 
562   BX_NEXT_INSTR(i);
563 }
564 
VPCOMUB_VdqHdqWdqIbR(bxInstruction_c * i)565 void BX_CPP_AttrRegparmN(1) BX_CPU_C::VPCOMUB_VdqHdqWdqIbR(bxInstruction_c *i)
566 {
567   BxPackedXmmRegister op1 = BX_READ_XMM_REG(i->src1()), op2 = BX_READ_XMM_REG(i->src2());
568 
569   xop_compare8u[i->Ib() & 7](&op1, &op2);
570 
571   BX_WRITE_XMM_REG_CLEAR_HIGH(i->dst(), op1);
572 
573   BX_NEXT_INSTR(i);
574 }
575 
VPCOMUW_VdqHdqWdqIbR(bxInstruction_c * i)576 void BX_CPP_AttrRegparmN(1) BX_CPU_C::VPCOMUW_VdqHdqWdqIbR(bxInstruction_c *i)
577 {
578   BxPackedXmmRegister op1 = BX_READ_XMM_REG(i->src1()), op2 = BX_READ_XMM_REG(i->src2());
579 
580   xop_compare16u[i->Ib() & 7](&op1, &op2);
581 
582   BX_WRITE_XMM_REG_CLEAR_HIGH(i->dst(), op1);
583 
584   BX_NEXT_INSTR(i);
585 }
586 
VPCOMUD_VdqHdqWdqIbR(bxInstruction_c * i)587 void BX_CPP_AttrRegparmN(1) BX_CPU_C::VPCOMUD_VdqHdqWdqIbR(bxInstruction_c *i)
588 {
589   BxPackedXmmRegister op1 = BX_READ_XMM_REG(i->src1()), op2 = BX_READ_XMM_REG(i->src2());
590 
591   xop_compare32u[i->Ib() & 7](&op1, &op2);
592 
593   BX_WRITE_XMM_REG_CLEAR_HIGH(i->dst(), op1);
594 
595   BX_NEXT_INSTR(i);
596 }
597 
VPCOMUQ_VdqHdqWdqIbR(bxInstruction_c * i)598 void BX_CPP_AttrRegparmN(1) BX_CPU_C::VPCOMUQ_VdqHdqWdqIbR(bxInstruction_c *i)
599 {
600   BxPackedXmmRegister op1 = BX_READ_XMM_REG(i->src1()), op2 = BX_READ_XMM_REG(i->src2());
601 
602   xop_compare64u[i->Ib() & 7](&op1, &op2);
603 
604   BX_WRITE_XMM_REG_CLEAR_HIGH(i->dst(), op1);
605 
606   BX_NEXT_INSTR(i);
607 }
608 
VFRCZPS_VpsWpsR(bxInstruction_c * i)609 void BX_CPP_AttrRegparmN(1) BX_CPU_C::VFRCZPS_VpsWpsR(bxInstruction_c *i)
610 {
611   BxPackedYmmRegister op = BX_READ_YMM_REG(i->src());
612   unsigned len = i->getVL();
613 
614   float_status_t status = mxcsr_to_softfloat_status_word(MXCSR);
615 
616   for (unsigned n=0; n < DWORD_ELEMENTS(len); n++) {
617     op.ymm32u(n) = float32_frc(op.ymm32u(n), status);
618   }
619 
620   check_exceptionsSSE(get_exception_flags(status));
621   BX_WRITE_YMM_REGZ_VLEN(i->dst(), op, len);
622 
623   BX_NEXT_INSTR(i);
624 }
625 
VFRCZPD_VpdWpdR(bxInstruction_c * i)626 void BX_CPP_AttrRegparmN(1) BX_CPU_C::VFRCZPD_VpdWpdR(bxInstruction_c *i)
627 {
628   BxPackedYmmRegister op = BX_READ_YMM_REG(i->src());
629   unsigned len = i->getVL();
630 
631   float_status_t status = mxcsr_to_softfloat_status_word(MXCSR);
632 
633   for (unsigned n=0; n < QWORD_ELEMENTS(len); n++) {
634     op.ymm64u(n) = float64_frc(op.ymm64u(n), status);
635   }
636 
637   check_exceptionsSSE(get_exception_flags(status));
638 
639   BX_WRITE_YMM_REGZ_VLEN(i->dst(), op, len);
640 
641   BX_NEXT_INSTR(i);
642 }
643 
VFRCZSS_VssWssR(bxInstruction_c * i)644 void BX_CPP_AttrRegparmN(1) BX_CPU_C::VFRCZSS_VssWssR(bxInstruction_c *i)
645 {
646   float32 op = BX_READ_XMM_REG_LO_DWORD(i->src());
647   BxPackedXmmRegister r;
648 
649   float_status_t status = mxcsr_to_softfloat_status_word(MXCSR);
650 
651   r.xmm64u(0) = (Bit64u) float32_frc(op, status);
652   r.xmm64u(1) = 0;
653 
654   check_exceptionsSSE(get_exception_flags(status));
655   BX_WRITE_XMM_REG_CLEAR_HIGH(i->dst(), r);
656 
657   BX_NEXT_INSTR(i);
658 }
659 
VFRCZSD_VsdWsdR(bxInstruction_c * i)660 void BX_CPP_AttrRegparmN(1) BX_CPU_C::VFRCZSD_VsdWsdR(bxInstruction_c *i)
661 {
662   float64 op = BX_READ_XMM_REG_LO_QWORD(i->src());
663   BxPackedXmmRegister r;
664 
665   float_status_t status = mxcsr_to_softfloat_status_word(MXCSR);
666 
667   r.xmm64u(0) = float64_frc(op, status);
668   r.xmm64u(1) = 0;
669 
670   check_exceptionsSSE(get_exception_flags(status));
671   BX_WRITE_XMM_REG_CLEAR_HIGH(i->dst(), r);
672 
673   BX_NEXT_INSTR(i);
674 }
675 
VPHADDBW_VdqWdqR(bxInstruction_c * i)676 void BX_CPP_AttrRegparmN(1) BX_CPU_C::VPHADDBW_VdqWdqR(bxInstruction_c *i)
677 {
678   BxPackedXmmRegister op = BX_READ_XMM_REG(i->src());
679 
680   op.xmm16s(0) = (Bit16s) op.xmmsbyte(0x0) + (Bit16s) op.xmmsbyte(0x1);
681   op.xmm16s(1) = (Bit16s) op.xmmsbyte(0x2) + (Bit16s) op.xmmsbyte(0x3);
682   op.xmm16s(2) = (Bit16s) op.xmmsbyte(0x4) + (Bit16s) op.xmmsbyte(0x5);
683   op.xmm16s(3) = (Bit16s) op.xmmsbyte(0x6) + (Bit16s) op.xmmsbyte(0x7);
684   op.xmm16s(4) = (Bit16s) op.xmmsbyte(0x8) + (Bit16s) op.xmmsbyte(0x9);
685   op.xmm16s(5) = (Bit16s) op.xmmsbyte(0xA) + (Bit16s) op.xmmsbyte(0xB);
686   op.xmm16s(6) = (Bit16s) op.xmmsbyte(0xC) + (Bit16s) op.xmmsbyte(0xD);
687   op.xmm16s(7) = (Bit16s) op.xmmsbyte(0xE) + (Bit16s) op.xmmsbyte(0xF);
688 
689   BX_WRITE_XMM_REG_CLEAR_HIGH(i->dst(), op);
690 
691   BX_NEXT_INSTR(i);
692 }
693 
VPHADDBD_VdqWdqR(bxInstruction_c * i)694 void BX_CPP_AttrRegparmN(1) BX_CPU_C::VPHADDBD_VdqWdqR(bxInstruction_c *i)
695 {
696   BxPackedXmmRegister op = BX_READ_XMM_REG(i->src());
697 
698   op.xmm32s(0) = (Bit32s) op.xmmsbyte(0x0) + (Bit32s) op.xmmsbyte(0x1) +
699                  (Bit32s) op.xmmsbyte(0x2) + (Bit32s) op.xmmsbyte(0x3);
700   op.xmm32s(1) = (Bit32s) op.xmmsbyte(0x4) + (Bit32s) op.xmmsbyte(0x5) +
701                  (Bit32s) op.xmmsbyte(0x6) + (Bit32s) op.xmmsbyte(0x7);
702   op.xmm32s(2) = (Bit32s) op.xmmsbyte(0x8) + (Bit32s) op.xmmsbyte(0x9) +
703                  (Bit32s) op.xmmsbyte(0xA) + (Bit32s) op.xmmsbyte(0xB);
704   op.xmm32s(3) = (Bit32s) op.xmmsbyte(0xC) + (Bit32s) op.xmmsbyte(0xD) +
705                  (Bit32s) op.xmmsbyte(0xE) + (Bit32s) op.xmmsbyte(0xF);
706 
707   BX_WRITE_XMM_REG_CLEAR_HIGH(i->dst(), op);
708 
709   BX_NEXT_INSTR(i);
710 }
711 
VPHADDBQ_VdqWdqR(bxInstruction_c * i)712 void BX_CPP_AttrRegparmN(1) BX_CPU_C::VPHADDBQ_VdqWdqR(bxInstruction_c *i)
713 {
714   BxPackedXmmRegister op = BX_READ_XMM_REG(i->src());
715 
716   op.xmm64s(0) = (Bit32s) op.xmmsbyte(0x0) + (Bit32s) op.xmmsbyte(0x1) +
717                  (Bit32s) op.xmmsbyte(0x2) + (Bit32s) op.xmmsbyte(0x3) +
718                  (Bit32s) op.xmmsbyte(0x4) + (Bit32s) op.xmmsbyte(0x5) +
719                  (Bit32s) op.xmmsbyte(0x6) + (Bit32s) op.xmmsbyte(0x7);
720   op.xmm64s(1) = (Bit32s) op.xmmsbyte(0x8) + (Bit32s) op.xmmsbyte(0x9) +
721                  (Bit32s) op.xmmsbyte(0xA) + (Bit32s) op.xmmsbyte(0xB) +
722                  (Bit32s) op.xmmsbyte(0xC) + (Bit32s) op.xmmsbyte(0xD) +
723                  (Bit32s) op.xmmsbyte(0xE) + (Bit32s) op.xmmsbyte(0xF);
724 
725   BX_WRITE_XMM_REG_CLEAR_HIGH(i->dst(), op);
726 
727   BX_NEXT_INSTR(i);
728 }
729 
VPHADDWD_VdqWdqR(bxInstruction_c * i)730 void BX_CPP_AttrRegparmN(1) BX_CPU_C::VPHADDWD_VdqWdqR(bxInstruction_c *i)
731 {
732   BxPackedXmmRegister op = BX_READ_XMM_REG(i->src());
733 
734   op.xmm32s(0) = (Bit32s) op.xmm16s(0) + (Bit32s) op.xmm16s(1);
735   op.xmm32s(1) = (Bit32s) op.xmm16s(2) + (Bit32s) op.xmm16s(3);
736   op.xmm32s(2) = (Bit32s) op.xmm16s(4) + (Bit32s) op.xmm16s(5);
737   op.xmm32s(3) = (Bit32s) op.xmm16s(6) + (Bit32s) op.xmm16s(7);
738 
739   BX_WRITE_XMM_REG_CLEAR_HIGH(i->dst(), op);
740 
741   BX_NEXT_INSTR(i);
742 }
743 
VPHADDWQ_VdqWdqR(bxInstruction_c * i)744 void BX_CPP_AttrRegparmN(1) BX_CPU_C::VPHADDWQ_VdqWdqR(bxInstruction_c *i)
745 {
746   BxPackedXmmRegister op = BX_READ_XMM_REG(i->src());
747 
748   op.xmm64s(0) = (Bit32s) op.xmm16s(0) + (Bit32s) op.xmm16s(1) +
749                  (Bit32s) op.xmm16s(2) + (Bit32s) op.xmm16s(3);
750   op.xmm64s(1) = (Bit32s) op.xmm16s(4) + (Bit32s) op.xmm16s(5) +
751                  (Bit32s) op.xmm16s(6) + (Bit32s) op.xmm16s(7);
752 
753   BX_WRITE_XMM_REG_CLEAR_HIGH(i->dst(), op);
754 
755   BX_NEXT_INSTR(i);
756 }
757 
VPHADDDQ_VdqWdqR(bxInstruction_c * i)758 void BX_CPP_AttrRegparmN(1) BX_CPU_C::VPHADDDQ_VdqWdqR(bxInstruction_c *i)
759 {
760   BxPackedXmmRegister op = BX_READ_XMM_REG(i->src());
761 
762   op.xmm64s(0) = (Bit64s) op.xmm32s(0) + (Bit64s) op.xmm32s(1);
763   op.xmm64s(1) = (Bit64s) op.xmm32s(2) + (Bit64s) op.xmm32s(3);
764 
765   BX_WRITE_XMM_REG_CLEAR_HIGH(i->dst(), op);
766 
767   BX_NEXT_INSTR(i);
768 }
769 
VPHADDUBW_VdqWdqR(bxInstruction_c * i)770 void BX_CPP_AttrRegparmN(1) BX_CPU_C::VPHADDUBW_VdqWdqR(bxInstruction_c *i)
771 {
772   BxPackedXmmRegister op = BX_READ_XMM_REG(i->src());
773 
774   op.xmm16u(0) = (Bit16u) op.xmmubyte(0x0) + (Bit16u) op.xmmubyte(0x1);
775   op.xmm16u(1) = (Bit16u) op.xmmubyte(0x2) + (Bit16u) op.xmmubyte(0x3);
776   op.xmm16u(2) = (Bit16u) op.xmmubyte(0x4) + (Bit16u) op.xmmubyte(0x5);
777   op.xmm16u(3) = (Bit16u) op.xmmubyte(0x6) + (Bit16u) op.xmmubyte(0x7);
778   op.xmm16u(4) = (Bit16u) op.xmmubyte(0x8) + (Bit16u) op.xmmubyte(0x9);
779   op.xmm16u(5) = (Bit16u) op.xmmubyte(0xA) + (Bit16u) op.xmmubyte(0xB);
780   op.xmm16u(6) = (Bit16u) op.xmmubyte(0xC) + (Bit16u) op.xmmubyte(0xD);
781   op.xmm16u(7) = (Bit16u) op.xmmubyte(0xE) + (Bit16u) op.xmmubyte(0xF);
782 
783   BX_WRITE_XMM_REG_CLEAR_HIGH(i->dst(), op);
784 
785   BX_NEXT_INSTR(i);
786 }
787 
VPHADDUBD_VdqWdqR(bxInstruction_c * i)788 void BX_CPP_AttrRegparmN(1) BX_CPU_C::VPHADDUBD_VdqWdqR(bxInstruction_c *i)
789 {
790   BxPackedXmmRegister op = BX_READ_XMM_REG(i->src());
791 
792   op.xmm32u(0) = (Bit32u) op.xmmubyte(0x0) + (Bit32s) op.xmmubyte(0x1) +
793                  (Bit32u) op.xmmubyte(0x2) + (Bit32s) op.xmmubyte(0x3);
794   op.xmm32u(1) = (Bit32u) op.xmmubyte(0x4) + (Bit32s) op.xmmubyte(0x5) +
795                  (Bit32u) op.xmmubyte(0x6) + (Bit32s) op.xmmubyte(0x7);
796   op.xmm32u(2) = (Bit32u) op.xmmubyte(0x8) + (Bit32s) op.xmmubyte(0x9) +
797                  (Bit32u) op.xmmubyte(0xA) + (Bit32s) op.xmmubyte(0xB);
798   op.xmm32u(3) = (Bit32u) op.xmmubyte(0xC) + (Bit32s) op.xmmubyte(0xD) +
799                  (Bit32u) op.xmmubyte(0xE) + (Bit32s) op.xmmubyte(0xF);
800 
801   BX_WRITE_XMM_REG_CLEAR_HIGH(i->dst(), op);
802 
803   BX_NEXT_INSTR(i);
804 }
805 
VPHADDUBQ_VdqWdqR(bxInstruction_c * i)806 void BX_CPP_AttrRegparmN(1) BX_CPU_C::VPHADDUBQ_VdqWdqR(bxInstruction_c *i)
807 {
808   BxPackedXmmRegister op = BX_READ_XMM_REG(i->src());
809 
810   op.xmm64u(0) = (Bit32u) op.xmmubyte(0x0) + (Bit32u) op.xmmubyte(0x1) +
811                  (Bit32u) op.xmmubyte(0x2) + (Bit32u) op.xmmubyte(0x3) +
812                  (Bit32u) op.xmmubyte(0x4) + (Bit32u) op.xmmubyte(0x5) +
813                  (Bit32u) op.xmmubyte(0x6) + (Bit32u) op.xmmubyte(0x7);
814   op.xmm64u(1) = (Bit32u) op.xmmubyte(0x8) + (Bit32u) op.xmmubyte(0x9) +
815                  (Bit32u) op.xmmubyte(0xA) + (Bit32u) op.xmmubyte(0xB) +
816                  (Bit32u) op.xmmubyte(0xC) + (Bit32u) op.xmmubyte(0xD) +
817                  (Bit32u) op.xmmubyte(0xE) + (Bit32u) op.xmmubyte(0xF);
818 
819   BX_WRITE_XMM_REG_CLEAR_HIGH(i->dst(), op);
820 
821   BX_NEXT_INSTR(i);
822 }
823 
VPHADDUWD_VdqWdqR(bxInstruction_c * i)824 void BX_CPP_AttrRegparmN(1) BX_CPU_C::VPHADDUWD_VdqWdqR(bxInstruction_c *i)
825 {
826   BxPackedXmmRegister op = BX_READ_XMM_REG(i->src());
827 
828   op.xmm32u(0) = (Bit32u) op.xmm16u(0) + (Bit32u) op.xmm16u(1);
829   op.xmm32u(1) = (Bit32u) op.xmm16u(2) + (Bit32u) op.xmm16u(3);
830   op.xmm32u(2) = (Bit32u) op.xmm16u(4) + (Bit32u) op.xmm16u(5);
831   op.xmm32u(3) = (Bit32u) op.xmm16u(6) + (Bit32u) op.xmm16u(7);
832 
833   BX_WRITE_XMM_REG_CLEAR_HIGH(i->dst(), op);
834 
835   BX_NEXT_INSTR(i);
836 }
837 
VPHADDUWQ_VdqWdqR(bxInstruction_c * i)838 void BX_CPP_AttrRegparmN(1) BX_CPU_C::VPHADDUWQ_VdqWdqR(bxInstruction_c *i)
839 {
840   BxPackedXmmRegister op = BX_READ_XMM_REG(i->src());
841 
842   op.xmm64u(0) = (Bit32u) op.xmm16u(0) + (Bit32u) op.xmm16u(1) +
843                  (Bit32u) op.xmm16u(2) + (Bit32u) op.xmm16u(3);
844   op.xmm64u(1) = (Bit32u) op.xmm16u(4) + (Bit32u) op.xmm16u(5) +
845                  (Bit32u) op.xmm16u(6) + (Bit32u) op.xmm16u(7);
846 
847   BX_WRITE_XMM_REG_CLEAR_HIGH(i->dst(), op);
848 
849   BX_NEXT_INSTR(i);
850 }
851 
VPHADDUDQ_VdqWdqR(bxInstruction_c * i)852 void BX_CPP_AttrRegparmN(1) BX_CPU_C::VPHADDUDQ_VdqWdqR(bxInstruction_c *i)
853 {
854   BxPackedXmmRegister op = BX_READ_XMM_REG(i->src());
855 
856   op.xmm64u(0) = (Bit64u) op.xmm32u(0) + (Bit64u) op.xmm32u(1);
857   op.xmm64u(1) = (Bit64u) op.xmm32u(2) + (Bit64u) op.xmm32u(3);
858 
859   BX_WRITE_XMM_REG_CLEAR_HIGH(i->dst(), op);
860 
861   BX_NEXT_INSTR(i);
862 }
863 
VPHSUBBW_VdqWdqR(bxInstruction_c * i)864 void BX_CPP_AttrRegparmN(1) BX_CPU_C::VPHSUBBW_VdqWdqR(bxInstruction_c *i)
865 {
866   BxPackedXmmRegister op = BX_READ_XMM_REG(i->src());
867 
868   op.xmm16s(0) = (Bit16s) op.xmmsbyte(0x0) - (Bit16s) op.xmmsbyte(0x1);
869   op.xmm16s(1) = (Bit16s) op.xmmsbyte(0x2) - (Bit16s) op.xmmsbyte(0x3);
870   op.xmm16s(2) = (Bit16s) op.xmmsbyte(0x4) - (Bit16s) op.xmmsbyte(0x5);
871   op.xmm16s(3) = (Bit16s) op.xmmsbyte(0x6) - (Bit16s) op.xmmsbyte(0x7);
872   op.xmm16s(4) = (Bit16s) op.xmmsbyte(0x8) - (Bit16s) op.xmmsbyte(0x9);
873   op.xmm16s(5) = (Bit16s) op.xmmsbyte(0xA) - (Bit16s) op.xmmsbyte(0xB);
874   op.xmm16s(6) = (Bit16s) op.xmmsbyte(0xC) - (Bit16s) op.xmmsbyte(0xD);
875   op.xmm16s(7) = (Bit16s) op.xmmsbyte(0xE) - (Bit16s) op.xmmsbyte(0xF);
876 
877   BX_WRITE_XMM_REG_CLEAR_HIGH(i->dst(), op);
878 
879   BX_NEXT_INSTR(i);
880 }
881 
VPHSUBWD_VdqWdqR(bxInstruction_c * i)882 void BX_CPP_AttrRegparmN(1) BX_CPU_C::VPHSUBWD_VdqWdqR(bxInstruction_c *i)
883 {
884   BxPackedXmmRegister op = BX_READ_XMM_REG(i->src());
885 
886   op.xmm32s(0) = (Bit32s) op.xmm16s(0) - (Bit32s) op.xmm16s(1);
887   op.xmm32s(1) = (Bit32s) op.xmm16s(2) - (Bit32s) op.xmm16s(3);
888   op.xmm32s(2) = (Bit32s) op.xmm16s(4) - (Bit32s) op.xmm16s(5);
889   op.xmm32s(3) = (Bit32s) op.xmm16s(6) - (Bit32s) op.xmm16s(7);
890 
891   BX_WRITE_XMM_REG_CLEAR_HIGH(i->dst(), op);
892 
893   BX_NEXT_INSTR(i);
894 }
895 
VPHSUBDQ_VdqWdqR(bxInstruction_c * i)896 void BX_CPP_AttrRegparmN(1) BX_CPU_C::VPHSUBDQ_VdqWdqR(bxInstruction_c *i)
897 {
898   BxPackedXmmRegister op = BX_READ_XMM_REG(i->src());
899 
900   op.xmm64s(0) = (Bit64s) op.xmm32s(0) - (Bit64s) op.xmm32s(1);
901   op.xmm64s(1) = (Bit64s) op.xmm32s(2) - (Bit64s) op.xmm32s(3);
902 
903   BX_WRITE_XMM_REG_CLEAR_HIGH(i->dst(), op);
904 
905   BX_NEXT_INSTR(i);
906 }
907 
VPERMIL2PS_VdqHdqWdqIbR(bxInstruction_c * i)908 void BX_CPP_AttrRegparmN(1) BX_CPU_C::VPERMIL2PS_VdqHdqWdqIbR(bxInstruction_c *i)
909 {
910   BxPackedYmmRegister op1 = BX_READ_YMM_REG(i->src1());
911   BxPackedYmmRegister op2 = BX_READ_YMM_REG(i->src2());
912   BxPackedYmmRegister op3 = BX_READ_YMM_REG(i->src3()), result;
913   unsigned len = i->getVL();
914 
915   result.clear();
916 
917   for (unsigned n=0; n < len; n++) {
918     xmm_permil2ps(&result.ymm128(n), &op1.ymm128(n), &op2.ymm128(n), &op3.ymm128(n), i->Ib() & 3);
919   }
920 
921   BX_WRITE_YMM_REGZ_VLEN(i->dst(), result, len);
922 
923   BX_NEXT_INSTR(i);
924 }
925 
VPERMIL2PD_VdqHdqWdqIbR(bxInstruction_c * i)926 void BX_CPP_AttrRegparmN(1) BX_CPU_C::VPERMIL2PD_VdqHdqWdqIbR(bxInstruction_c *i)
927 {
928   BxPackedYmmRegister op1 = BX_READ_YMM_REG(i->src1());
929   BxPackedYmmRegister op2 = BX_READ_YMM_REG(i->src2());
930   BxPackedYmmRegister op3 = BX_READ_YMM_REG(i->src3()), result;
931   unsigned len = i->getVL();
932 
933   result.clear();
934 
935   for (unsigned n=0; n < len; n++) {
936     xmm_permil2pd(&result.ymm128(n), &op1.ymm128(n), &op2.ymm128(n), &op3.ymm128(n), i->Ib() & 3);
937   }
938 
939   BX_WRITE_YMM_REGZ_VLEN(i->dst(), result, len);
940 
941   BX_NEXT_INSTR(i);
942 }
943 
944 #endif
945