1 /////////////////////////////////////////////////////////////////////////
2 // $Id: xop.cc 13520 2018-05-27 19:09:59Z sshwarts $
3 /////////////////////////////////////////////////////////////////////////
4 //
5 // Copyright (c) 2011-2018 Stanislav Shwartsman
6 // Written by Stanislav Shwartsman [sshwarts at sourceforge net]
7 //
8 // This library is free software; you can redistribute it and/or
9 // modify it under the terms of the GNU Lesser General Public
10 // License as published by the Free Software Foundation; either
11 // version 2 of the License, or (at your option) any later version.
12 //
13 // This library is distributed in the hope that it will be useful,
14 // but WITHOUT ANY WARRANTY; without even the implied warranty of
15 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16 // Lesser General Public License for more details.
17 //
18 // You should have received a copy of the GNU Lesser General Public
19 // License along with this library; if not, write to the Free Software
// Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
21 //
22 /////////////////////////////////////////////////////////////////////////
23
24 #define NEED_CPU_REG_SHORTCUTS 1
25 #include "bochs.h"
26 #include "cpu.h"
27 #define LOG_THIS BX_CPU_THIS_PTR
28
29 #if BX_SUPPORT_AVX
30
31 extern float_status_t mxcsr_to_softfloat_status_word(bx_mxcsr_t mxcsr);
32
33 #include "simd_int.h"
34 #include "simd_compare.h"
35
typedef void (*simd_compare_method)(BxPackedXmmRegister *op1, const BxPackedXmmRegister *op2);

// Predicate dispatch tables for the XOP VPCOM*/VPCOMU* families.
// The table index is the low 3 bits of the instruction immediate:
//   0=LT  1=LE  2=GT  3=GE  4=EQ  5=NE  6=FALSE(all 0s)  7=TRUE(all 1s)
// The unsigned tables reuse the signed EQ/NE/FALSE/TRUE handlers, since
// those predicates are sign-agnostic.

// comparison predicate for PCOMB
static simd_compare_method xop_compare8[8] = {
  xmm_pcmpltb,
  xmm_pcmpleb,
  xmm_pcmpgtb,
  xmm_pcmpgeb,
  xmm_pcmpeqb,
  xmm_pcmpneb,
  xmm_pcmpfalse,
  xmm_pcmptrue
};

// comparison predicate for PCOMUB
static simd_compare_method xop_compare8u[8] = {
  xmm_pcmpltub,
  xmm_pcmpleub,
  xmm_pcmpgtub,
  xmm_pcmpgeub,
  xmm_pcmpeqb,
  xmm_pcmpneb,
  xmm_pcmpfalse,
  xmm_pcmptrue
};

// comparison predicate for PCOMW
static simd_compare_method xop_compare16[8] = {
  xmm_pcmpltw,
  xmm_pcmplew,
  xmm_pcmpgtw,
  xmm_pcmpgew,
  xmm_pcmpeqw,
  xmm_pcmpnew,
  xmm_pcmpfalse,
  xmm_pcmptrue
};

// comparison predicate for PCOMUW
static simd_compare_method xop_compare16u[8] = {
  xmm_pcmpltuw,
  xmm_pcmpleuw,
  xmm_pcmpgtuw,
  xmm_pcmpgeuw,
  xmm_pcmpeqw,
  xmm_pcmpnew,
  xmm_pcmpfalse,
  xmm_pcmptrue
};

// comparison predicate for PCOMD
static simd_compare_method xop_compare32[8] = {
  xmm_pcmpltd,
  xmm_pcmpled,
  xmm_pcmpgtd,
  xmm_pcmpged,
  xmm_pcmpeqd,
  xmm_pcmpned,
  xmm_pcmpfalse,
  xmm_pcmptrue
};

// comparison predicate for PCOMUD
static simd_compare_method xop_compare32u[8] = {
  xmm_pcmpltud,
  xmm_pcmpleud,
  xmm_pcmpgtud,
  xmm_pcmpgeud,
  xmm_pcmpeqd,
  xmm_pcmpned,
  xmm_pcmpfalse,
  xmm_pcmptrue
};

// comparison predicate for PCOMQ
static simd_compare_method xop_compare64[8] = {
  xmm_pcmpltq,
  xmm_pcmpleq,
  xmm_pcmpgtq,
  xmm_pcmpgeq,
  xmm_pcmpeqq,
  xmm_pcmpneq,
  xmm_pcmpfalse,
  xmm_pcmptrue
};

// comparison predicate for PCOMUQ
static simd_compare_method xop_compare64u[8] = {
  xmm_pcmpltuq,
  xmm_pcmpleuq,
  xmm_pcmpgtuq,
  xmm_pcmpgeuq,
  xmm_pcmpeqq,
  xmm_pcmpneq,
  xmm_pcmpfalse,
  xmm_pcmptrue
};
133
134 typedef Bit8u (*vpperm_operation)(Bit8u byte);
135
vpperm_bit_reverse(Bit8u v8)136 BX_CPP_INLINE Bit8u vpperm_bit_reverse(Bit8u v8)
137 {
138 return (v8 >> 7) |
139 ((v8 >> 5) & 0x02) |
140 ((v8 >> 3) & 0x04) |
141 ((v8 >> 1) & 0x08) |
142 ((v8 << 1) & 0x10) |
143 ((v8 << 3) & 0x20) |
144 ((v8 << 5) & 0x40) |
145 (v8 << 7);
146 }
147
vpperm_noop(Bit8u v8)148 BX_CPP_INLINE Bit8u vpperm_noop(Bit8u v8) { return v8; }
vpperm_invert(Bit8u v8)149 BX_CPP_INLINE Bit8u vpperm_invert(Bit8u v8) { return ~v8; }
vpperm_invert_bit_reverse(Bit8u v8)150 BX_CPP_INLINE Bit8u vpperm_invert_bit_reverse(Bit8u v8) { return vpperm_bit_reverse(~v8); }
vpperm_zeros(Bit8u v8)151 BX_CPP_INLINE Bit8u vpperm_zeros(Bit8u v8) { return 0; }
vpperm_ones(Bit8u v8)152 BX_CPP_INLINE Bit8u vpperm_ones(Bit8u v8) { return 0xff; }
vpperm_replicate_msb(Bit8u v8)153 BX_CPP_INLINE Bit8u vpperm_replicate_msb(Bit8u v8) { return (((Bit8s) v8) >> 7); }
vpperm_invert_replicate_msb(Bit8u v8)154 BX_CPP_INLINE Bit8u vpperm_invert_replicate_msb(Bit8u v8) { return vpperm_replicate_msb(~v8); }
155
156 // logical operation for VPPERM
157 static vpperm_operation vpperm_op[8] = {
158 vpperm_noop,
159 vpperm_invert,
160 vpperm_bit_reverse,
161 vpperm_invert_bit_reverse,
162 vpperm_zeros,
163 vpperm_ones,
164 vpperm_replicate_msb,
165 vpperm_invert_replicate_msb
166 };
167
VPCMOV_VdqHdqWdqVIb(bxInstruction_c * i)168 void BX_CPP_AttrRegparmN(1) BX_CPU_C::VPCMOV_VdqHdqWdqVIb(bxInstruction_c *i)
169 {
170 BxPackedYmmRegister op1 = BX_READ_YMM_REG(i->src1());
171 BxPackedYmmRegister op2 = BX_READ_YMM_REG(i->src2());
172 BxPackedYmmRegister op3 = BX_READ_YMM_REG(i->src3());
173 unsigned len = i->getVL();
174
175 for (unsigned n=0; n < len; n++) {
176 xmm_pselect(&op1.ymm128(n), &op2.ymm128(n), &op3.ymm128(n));
177 }
178
179 BX_WRITE_YMM_REGZ_VLEN(i->dst(), op1, len);
180
181 BX_NEXT_INSTR(i);
182 }
183
// VPPERM: per-byte permute of the two sources, with an optional logical
// operation applied to every selected byte.
void BX_CPP_AttrRegparmN(1) BX_CPU_C::VPPERM_VdqHdqWdqVIb(bxInstruction_c *i)
{
  BxPackedXmmRegister op1 = BX_READ_XMM_REG(i->src1());
  BxPackedXmmRegister op2 = BX_READ_XMM_REG(i->src2());
  BxPackedXmmRegister op3 = BX_READ_XMM_REG(i->src3()), dst;

  for (unsigned n=0;n<16;n++) {
    // Control byte for destination byte n:
    //   bits 3:0 - byte index within the chosen source register
    //   bit  4   - source register select (set -> op1, clear -> op2)
    //   bits 7:5 - post-op applied to the selected byte (see vpperm_op[])
    unsigned control = op3.xmmubyte(n);

    if (control & 0x10)
      dst.xmmubyte(n) = op1.xmmubyte(control & 0xf);
    else
      dst.xmmubyte(n) = op2.xmmubyte(control & 0xf);

    dst.xmmubyte(n) = vpperm_op[control >> 5](dst.xmmubyte(n));
  }

  BX_WRITE_XMM_REG_CLEAR_HIGH(i->dst(), dst);

  BX_NEXT_INSTR(i);
}
205
// Common skeleton for the XOP variable shift/rotate handlers: apply 'func'
// (a two-operand SIMD helper from simd_int.h) to (src1, src2) in place and
// write the 128-bit result with the upper vector bits cleared.
#define XOP_SHIFT_ROTATE(HANDLER, func) \
  void BX_CPP_AttrRegparmN(1) BX_CPU_C:: HANDLER (bxInstruction_c *i) \
  { \
    BxPackedXmmRegister op1 = BX_READ_XMM_REG(i->src1()), op2 = BX_READ_XMM_REG(i->src2()); \
    \
    (func)(&op1, &op2); \
    \
    BX_WRITE_XMM_REG_CLEAR_HIGH(i->dst(), op1); \
    \
    BX_NEXT_INSTR(i); \
  }

// VPSHA*: per-element arithmetic shifts (see xmm_psha* in simd_int.h)
XOP_SHIFT_ROTATE(VPSHAB_VdqWdqHdq, xmm_pshab);
XOP_SHIFT_ROTATE(VPSHAW_VdqWdqHdq, xmm_pshaw);
XOP_SHIFT_ROTATE(VPSHAD_VdqWdqHdq, xmm_pshad);
XOP_SHIFT_ROTATE(VPSHAQ_VdqWdqHdq, xmm_pshaq);

// VPSHL*: per-element logical shifts
XOP_SHIFT_ROTATE(VPSHLB_VdqWdqHdq, xmm_pshlb);
XOP_SHIFT_ROTATE(VPSHLW_VdqWdqHdq, xmm_pshlw);
XOP_SHIFT_ROTATE(VPSHLD_VdqWdqHdq, xmm_pshld);
XOP_SHIFT_ROTATE(VPSHLQ_VdqWdqHdq, xmm_pshlq);

// VPROT* (variable-count form): per-element rotates
XOP_SHIFT_ROTATE(VPROTB_VdqWdqHdq, xmm_protb);
XOP_SHIFT_ROTATE(VPROTW_VdqWdqHdq, xmm_protw);
XOP_SHIFT_ROTATE(VPROTD_VdqWdqHdq, xmm_protd);
XOP_SHIFT_ROTATE(VPROTQ_VdqWdqHdq, xmm_protq);
232
VPMACSSWW_VdqHdqWdqVIbR(bxInstruction_c * i)233 void BX_CPP_AttrRegparmN(1) BX_CPU_C::VPMACSSWW_VdqHdqWdqVIbR(bxInstruction_c *i)
234 {
235 BxPackedXmmRegister op1 = BX_READ_XMM_REG(i->src1());
236 BxPackedXmmRegister op2 = BX_READ_XMM_REG(i->src2());
237 BxPackedXmmRegister op3 = BX_READ_XMM_REG(i->src3());
238
239 for(unsigned n=0;n<8;n++) {
240 op1.xmm16s(n) = SaturateDwordSToWordS(((Bit32s) op1.xmm16s(n) * (Bit32s) op2.xmm16s(n)) + (Bit32s) op3.xmm16s(n));
241 }
242
243 BX_WRITE_XMM_REG_CLEAR_HIGH(i->dst(), op1);
244
245 BX_NEXT_INSTR(i);
246 }
247
// VPMACSSWD: multiply the odd-numbered signed words (1,3,5,7) of op1/op2,
// add the corresponding op3 dword, and saturate the result to 32 bits.
void BX_CPP_AttrRegparmN(1) BX_CPU_C::VPMACSSWD_VdqHdqWdqVIbR(bxInstruction_c *i)
{
  BxPackedXmmRegister op1 = BX_READ_XMM_REG(i->src1());
  BxPackedXmmRegister op2 = BX_READ_XMM_REG(i->src2());
  BxPackedXmmRegister op3 = BX_READ_XMM_REG(i->src3());

  // The (Bit64s) cast on the addend widens the sum to 64 bits so the
  // saturation helper sees the exact mathematical result.
  op1.xmm32s(0) = SaturateQwordSToDwordS(((Bit32s) op1.xmm16s(1) * (Bit32s) op2.xmm16s(1)) + (Bit64s) op3.xmm32s(0));
  op1.xmm32s(1) = SaturateQwordSToDwordS(((Bit32s) op1.xmm16s(3) * (Bit32s) op2.xmm16s(3)) + (Bit64s) op3.xmm32s(1));
  op1.xmm32s(2) = SaturateQwordSToDwordS(((Bit32s) op1.xmm16s(5) * (Bit32s) op2.xmm16s(5)) + (Bit64s) op3.xmm32s(2));
  op1.xmm32s(3) = SaturateQwordSToDwordS(((Bit32s) op1.xmm16s(7) * (Bit32s) op2.xmm16s(7)) + (Bit64s) op3.xmm32s(3));

  BX_WRITE_XMM_REG_CLEAR_HIGH(i->dst(), op1);

  BX_NEXT_INSTR(i);
}
263
// Signed 64-bit addition with saturation: returns a+b clamped to
// [INT64_MIN, INT64_MAX] when the exact sum does not fit in 64 bits.
BX_CPP_INLINE Bit64s add_saturate64(Bit64s a, Bit64s b)
{
  Bit64s r = a + b;
  Bit64u overflow = GET_ADD_OVERFLOW(a, b, r, BX_CONST64(0x8000000000000000));
  if (! overflow) return r;
  // signed overflow detected, saturate.  'overflow' now holds the mask
  // 0x8000000000000000, i.e. INT64_MIN when reinterpreted as signed.
  // Signed overflow requires both operands to have the same sign, so a > 0
  // means positive overflow: clamp to INT64_MAX, which in two's complement
  // is INT64_MIN - 1 -- hence the decrement.
  if (a > 0) overflow--;
  return (Bit64s) overflow;
}
273
VPMACSSDQL_VdqHdqWdqVIbR(bxInstruction_c * i)274 void BX_CPP_AttrRegparmN(1) BX_CPU_C::VPMACSSDQL_VdqHdqWdqVIbR(bxInstruction_c *i)
275 {
276 BxPackedXmmRegister op1 = BX_READ_XMM_REG(i->src1());
277 BxPackedXmmRegister op2 = BX_READ_XMM_REG(i->src2());
278 BxPackedXmmRegister op3 = BX_READ_XMM_REG(i->src3());
279
280 Bit64s product1 = (Bit64s) op1.xmm32s(0) * (Bit64s) op2.xmm32s(0);
281 Bit64s product2 = (Bit64s) op1.xmm32s(2) * (Bit64s) op2.xmm32s(2);
282
283 op1.xmm64s(0) = add_saturate64(product1, op3.xmm64s(0));
284 op1.xmm64s(1) = add_saturate64(product2, op3.xmm64s(1));
285
286 BX_WRITE_XMM_REG_CLEAR_HIGH(i->dst(), op1);
287
288 BX_NEXT_INSTR(i);
289 }
290
VPMACSSDD_VdqHdqWdqVIbR(bxInstruction_c * i)291 void BX_CPP_AttrRegparmN(1) BX_CPU_C::VPMACSSDD_VdqHdqWdqVIbR(bxInstruction_c *i)
292 {
293 BxPackedXmmRegister op1 = BX_READ_XMM_REG(i->src1());
294 BxPackedXmmRegister op2 = BX_READ_XMM_REG(i->src2());
295 BxPackedXmmRegister op3 = BX_READ_XMM_REG(i->src3());
296
297 for(unsigned n=0;n<4;n++) {
298 op1.xmm32s(n) = SaturateQwordSToDwordS(((Bit64s) op1.xmm32s(n) * (Bit64s) op2.xmm32s(n)) + (Bit64s) op3.xmm32s(n));
299 }
300
301 BX_WRITE_XMM_REG_CLEAR_HIGH(i->dst(), op1);
302
303 BX_NEXT_INSTR(i);
304 }
305
// VPMACSSDQH: multiply the high (odd-numbered) signed dwords (1,3) of
// op1/op2 to 64-bit products, then add the op3 qwords with signed
// saturation (compare VPMACSSDQL, which uses dwords 0 and 2).
void BX_CPP_AttrRegparmN(1) BX_CPU_C::VPMACSSDQH_VdqHdqWdqVIbR(bxInstruction_c *i)
{
  BxPackedXmmRegister op1 = BX_READ_XMM_REG(i->src1());
  BxPackedXmmRegister op2 = BX_READ_XMM_REG(i->src2());
  BxPackedXmmRegister op3 = BX_READ_XMM_REG(i->src3());

  Bit64s product1 = (Bit64s) op1.xmm32s(1) * (Bit64s) op2.xmm32s(1);
  Bit64s product2 = (Bit64s) op1.xmm32s(3) * (Bit64s) op2.xmm32s(3);

  op1.xmm64s(0) = add_saturate64(product1, op3.xmm64s(0));
  op1.xmm64s(1) = add_saturate64(product2, op3.xmm64s(1));

  BX_WRITE_XMM_REG_CLEAR_HIGH(i->dst(), op1);

  BX_NEXT_INSTR(i);
}
322
VPMACSWW_VdqHdqWdqVIbR(bxInstruction_c * i)323 void BX_CPP_AttrRegparmN(1) BX_CPU_C::VPMACSWW_VdqHdqWdqVIbR(bxInstruction_c *i)
324 {
325 BxPackedXmmRegister op1 = BX_READ_XMM_REG(i->src1());
326 BxPackedXmmRegister op2 = BX_READ_XMM_REG(i->src2());
327 BxPackedXmmRegister op3 = BX_READ_XMM_REG(i->src3());
328
329 for(unsigned n=0;n<8;n++) {
330 op1.xmm16s(n) = ((Bit32s) op1.xmm16s(n) * (Bit32s) op2.xmm16s(n)) + (Bit32s) op3.xmm16s(n);
331 }
332
333 BX_WRITE_XMM_REG_CLEAR_HIGH(i->dst(), op1);
334
335 BX_NEXT_INSTR(i);
336 }
337
// VPMACSWD: multiply the odd-numbered signed words (1,3,5,7) of op1/op2 and
// add the corresponding op3 dword; the result wraps (no saturation --
// compare VPMACSSWD).
void BX_CPP_AttrRegparmN(1) BX_CPU_C::VPMACSWD_VdqHdqWdqVIbR(bxInstruction_c *i)
{
  BxPackedXmmRegister op1 = BX_READ_XMM_REG(i->src1());
  BxPackedXmmRegister op2 = BX_READ_XMM_REG(i->src2());
  BxPackedXmmRegister op3 = BX_READ_XMM_REG(i->src3());

  // The (Bit64s) cast widens the addition so it cannot overflow in C;
  // the 64-bit sum is then truncated mod 2^32 by the dword assignment.
  op1.xmm32s(0) = ((Bit32s) op1.xmm16s(1) * (Bit32s) op2.xmm16s(1)) + (Bit64s) op3.xmm32s(0);
  op1.xmm32s(1) = ((Bit32s) op1.xmm16s(3) * (Bit32s) op2.xmm16s(3)) + (Bit64s) op3.xmm32s(1);
  op1.xmm32s(2) = ((Bit32s) op1.xmm16s(5) * (Bit32s) op2.xmm16s(5)) + (Bit64s) op3.xmm32s(2);
  op1.xmm32s(3) = ((Bit32s) op1.xmm16s(7) * (Bit32s) op2.xmm16s(7)) + (Bit64s) op3.xmm32s(3);

  BX_WRITE_XMM_REG_CLEAR_HIGH(i->dst(), op1);

  BX_NEXT_INSTR(i);
}
353
VPMACSDQL_VdqHdqWdqVIbR(bxInstruction_c * i)354 void BX_CPP_AttrRegparmN(1) BX_CPU_C::VPMACSDQL_VdqHdqWdqVIbR(bxInstruction_c *i)
355 {
356 BxPackedXmmRegister op1 = BX_READ_XMM_REG(i->src1());
357 BxPackedXmmRegister op2 = BX_READ_XMM_REG(i->src2());
358 BxPackedXmmRegister op3 = BX_READ_XMM_REG(i->src3());
359
360 Bit64s product1 = (Bit64s) op1.xmm32s(0) * (Bit64s) op2.xmm32s(0);
361 Bit64s product2 = (Bit64s) op1.xmm32s(2) * (Bit64s) op2.xmm32s(2);
362
363 op1.xmm64s(0) = product1 + op3.xmm64s(0);
364 op1.xmm64s(1) = product2 + op3.xmm64s(1);
365
366 BX_WRITE_XMM_REG_CLEAR_HIGH(i->dst(), op1);
367
368 BX_NEXT_INSTR(i);
369 }
370
VPMACSDD_VdqHdqWdqVIbR(bxInstruction_c * i)371 void BX_CPP_AttrRegparmN(1) BX_CPU_C::VPMACSDD_VdqHdqWdqVIbR(bxInstruction_c *i)
372 {
373 BxPackedXmmRegister op1 = BX_READ_XMM_REG(i->src1());
374 BxPackedXmmRegister op2 = BX_READ_XMM_REG(i->src2());
375 BxPackedXmmRegister op3 = BX_READ_XMM_REG(i->src3());
376
377 for(unsigned n=0;n<4;n++) {
378 op1.xmm32s(n) = ((Bit64s) op1.xmm32s(n) * (Bit64s) op2.xmm32s(n)) + (Bit64s) op3.xmm32s(n);
379 }
380
381 BX_WRITE_XMM_REG_CLEAR_HIGH(i->dst(), op1);
382
383 BX_NEXT_INSTR(i);
384 }
385
// VPMACSDQH: multiply the high (odd-numbered) signed dwords (1,3) of
// op1/op2 to 64-bit products and add the op3 qwords with wraparound (no
// saturation -- compare VPMACSSDQH).
void BX_CPP_AttrRegparmN(1) BX_CPU_C::VPMACSDQH_VdqHdqWdqVIbR(bxInstruction_c *i)
{
  BxPackedXmmRegister op1 = BX_READ_XMM_REG(i->src1());
  BxPackedXmmRegister op2 = BX_READ_XMM_REG(i->src2());
  BxPackedXmmRegister op3 = BX_READ_XMM_REG(i->src3());

  Bit64s product1 = (Bit64s) op1.xmm32s(1) * (Bit64s) op2.xmm32s(1);
  Bit64s product2 = (Bit64s) op1.xmm32s(3) * (Bit64s) op2.xmm32s(3);

  op1.xmm64s(0) = product1 + op3.xmm64s(0);
  op1.xmm64s(1) = product2 + op3.xmm64s(1);

  BX_WRITE_XMM_REG_CLEAR_HIGH(i->dst(), op1);

  BX_NEXT_INSTR(i);
}
402
// VPMADCSSWD: horizontal multiply-add-accumulate with signed saturation.
// Each destination dword gets the sum of two adjacent word products plus
// the op3 dword, computed exactly in 64 bits and saturated to 32 bits.
void BX_CPP_AttrRegparmN(1) BX_CPU_C::VPMADCSSWD_VdqHdqWdqVIbR(bxInstruction_c *i)
{
  BxPackedXmmRegister op1 = BX_READ_XMM_REG(i->src1());
  BxPackedXmmRegister op2 = BX_READ_XMM_REG(i->src2());
  BxPackedXmmRegister op3 = BX_READ_XMM_REG(i->src3());

  // 16x16 -> 32-bit signed products; computed up front so the in-place
  // writes below cannot clobber unread inputs
  Bit32s product[8];

  for(unsigned n=0;n < 8;n++)
    product[n] = (Bit32s) op1.xmm16s(n) * (Bit32s) op2.xmm16s(n);

  op1.xmm32s(0) = SaturateQwordSToDwordS((Bit64s) product[0] + (Bit64s) product[1] + (Bit64s) op3.xmm32s(0));
  op1.xmm32s(1) = SaturateQwordSToDwordS((Bit64s) product[2] + (Bit64s) product[3] + (Bit64s) op3.xmm32s(1));
  op1.xmm32s(2) = SaturateQwordSToDwordS((Bit64s) product[4] + (Bit64s) product[5] + (Bit64s) op3.xmm32s(2));
  op1.xmm32s(3) = SaturateQwordSToDwordS((Bit64s) product[6] + (Bit64s) product[7] + (Bit64s) op3.xmm32s(3));

  BX_WRITE_XMM_REG_CLEAR_HIGH(i->dst(), op1);

  BX_NEXT_INSTR(i);
}
423
VPMADCSWD_VdqHdqWdqVIbR(bxInstruction_c * i)424 void BX_CPP_AttrRegparmN(1) BX_CPU_C::VPMADCSWD_VdqHdqWdqVIbR(bxInstruction_c *i)
425 {
426 BxPackedXmmRegister op1 = BX_READ_XMM_REG(i->src1());
427 BxPackedXmmRegister op2 = BX_READ_XMM_REG(i->src2());
428 BxPackedXmmRegister op3 = BX_READ_XMM_REG(i->src3());
429
430 Bit32s product[8];
431
432 for(unsigned n=0;n < 8;n++)
433 product[n] = (Bit32s) op1.xmm16s(n) * (Bit32s) op2.xmm16s(n);
434
435 op1.xmm32s(0) = product[0] + product[1] + op3.xmm32s(0);
436 op1.xmm32s(1) = product[2] + product[3] + op3.xmm32s(1);
437 op1.xmm32s(2) = product[4] + product[5] + op3.xmm32s(2);
438 op1.xmm32s(3) = product[6] + product[7] + op3.xmm32s(3);
439
440 BX_WRITE_XMM_REG_CLEAR_HIGH(i->dst(), op1);
441
442 BX_NEXT_INSTR(i);
443 }
444
VPROTB_VdqWdqIbR(bxInstruction_c * i)445 void BX_CPP_AttrRegparmN(1) BX_CPU_C::VPROTB_VdqWdqIbR(bxInstruction_c *i)
446 {
447 BxPackedXmmRegister op = BX_READ_XMM_REG(i->src());
448 int count = i->Ib();
449
450 if (count > 0) {
451 // rotate left
452 xmm_prolb(&op, count);
453 }
454 else if (count < 0) {
455 // rotate right
456 xmm_prorb(&op, -count);
457 }
458
459 BX_WRITE_XMM_REG_CLEAR_HIGH(i->dst(), op);
460
461 BX_NEXT_INSTR(i);
462 }
463
VPROTW_VdqWdqIbR(bxInstruction_c * i)464 void BX_CPP_AttrRegparmN(1) BX_CPU_C::VPROTW_VdqWdqIbR(bxInstruction_c *i)
465 {
466 BxPackedXmmRegister op = BX_READ_XMM_REG(i->src());
467 int count = i->Ib();
468
469 if (count > 0) {
470 // rotate left
471 xmm_prolw(&op, count);
472 }
473 else if (count < 0) {
474 // rotate right
475 xmm_prorw(&op, -count);
476 }
477
478 BX_WRITE_XMM_REG_CLEAR_HIGH(i->dst(), op);
479
480 BX_NEXT_INSTR(i);
481 }
482
VPROTD_VdqWdqIbR(bxInstruction_c * i)483 void BX_CPP_AttrRegparmN(1) BX_CPU_C::VPROTD_VdqWdqIbR(bxInstruction_c *i)
484 {
485 BxPackedXmmRegister op = BX_READ_XMM_REG(i->src());
486 int count = i->Ib();
487
488 if (count > 0) {
489 // rotate left
490 xmm_prold(&op, count);
491 }
492 else if (count < 0) {
493 // rotate right
494 xmm_prord(&op, -count);
495 }
496
497 BX_WRITE_XMM_REG_CLEAR_HIGH(i->dst(), op);
498
499 BX_NEXT_INSTR(i);
500 }
501
VPROTQ_VdqWdqIbR(bxInstruction_c * i)502 void BX_CPP_AttrRegparmN(1) BX_CPU_C::VPROTQ_VdqWdqIbR(bxInstruction_c *i)
503 {
504 BxPackedXmmRegister op = BX_READ_XMM_REG(i->src());
505 int count = i->Ib();
506
507 if (count > 0) {
508 // rotate left
509 xmm_prolq(&op, count);
510 }
511 else if (count < 0) {
512 // rotate right
513 xmm_prorq(&op, -count);
514 }
515
516 BX_WRITE_XMM_REG_CLEAR_HIGH(i->dst(), op);
517
518 BX_NEXT_INSTR(i);
519 }
520
VPCOMB_VdqHdqWdqIbR(bxInstruction_c * i)521 void BX_CPP_AttrRegparmN(1) BX_CPU_C::VPCOMB_VdqHdqWdqIbR(bxInstruction_c *i)
522 {
523 BxPackedXmmRegister op1 = BX_READ_XMM_REG(i->src1()), op2 = BX_READ_XMM_REG(i->src2());
524
525 xop_compare8[i->Ib() & 7](&op1, &op2);
526
527 BX_WRITE_XMM_REG_CLEAR_HIGH(i->dst(), op1);
528
529 BX_NEXT_INSTR(i);
530 }
531
VPCOMW_VdqHdqWdqIbR(bxInstruction_c * i)532 void BX_CPP_AttrRegparmN(1) BX_CPU_C::VPCOMW_VdqHdqWdqIbR(bxInstruction_c *i)
533 {
534 BxPackedXmmRegister op1 = BX_READ_XMM_REG(i->src1()), op2 = BX_READ_XMM_REG(i->src2());
535
536 xop_compare16[i->Ib() & 7](&op1, &op2);
537
538 BX_WRITE_XMM_REG_CLEAR_HIGH(i->dst(), op1);
539
540 BX_NEXT_INSTR(i);
541 }
542
VPCOMD_VdqHdqWdqIbR(bxInstruction_c * i)543 void BX_CPP_AttrRegparmN(1) BX_CPU_C::VPCOMD_VdqHdqWdqIbR(bxInstruction_c *i)
544 {
545 BxPackedXmmRegister op1 = BX_READ_XMM_REG(i->src1()), op2 = BX_READ_XMM_REG(i->src2());
546
547 xop_compare32[i->Ib() & 7](&op1, &op2);
548
549 BX_WRITE_XMM_REG_CLEAR_HIGH(i->dst(), op1);
550
551 BX_NEXT_INSTR(i);
552 }
553
VPCOMQ_VdqHdqWdqIbR(bxInstruction_c * i)554 void BX_CPP_AttrRegparmN(1) BX_CPU_C::VPCOMQ_VdqHdqWdqIbR(bxInstruction_c *i)
555 {
556 BxPackedXmmRegister op1 = BX_READ_XMM_REG(i->src1()), op2 = BX_READ_XMM_REG(i->src2());
557
558 xop_compare64[i->Ib() & 7](&op1, &op2);
559
560 BX_WRITE_XMM_REG_CLEAR_HIGH(i->dst(), op1);
561
562 BX_NEXT_INSTR(i);
563 }
564
VPCOMUB_VdqHdqWdqIbR(bxInstruction_c * i)565 void BX_CPP_AttrRegparmN(1) BX_CPU_C::VPCOMUB_VdqHdqWdqIbR(bxInstruction_c *i)
566 {
567 BxPackedXmmRegister op1 = BX_READ_XMM_REG(i->src1()), op2 = BX_READ_XMM_REG(i->src2());
568
569 xop_compare8u[i->Ib() & 7](&op1, &op2);
570
571 BX_WRITE_XMM_REG_CLEAR_HIGH(i->dst(), op1);
572
573 BX_NEXT_INSTR(i);
574 }
575
VPCOMUW_VdqHdqWdqIbR(bxInstruction_c * i)576 void BX_CPP_AttrRegparmN(1) BX_CPU_C::VPCOMUW_VdqHdqWdqIbR(bxInstruction_c *i)
577 {
578 BxPackedXmmRegister op1 = BX_READ_XMM_REG(i->src1()), op2 = BX_READ_XMM_REG(i->src2());
579
580 xop_compare16u[i->Ib() & 7](&op1, &op2);
581
582 BX_WRITE_XMM_REG_CLEAR_HIGH(i->dst(), op1);
583
584 BX_NEXT_INSTR(i);
585 }
586
VPCOMUD_VdqHdqWdqIbR(bxInstruction_c * i)587 void BX_CPP_AttrRegparmN(1) BX_CPU_C::VPCOMUD_VdqHdqWdqIbR(bxInstruction_c *i)
588 {
589 BxPackedXmmRegister op1 = BX_READ_XMM_REG(i->src1()), op2 = BX_READ_XMM_REG(i->src2());
590
591 xop_compare32u[i->Ib() & 7](&op1, &op2);
592
593 BX_WRITE_XMM_REG_CLEAR_HIGH(i->dst(), op1);
594
595 BX_NEXT_INSTR(i);
596 }
597
VPCOMUQ_VdqHdqWdqIbR(bxInstruction_c * i)598 void BX_CPP_AttrRegparmN(1) BX_CPU_C::VPCOMUQ_VdqHdqWdqIbR(bxInstruction_c *i)
599 {
600 BxPackedXmmRegister op1 = BX_READ_XMM_REG(i->src1()), op2 = BX_READ_XMM_REG(i->src2());
601
602 xop_compare64u[i->Ib() & 7](&op1, &op2);
603
604 BX_WRITE_XMM_REG_CLEAR_HIGH(i->dst(), op1);
605
606 BX_NEXT_INSTR(i);
607 }
608
// VFRCZPS: extract the fractional portion of each packed single-precision
// element (softfloat float32_frc), honoring MXCSR rounding and exception
// settings.
void BX_CPP_AttrRegparmN(1) BX_CPU_C::VFRCZPS_VpsWpsR(bxInstruction_c *i)
{
  BxPackedYmmRegister op = BX_READ_YMM_REG(i->src());
  unsigned len = i->getVL();

  // softfloat status word seeded from MXCSR (rounding mode, masks, DAZ/FTZ)
  float_status_t status = mxcsr_to_softfloat_status_word(MXCSR);

  for (unsigned n=0; n < DWORD_ELEMENTS(len); n++) {
    op.ymm32u(n) = float32_frc(op.ymm32u(n), status);
  }

  // raise/record any SIMD FP exceptions accumulated across the elements
  check_exceptionsSSE(get_exception_flags(status));
  BX_WRITE_YMM_REGZ_VLEN(i->dst(), op, len);

  BX_NEXT_INSTR(i);
}
625
// VFRCZPD: extract the fractional portion of each packed double-precision
// element (softfloat float64_frc), honoring MXCSR rounding and exception
// settings.
void BX_CPP_AttrRegparmN(1) BX_CPU_C::VFRCZPD_VpdWpdR(bxInstruction_c *i)
{
  BxPackedYmmRegister op = BX_READ_YMM_REG(i->src());
  unsigned len = i->getVL();

  // softfloat status word seeded from MXCSR (rounding mode, masks, DAZ/FTZ)
  float_status_t status = mxcsr_to_softfloat_status_word(MXCSR);

  for (unsigned n=0; n < QWORD_ELEMENTS(len); n++) {
    op.ymm64u(n) = float64_frc(op.ymm64u(n), status);
  }

  // raise/record any SIMD FP exceptions accumulated across the elements
  check_exceptionsSSE(get_exception_flags(status));

  BX_WRITE_YMM_REGZ_VLEN(i->dst(), op, len);

  BX_NEXT_INSTR(i);
}
643
// VFRCZSS: fraction-extract on a scalar single.  Bits 63:32 of the result
// are zeroed by the Bit64u zero-extension of the 32-bit value, bits 127:64
// explicitly, and the write macro clears everything above 128 bits.
void BX_CPP_AttrRegparmN(1) BX_CPU_C::VFRCZSS_VssWssR(bxInstruction_c *i)
{
  float32 op = BX_READ_XMM_REG_LO_DWORD(i->src());
  BxPackedXmmRegister r;

  // softfloat status word seeded from MXCSR (rounding mode, masks, DAZ/FTZ)
  float_status_t status = mxcsr_to_softfloat_status_word(MXCSR);

  r.xmm64u(0) = (Bit64u) float32_frc(op, status);
  r.xmm64u(1) = 0;

  // raise/record any SIMD FP exceptions from the operation
  check_exceptionsSSE(get_exception_flags(status));
  BX_WRITE_XMM_REG_CLEAR_HIGH(i->dst(), r);

  BX_NEXT_INSTR(i);
}
659
// VFRCZSD: fraction-extract on a scalar double.  The upper qword is zeroed
// explicitly; the write macro clears everything above 128 bits.
void BX_CPP_AttrRegparmN(1) BX_CPU_C::VFRCZSD_VsdWsdR(bxInstruction_c *i)
{
  float64 op = BX_READ_XMM_REG_LO_QWORD(i->src());
  BxPackedXmmRegister r;

  // softfloat status word seeded from MXCSR (rounding mode, masks, DAZ/FTZ)
  float_status_t status = mxcsr_to_softfloat_status_word(MXCSR);

  r.xmm64u(0) = float64_frc(op, status);
  r.xmm64u(1) = 0;

  // raise/record any SIMD FP exceptions from the operation
  check_exceptionsSSE(get_exception_flags(status));
  BX_WRITE_XMM_REG_CLEAR_HIGH(i->dst(), r);

  BX_NEXT_INSTR(i);
}
675
VPHADDBW_VdqWdqR(bxInstruction_c * i)676 void BX_CPP_AttrRegparmN(1) BX_CPU_C::VPHADDBW_VdqWdqR(bxInstruction_c *i)
677 {
678 BxPackedXmmRegister op = BX_READ_XMM_REG(i->src());
679
680 op.xmm16s(0) = (Bit16s) op.xmmsbyte(0x0) + (Bit16s) op.xmmsbyte(0x1);
681 op.xmm16s(1) = (Bit16s) op.xmmsbyte(0x2) + (Bit16s) op.xmmsbyte(0x3);
682 op.xmm16s(2) = (Bit16s) op.xmmsbyte(0x4) + (Bit16s) op.xmmsbyte(0x5);
683 op.xmm16s(3) = (Bit16s) op.xmmsbyte(0x6) + (Bit16s) op.xmmsbyte(0x7);
684 op.xmm16s(4) = (Bit16s) op.xmmsbyte(0x8) + (Bit16s) op.xmmsbyte(0x9);
685 op.xmm16s(5) = (Bit16s) op.xmmsbyte(0xA) + (Bit16s) op.xmmsbyte(0xB);
686 op.xmm16s(6) = (Bit16s) op.xmmsbyte(0xC) + (Bit16s) op.xmmsbyte(0xD);
687 op.xmm16s(7) = (Bit16s) op.xmmsbyte(0xE) + (Bit16s) op.xmmsbyte(0xF);
688
689 BX_WRITE_XMM_REG_CLEAR_HIGH(i->dst(), op);
690
691 BX_NEXT_INSTR(i);
692 }
693
VPHADDBD_VdqWdqR(bxInstruction_c * i)694 void BX_CPP_AttrRegparmN(1) BX_CPU_C::VPHADDBD_VdqWdqR(bxInstruction_c *i)
695 {
696 BxPackedXmmRegister op = BX_READ_XMM_REG(i->src());
697
698 op.xmm32s(0) = (Bit32s) op.xmmsbyte(0x0) + (Bit32s) op.xmmsbyte(0x1) +
699 (Bit32s) op.xmmsbyte(0x2) + (Bit32s) op.xmmsbyte(0x3);
700 op.xmm32s(1) = (Bit32s) op.xmmsbyte(0x4) + (Bit32s) op.xmmsbyte(0x5) +
701 (Bit32s) op.xmmsbyte(0x6) + (Bit32s) op.xmmsbyte(0x7);
702 op.xmm32s(2) = (Bit32s) op.xmmsbyte(0x8) + (Bit32s) op.xmmsbyte(0x9) +
703 (Bit32s) op.xmmsbyte(0xA) + (Bit32s) op.xmmsbyte(0xB);
704 op.xmm32s(3) = (Bit32s) op.xmmsbyte(0xC) + (Bit32s) op.xmmsbyte(0xD) +
705 (Bit32s) op.xmmsbyte(0xE) + (Bit32s) op.xmmsbyte(0xF);
706
707 BX_WRITE_XMM_REG_CLEAR_HIGH(i->dst(), op);
708
709 BX_NEXT_INSTR(i);
710 }
711
VPHADDBQ_VdqWdqR(bxInstruction_c * i)712 void BX_CPP_AttrRegparmN(1) BX_CPU_C::VPHADDBQ_VdqWdqR(bxInstruction_c *i)
713 {
714 BxPackedXmmRegister op = BX_READ_XMM_REG(i->src());
715
716 op.xmm64s(0) = (Bit32s) op.xmmsbyte(0x0) + (Bit32s) op.xmmsbyte(0x1) +
717 (Bit32s) op.xmmsbyte(0x2) + (Bit32s) op.xmmsbyte(0x3) +
718 (Bit32s) op.xmmsbyte(0x4) + (Bit32s) op.xmmsbyte(0x5) +
719 (Bit32s) op.xmmsbyte(0x6) + (Bit32s) op.xmmsbyte(0x7);
720 op.xmm64s(1) = (Bit32s) op.xmmsbyte(0x8) + (Bit32s) op.xmmsbyte(0x9) +
721 (Bit32s) op.xmmsbyte(0xA) + (Bit32s) op.xmmsbyte(0xB) +
722 (Bit32s) op.xmmsbyte(0xC) + (Bit32s) op.xmmsbyte(0xD) +
723 (Bit32s) op.xmmsbyte(0xE) + (Bit32s) op.xmmsbyte(0xF);
724
725 BX_WRITE_XMM_REG_CLEAR_HIGH(i->dst(), op);
726
727 BX_NEXT_INSTR(i);
728 }
729
VPHADDWD_VdqWdqR(bxInstruction_c * i)730 void BX_CPP_AttrRegparmN(1) BX_CPU_C::VPHADDWD_VdqWdqR(bxInstruction_c *i)
731 {
732 BxPackedXmmRegister op = BX_READ_XMM_REG(i->src());
733
734 op.xmm32s(0) = (Bit32s) op.xmm16s(0) + (Bit32s) op.xmm16s(1);
735 op.xmm32s(1) = (Bit32s) op.xmm16s(2) + (Bit32s) op.xmm16s(3);
736 op.xmm32s(2) = (Bit32s) op.xmm16s(4) + (Bit32s) op.xmm16s(5);
737 op.xmm32s(3) = (Bit32s) op.xmm16s(6) + (Bit32s) op.xmm16s(7);
738
739 BX_WRITE_XMM_REG_CLEAR_HIGH(i->dst(), op);
740
741 BX_NEXT_INSTR(i);
742 }
743
VPHADDWQ_VdqWdqR(bxInstruction_c * i)744 void BX_CPP_AttrRegparmN(1) BX_CPU_C::VPHADDWQ_VdqWdqR(bxInstruction_c *i)
745 {
746 BxPackedXmmRegister op = BX_READ_XMM_REG(i->src());
747
748 op.xmm64s(0) = (Bit32s) op.xmm16s(0) + (Bit32s) op.xmm16s(1) +
749 (Bit32s) op.xmm16s(2) + (Bit32s) op.xmm16s(3);
750 op.xmm64s(1) = (Bit32s) op.xmm16s(4) + (Bit32s) op.xmm16s(5) +
751 (Bit32s) op.xmm16s(6) + (Bit32s) op.xmm16s(7);
752
753 BX_WRITE_XMM_REG_CLEAR_HIGH(i->dst(), op);
754
755 BX_NEXT_INSTR(i);
756 }
757
VPHADDDQ_VdqWdqR(bxInstruction_c * i)758 void BX_CPP_AttrRegparmN(1) BX_CPU_C::VPHADDDQ_VdqWdqR(bxInstruction_c *i)
759 {
760 BxPackedXmmRegister op = BX_READ_XMM_REG(i->src());
761
762 op.xmm64s(0) = (Bit64s) op.xmm32s(0) + (Bit64s) op.xmm32s(1);
763 op.xmm64s(1) = (Bit64s) op.xmm32s(2) + (Bit64s) op.xmm32s(3);
764
765 BX_WRITE_XMM_REG_CLEAR_HIGH(i->dst(), op);
766
767 BX_NEXT_INSTR(i);
768 }
769
VPHADDUBW_VdqWdqR(bxInstruction_c * i)770 void BX_CPP_AttrRegparmN(1) BX_CPU_C::VPHADDUBW_VdqWdqR(bxInstruction_c *i)
771 {
772 BxPackedXmmRegister op = BX_READ_XMM_REG(i->src());
773
774 op.xmm16u(0) = (Bit16u) op.xmmubyte(0x0) + (Bit16u) op.xmmubyte(0x1);
775 op.xmm16u(1) = (Bit16u) op.xmmubyte(0x2) + (Bit16u) op.xmmubyte(0x3);
776 op.xmm16u(2) = (Bit16u) op.xmmubyte(0x4) + (Bit16u) op.xmmubyte(0x5);
777 op.xmm16u(3) = (Bit16u) op.xmmubyte(0x6) + (Bit16u) op.xmmubyte(0x7);
778 op.xmm16u(4) = (Bit16u) op.xmmubyte(0x8) + (Bit16u) op.xmmubyte(0x9);
779 op.xmm16u(5) = (Bit16u) op.xmmubyte(0xA) + (Bit16u) op.xmmubyte(0xB);
780 op.xmm16u(6) = (Bit16u) op.xmmubyte(0xC) + (Bit16u) op.xmmubyte(0xD);
781 op.xmm16u(7) = (Bit16u) op.xmmubyte(0xE) + (Bit16u) op.xmmubyte(0xF);
782
783 BX_WRITE_XMM_REG_CLEAR_HIGH(i->dst(), op);
784
785 BX_NEXT_INSTR(i);
786 }
787
VPHADDUBD_VdqWdqR(bxInstruction_c * i)788 void BX_CPP_AttrRegparmN(1) BX_CPU_C::VPHADDUBD_VdqWdqR(bxInstruction_c *i)
789 {
790 BxPackedXmmRegister op = BX_READ_XMM_REG(i->src());
791
792 op.xmm32u(0) = (Bit32u) op.xmmubyte(0x0) + (Bit32s) op.xmmubyte(0x1) +
793 (Bit32u) op.xmmubyte(0x2) + (Bit32s) op.xmmubyte(0x3);
794 op.xmm32u(1) = (Bit32u) op.xmmubyte(0x4) + (Bit32s) op.xmmubyte(0x5) +
795 (Bit32u) op.xmmubyte(0x6) + (Bit32s) op.xmmubyte(0x7);
796 op.xmm32u(2) = (Bit32u) op.xmmubyte(0x8) + (Bit32s) op.xmmubyte(0x9) +
797 (Bit32u) op.xmmubyte(0xA) + (Bit32s) op.xmmubyte(0xB);
798 op.xmm32u(3) = (Bit32u) op.xmmubyte(0xC) + (Bit32s) op.xmmubyte(0xD) +
799 (Bit32u) op.xmmubyte(0xE) + (Bit32s) op.xmmubyte(0xF);
800
801 BX_WRITE_XMM_REG_CLEAR_HIGH(i->dst(), op);
802
803 BX_NEXT_INSTR(i);
804 }
805
VPHADDUBQ_VdqWdqR(bxInstruction_c * i)806 void BX_CPP_AttrRegparmN(1) BX_CPU_C::VPHADDUBQ_VdqWdqR(bxInstruction_c *i)
807 {
808 BxPackedXmmRegister op = BX_READ_XMM_REG(i->src());
809
810 op.xmm64u(0) = (Bit32u) op.xmmubyte(0x0) + (Bit32u) op.xmmubyte(0x1) +
811 (Bit32u) op.xmmubyte(0x2) + (Bit32u) op.xmmubyte(0x3) +
812 (Bit32u) op.xmmubyte(0x4) + (Bit32u) op.xmmubyte(0x5) +
813 (Bit32u) op.xmmubyte(0x6) + (Bit32u) op.xmmubyte(0x7);
814 op.xmm64u(1) = (Bit32u) op.xmmubyte(0x8) + (Bit32u) op.xmmubyte(0x9) +
815 (Bit32u) op.xmmubyte(0xA) + (Bit32u) op.xmmubyte(0xB) +
816 (Bit32u) op.xmmubyte(0xC) + (Bit32u) op.xmmubyte(0xD) +
817 (Bit32u) op.xmmubyte(0xE) + (Bit32u) op.xmmubyte(0xF);
818
819 BX_WRITE_XMM_REG_CLEAR_HIGH(i->dst(), op);
820
821 BX_NEXT_INSTR(i);
822 }
823
VPHADDUWD_VdqWdqR(bxInstruction_c * i)824 void BX_CPP_AttrRegparmN(1) BX_CPU_C::VPHADDUWD_VdqWdqR(bxInstruction_c *i)
825 {
826 BxPackedXmmRegister op = BX_READ_XMM_REG(i->src());
827
828 op.xmm32u(0) = (Bit32u) op.xmm16u(0) + (Bit32u) op.xmm16u(1);
829 op.xmm32u(1) = (Bit32u) op.xmm16u(2) + (Bit32u) op.xmm16u(3);
830 op.xmm32u(2) = (Bit32u) op.xmm16u(4) + (Bit32u) op.xmm16u(5);
831 op.xmm32u(3) = (Bit32u) op.xmm16u(6) + (Bit32u) op.xmm16u(7);
832
833 BX_WRITE_XMM_REG_CLEAR_HIGH(i->dst(), op);
834
835 BX_NEXT_INSTR(i);
836 }
837
VPHADDUWQ_VdqWdqR(bxInstruction_c * i)838 void BX_CPP_AttrRegparmN(1) BX_CPU_C::VPHADDUWQ_VdqWdqR(bxInstruction_c *i)
839 {
840 BxPackedXmmRegister op = BX_READ_XMM_REG(i->src());
841
842 op.xmm64u(0) = (Bit32u) op.xmm16u(0) + (Bit32u) op.xmm16u(1) +
843 (Bit32u) op.xmm16u(2) + (Bit32u) op.xmm16u(3);
844 op.xmm64u(1) = (Bit32u) op.xmm16u(4) + (Bit32u) op.xmm16u(5) +
845 (Bit32u) op.xmm16u(6) + (Bit32u) op.xmm16u(7);
846
847 BX_WRITE_XMM_REG_CLEAR_HIGH(i->dst(), op);
848
849 BX_NEXT_INSTR(i);
850 }
851
VPHADDUDQ_VdqWdqR(bxInstruction_c * i)852 void BX_CPP_AttrRegparmN(1) BX_CPU_C::VPHADDUDQ_VdqWdqR(bxInstruction_c *i)
853 {
854 BxPackedXmmRegister op = BX_READ_XMM_REG(i->src());
855
856 op.xmm64u(0) = (Bit64u) op.xmm32u(0) + (Bit64u) op.xmm32u(1);
857 op.xmm64u(1) = (Bit64u) op.xmm32u(2) + (Bit64u) op.xmm32u(3);
858
859 BX_WRITE_XMM_REG_CLEAR_HIGH(i->dst(), op);
860
861 BX_NEXT_INSTR(i);
862 }
863
VPHSUBBW_VdqWdqR(bxInstruction_c * i)864 void BX_CPP_AttrRegparmN(1) BX_CPU_C::VPHSUBBW_VdqWdqR(bxInstruction_c *i)
865 {
866 BxPackedXmmRegister op = BX_READ_XMM_REG(i->src());
867
868 op.xmm16s(0) = (Bit16s) op.xmmsbyte(0x0) - (Bit16s) op.xmmsbyte(0x1);
869 op.xmm16s(1) = (Bit16s) op.xmmsbyte(0x2) - (Bit16s) op.xmmsbyte(0x3);
870 op.xmm16s(2) = (Bit16s) op.xmmsbyte(0x4) - (Bit16s) op.xmmsbyte(0x5);
871 op.xmm16s(3) = (Bit16s) op.xmmsbyte(0x6) - (Bit16s) op.xmmsbyte(0x7);
872 op.xmm16s(4) = (Bit16s) op.xmmsbyte(0x8) - (Bit16s) op.xmmsbyte(0x9);
873 op.xmm16s(5) = (Bit16s) op.xmmsbyte(0xA) - (Bit16s) op.xmmsbyte(0xB);
874 op.xmm16s(6) = (Bit16s) op.xmmsbyte(0xC) - (Bit16s) op.xmmsbyte(0xD);
875 op.xmm16s(7) = (Bit16s) op.xmmsbyte(0xE) - (Bit16s) op.xmmsbyte(0xF);
876
877 BX_WRITE_XMM_REG_CLEAR_HIGH(i->dst(), op);
878
879 BX_NEXT_INSTR(i);
880 }
881
VPHSUBWD_VdqWdqR(bxInstruction_c * i)882 void BX_CPP_AttrRegparmN(1) BX_CPU_C::VPHSUBWD_VdqWdqR(bxInstruction_c *i)
883 {
884 BxPackedXmmRegister op = BX_READ_XMM_REG(i->src());
885
886 op.xmm32s(0) = (Bit32s) op.xmm16s(0) - (Bit32s) op.xmm16s(1);
887 op.xmm32s(1) = (Bit32s) op.xmm16s(2) - (Bit32s) op.xmm16s(3);
888 op.xmm32s(2) = (Bit32s) op.xmm16s(4) - (Bit32s) op.xmm16s(5);
889 op.xmm32s(3) = (Bit32s) op.xmm16s(6) - (Bit32s) op.xmm16s(7);
890
891 BX_WRITE_XMM_REG_CLEAR_HIGH(i->dst(), op);
892
893 BX_NEXT_INSTR(i);
894 }
895
VPHSUBDQ_VdqWdqR(bxInstruction_c * i)896 void BX_CPP_AttrRegparmN(1) BX_CPU_C::VPHSUBDQ_VdqWdqR(bxInstruction_c *i)
897 {
898 BxPackedXmmRegister op = BX_READ_XMM_REG(i->src());
899
900 op.xmm64s(0) = (Bit64s) op.xmm32s(0) - (Bit64s) op.xmm32s(1);
901 op.xmm64s(1) = (Bit64s) op.xmm32s(2) - (Bit64s) op.xmm32s(3);
902
903 BX_WRITE_XMM_REG_CLEAR_HIGH(i->dst(), op);
904
905 BX_NEXT_INSTR(i);
906 }
907
// VPERMIL2PS: two-source single-precision permute; per-lane selection is
// done by xmm_permil2ps under control of op3.  The low 2 bits of the
// immediate are forwarded to the helper (assumed to be the m2z
// zero-match field -- see xmm_permil2ps; TODO confirm).
void BX_CPP_AttrRegparmN(1) BX_CPU_C::VPERMIL2PS_VdqHdqWdqIbR(bxInstruction_c *i)
{
  BxPackedXmmRegister op1 = BX_READ_XMM_REG(i->src1());
  BxPackedXmmRegister op2 = BX_READ_XMM_REG(i->src2());
  BxPackedXmmRegister op3 = BX_READ_XMM_REG(i->src3()), result;
  unsigned len = i->getVL();

  // start from all zeros so bytes not written by the helper stay clear
  result.clear();

  for (unsigned n=0; n < len; n++) {
    xmm_permil2ps(&result.ymm128(n), &op1.ymm128(n), &op2.ymm128(n), &op3.ymm128(n), i->Ib() & 3);
  }

  BX_WRITE_YMM_REGZ_VLEN(i->dst(), result, len);

  BX_NEXT_INSTR(i);
}
925
// VPERMIL2PD: two-source double-precision permute; per-lane selection is
// done by xmm_permil2pd under control of op3.  The low 2 bits of the
// immediate are forwarded to the helper (assumed to be the m2z
// zero-match field -- see xmm_permil2pd; TODO confirm).
void BX_CPP_AttrRegparmN(1) BX_CPU_C::VPERMIL2PD_VdqHdqWdqIbR(bxInstruction_c *i)
{
  BxPackedXmmRegister op1 = BX_READ_XMM_REG(i->src1());
  BxPackedXmmRegister op2 = BX_READ_XMM_REG(i->src2());
  BxPackedXmmRegister op3 = BX_READ_XMM_REG(i->src3()), result;
  unsigned len = i->getVL();

  // start from all zeros so bytes not written by the helper stay clear
  result.clear();

  for (unsigned n=0; n < len; n++) {
    xmm_permil2pd(&result.ymm128(n), &op1.ymm128(n), &op2.ymm128(n), &op3.ymm128(n), i->Ib() & 3);
  }

  BX_WRITE_YMM_REGZ_VLEN(i->dst(), result, len);

  BX_NEXT_INSTR(i);
}
943
944 #endif
945