1 /////////////////////////////////////////////////////////////////////////
2 // $Id: avx512_pfp.cc 13466 2018-02-16 07:57:32Z sshwarts $
3 /////////////////////////////////////////////////////////////////////////
4 //
5 //   Copyright (c) 2013-2018 Stanislav Shwartsman
6 //          Written by Stanislav Shwartsman [sshwarts at sourceforge net]
7 //
8 //  This library is free software; you can redistribute it and/or
9 //  modify it under the terms of the GNU Lesser General Public
10 //  License as published by the Free Software Foundation; either
11 //  version 2 of the License, or (at your option) any later version.
12 //
13 //  This library is distributed in the hope that it will be useful,
14 //  but WITHOUT ANY WARRANTY; without even the implied warranty of
15 //  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
16 //  Lesser General Public License for more details.
17 //
18 //  You should have received a copy of the GNU Lesser General Public
19 //  License along with this library; if not, write to the Free Software
//  Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
21 //
22 /////////////////////////////////////////////////////////////////////////
23 
24 #define NEED_CPU_REG_SHORTCUTS 1
25 #include "bochs.h"
26 #include "cpu.h"
27 #define LOG_THIS BX_CPU_THIS_PTR
28 
29 #if BX_SUPPORT_EVEX
30 
31 extern float_status_t mxcsr_to_softfloat_status_word(bx_mxcsr_t mxcsr);
32 
33 #include "fpu/softfloat-compare.h"
34 #include "simd_int.h"
35 #include "simd_pfp.h"
36 
// EVEX_OP_PACKED_SINGLE(HANDLER, func)
//
// Expands into a masked packed single-precision EVEX handler.  'func'
// processes one 128-bit lane (four float32 elements) and receives the low
// 4 bits of the shifted opmask for that lane.  After all lanes are
// processed, accumulated softfloat exception flags are checked against
// MXCSR.  Write-back:
//   - merge masking: each lane is blended element-wise into dst under the
//     opmask (xmm_blendps) and the register is zeroed above VL;
//   - zero masking: op1 is written directly (the *_mask primitive
//     presumably zeroes masked-off elements -- see simd_pfp.h).
#define EVEX_OP_PACKED_SINGLE(HANDLER, func)                                                \
  void BX_CPP_AttrRegparmN(1) BX_CPU_C:: HANDLER (bxInstruction_c *i)                       \
  {                                                                                         \
    BxPackedAvxRegister op1 = BX_READ_AVX_REG(i->src1()), op2 = BX_READ_AVX_REG(i->src2()); \
    unsigned mask = BX_READ_16BIT_OPMASK(i->opmask());                                      \
    unsigned len = i->getVL();                                                              \
                                                                                            \
    float_status_t status = mxcsr_to_softfloat_status_word(MXCSR);                          \
    softfloat_status_word_rc_override(status, i);                                           \
                                                                                            \
    for (unsigned n=0, tmp_mask = mask; n < len; n++, tmp_mask >>= 4)                       \
      (func)(&op1.vmm128(n), &op2.vmm128(n), status, tmp_mask);                             \
                                                                                            \
    check_exceptionsSSE(get_exception_flags(status));                                       \
                                                                                            \
    if (! i->isZeroMasking()) {                                                             \
      for (unsigned n=0; n < len; n++, mask >>= 4)                                          \
        xmm_blendps(&BX_READ_AVX_REG_LANE(i->dst(), n), &op1.vmm128(n), mask);              \
                                                                                            \
      BX_CLEAR_AVX_REGZ(i->dst(), len);                                                     \
    }                                                                                       \
    else {                                                                                  \
      BX_WRITE_AVX_REGZ(i->dst(), op1, len);                                                \
    }                                                                                       \
                                                                                            \
    BX_NEXT_INSTR(i);                                                                       \
  }
64 
// Instantiate the masked packed single-precision arithmetic handlers.
EVEX_OP_PACKED_SINGLE(VADDPS_MASK_VpsHpsWpsR, xmm_addps_mask)
EVEX_OP_PACKED_SINGLE(VSUBPS_MASK_VpsHpsWpsR, xmm_subps_mask)
EVEX_OP_PACKED_SINGLE(VMULPS_MASK_VpsHpsWpsR, xmm_mulps_mask)
EVEX_OP_PACKED_SINGLE(VDIVPS_MASK_VpsHpsWpsR, xmm_divps_mask)
EVEX_OP_PACKED_SINGLE(VMAXPS_MASK_VpsHpsWpsR, xmm_maxps_mask)
EVEX_OP_PACKED_SINGLE(VMINPS_MASK_VpsHpsWpsR, xmm_minps_mask)
EVEX_OP_PACKED_SINGLE(VSCALEFPS_MASK_VpsHpsWpsR, xmm_scalefps_mask)
72 
// EVEX_OP_PACKED_DOUBLE(HANDLER, func)
//
// Same shape as EVEX_OP_PACKED_SINGLE, but for double precision: each
// 128-bit lane holds two float64 elements, so the opmask is read 8 bits
// wide and shifted by 2 per lane, and merge masking blends with
// xmm_blendpd.
#define EVEX_OP_PACKED_DOUBLE(HANDLER, func)                                                \
  void BX_CPP_AttrRegparmN(1) BX_CPU_C:: HANDLER (bxInstruction_c *i)                       \
  {                                                                                         \
    BxPackedAvxRegister op1 = BX_READ_AVX_REG(i->src1()), op2 = BX_READ_AVX_REG(i->src2()); \
    unsigned mask = BX_READ_8BIT_OPMASK(i->opmask());                                       \
    unsigned len = i->getVL();                                                              \
                                                                                            \
    float_status_t status = mxcsr_to_softfloat_status_word(MXCSR);                          \
    softfloat_status_word_rc_override(status, i);                                           \
                                                                                            \
    for (unsigned n=0, tmp_mask = mask; n < len; n++, tmp_mask >>= 2)                       \
      (func)(&op1.vmm128(n), &op2.vmm128(n), status, tmp_mask);                             \
                                                                                            \
    check_exceptionsSSE(get_exception_flags(status));                                       \
                                                                                            \
    if (! i->isZeroMasking()) {                                                             \
      for (unsigned n=0; n < len; n++, mask >>= 2)                                          \
        xmm_blendpd(&BX_READ_AVX_REG_LANE(i->dst(), n), &op1.vmm128(n), mask);              \
                                                                                            \
      BX_CLEAR_AVX_REGZ(i->dst(), len);                                                     \
    }                                                                                       \
    else {                                                                                  \
      BX_WRITE_AVX_REGZ(i->dst(), op1, len);                                                \
    }                                                                                       \
                                                                                            \
    BX_NEXT_INSTR(i);                                                                       \
  }
100 
// Instantiate the masked packed double-precision arithmetic handlers.
EVEX_OP_PACKED_DOUBLE(VADDPD_MASK_VpdHpdWpdR, xmm_addpd_mask)
EVEX_OP_PACKED_DOUBLE(VSUBPD_MASK_VpdHpdWpdR, xmm_subpd_mask)
EVEX_OP_PACKED_DOUBLE(VMULPD_MASK_VpdHpdWpdR, xmm_mulpd_mask)
EVEX_OP_PACKED_DOUBLE(VDIVPD_MASK_VpdHpdWpdR, xmm_divpd_mask)
EVEX_OP_PACKED_DOUBLE(VMAXPD_MASK_VpdHpdWpdR, xmm_maxpd_mask)
EVEX_OP_PACKED_DOUBLE(VMINPD_MASK_VpdHpdWpdR, xmm_minpd_mask)
EVEX_OP_PACKED_DOUBLE(VSCALEFPD_MASK_VpdHpdWpdR, xmm_scalefpd_mask)
108 
// EVEX_OP_SCALAR_SINGLE(HANDLER, func)
//
// Expands into a masked scalar single-precision EVEX handler.  When bit 0
// of the opmask allows the element, 'func' combines the low float32 of
// src1 and src2 and MXCSR exceptions are checked; otherwise the low
// element is zeroed (zero masking) or taken from dst (merge masking).
// Bits [127:32] of the result come from src1; bits above 127 are cleared.
#define EVEX_OP_SCALAR_SINGLE(HANDLER, func)                                                \
  void BX_CPP_AttrRegparmN(1) BX_CPU_C:: HANDLER (bxInstruction_c *i)                       \
  {                                                                                         \
    BxPackedXmmRegister op1 = BX_READ_XMM_REG(i->src1());                                   \
                                                                                            \
    if (BX_SCALAR_ELEMENT_MASK(i->opmask())) {                                              \
      float32 op2 = BX_READ_XMM_REG_LO_DWORD(i->src2());                                    \
                                                                                            \
      float_status_t status = mxcsr_to_softfloat_status_word(MXCSR);                        \
      softfloat_status_word_rc_override(status, i);                                         \
      op1.xmm32u(0) = (func)(op1.xmm32u(0), op2, status);                                   \
      check_exceptionsSSE(get_exception_flags(status));                                     \
    }                                                                                       \
    else {                                                                                  \
      if (i->isZeroMasking())                                                               \
        op1.xmm32u(0) = 0;                                                                  \
      else                                                                                  \
        op1.xmm32u(0) = BX_READ_XMM_REG_LO_DWORD(i->dst());                                 \
    }                                                                                       \
                                                                                            \
    BX_WRITE_XMM_REG_CLEAR_HIGH(i->dst(), op1);                                             \
    BX_NEXT_INSTR(i);                                                                       \
  }
132 
// Instantiate the masked scalar single-precision arithmetic handlers.
EVEX_OP_SCALAR_SINGLE(VADDSS_MASK_VssHpsWssR, float32_add)
EVEX_OP_SCALAR_SINGLE(VSUBSS_MASK_VssHpsWssR, float32_sub)
EVEX_OP_SCALAR_SINGLE(VMULSS_MASK_VssHpsWssR, float32_mul)
EVEX_OP_SCALAR_SINGLE(VDIVSS_MASK_VssHpsWssR, float32_div)
EVEX_OP_SCALAR_SINGLE(VMINSS_MASK_VssHpsWssR, float32_min)
EVEX_OP_SCALAR_SINGLE(VMAXSS_MASK_VssHpsWssR, float32_max)
EVEX_OP_SCALAR_SINGLE(VSCALEFSS_MASK_VssHpsWssR, float32_scalef)
140 
// EVEX_OP_SCALAR_DOUBLE(HANDLER, func)
//
// Double-precision counterpart of EVEX_OP_SCALAR_SINGLE: operates on the
// low float64 of src1/src2, with the same zero/merge masking rules.
// Bits [127:64] of the result come from src1; bits above 127 are cleared.
#define EVEX_OP_SCALAR_DOUBLE(HANDLER, func)                                                \
  void BX_CPP_AttrRegparmN(1) BX_CPU_C:: HANDLER (bxInstruction_c *i)                       \
  {                                                                                         \
    BxPackedXmmRegister op1 = BX_READ_XMM_REG(i->src1());                                   \
                                                                                            \
    if (BX_SCALAR_ELEMENT_MASK(i->opmask())) {                                              \
      float64 op2 = BX_READ_XMM_REG_LO_QWORD(i->src2());                                    \
                                                                                            \
      float_status_t status = mxcsr_to_softfloat_status_word(MXCSR);                        \
      softfloat_status_word_rc_override(status, i);                                         \
      op1.xmm64u(0) = (func)(op1.xmm64u(0), op2, status);                                   \
      check_exceptionsSSE(get_exception_flags(status));                                     \
    }                                                                                       \
    else {                                                                                  \
      if (i->isZeroMasking())                                                               \
        op1.xmm64u(0) = 0;                                                                  \
      else                                                                                  \
        op1.xmm64u(0) = BX_READ_XMM_REG_LO_QWORD(i->dst());                                 \
    }                                                                                       \
                                                                                            \
    BX_WRITE_XMM_REG_CLEAR_HIGH(i->dst(), op1);                                             \
    BX_NEXT_INSTR(i);                                                                       \
  }
164 
// Instantiate the masked scalar double-precision arithmetic handlers.
EVEX_OP_SCALAR_DOUBLE(VADDSD_MASK_VsdHpdWsdR, float64_add)
EVEX_OP_SCALAR_DOUBLE(VSUBSD_MASK_VsdHpdWsdR, float64_sub)
EVEX_OP_SCALAR_DOUBLE(VMULSD_MASK_VsdHpdWsdR, float64_mul)
EVEX_OP_SCALAR_DOUBLE(VDIVSD_MASK_VsdHpdWsdR, float64_div)
EVEX_OP_SCALAR_DOUBLE(VMINSD_MASK_VsdHpdWsdR, float64_min)
EVEX_OP_SCALAR_DOUBLE(VMAXSD_MASK_VsdHpdWsdR, float64_max)
EVEX_OP_SCALAR_DOUBLE(VSCALEFSD_MASK_VsdHpdWsdR, float64_scalef)
172 
173 void BX_CPP_AttrRegparmN(1) BX_CPU_C::VSQRTPS_MASK_VpsWpsR(bxInstruction_c *i)
174 {
175   BxPackedAvxRegister op = BX_READ_AVX_REG(i->src());
176   unsigned mask = BX_READ_16BIT_OPMASK(i->opmask());
177   unsigned len = i->getVL();
178 
179   float_status_t status = mxcsr_to_softfloat_status_word(MXCSR);
180   softfloat_status_word_rc_override(status, i);
181 
182   for (unsigned n=0, tmp_mask = mask; n < len; n++, tmp_mask >>= 4)
183     xmm_sqrtps_mask(&op.vmm128(n), status, tmp_mask);
184 
185   check_exceptionsSSE(get_exception_flags(status));
186 
187   if (! i->isZeroMasking()) {
188     for (unsigned n=0; n < len; n++, mask >>= 4)
189       xmm_blendps(&BX_READ_AVX_REG_LANE(i->dst(), n), &op.vmm128(n), mask);
190     BX_CLEAR_AVX_REGZ(i->dst(), len);
191   }
192   else {
193     BX_WRITE_AVX_REGZ(i->dst(), op, len);
194   }
195 
196   BX_NEXT_INSTR(i);
197 }
198 
VSQRTPD_MASK_VpdWpdR(bxInstruction_c * i)199 void BX_CPP_AttrRegparmN(1) BX_CPU_C::VSQRTPD_MASK_VpdWpdR(bxInstruction_c *i)
200 {
201   BxPackedAvxRegister op = BX_READ_AVX_REG(i->src());
202   unsigned mask = BX_READ_8BIT_OPMASK(i->opmask());
203   unsigned len = i->getVL();
204 
205   float_status_t status = mxcsr_to_softfloat_status_word(MXCSR);
206   softfloat_status_word_rc_override(status, i);
207 
208   for (unsigned n=0, tmp_mask = mask; n < len; n++, tmp_mask >>= 2)
209     xmm_sqrtpd_mask(&op.vmm128(n), status, tmp_mask);
210 
211   check_exceptionsSSE(get_exception_flags(status));
212 
213   if (! i->isZeroMasking()) {
214     for (unsigned n=0; n < len; n++, mask >>= 2)
215       xmm_blendpd(&BX_READ_AVX_REG_LANE(i->dst(), n), &op.vmm128(n), mask);
216     BX_CLEAR_AVX_REGZ(i->dst(), len);
217   }
218   else {
219     BX_WRITE_AVX_REGZ(i->dst(), op, len);
220   }
221 
222   BX_NEXT_INSTR(i);
223 }
224 
VSQRTSS_MASK_VssHpsWssR(bxInstruction_c * i)225 void BX_CPP_AttrRegparmN(1) BX_CPU_C::VSQRTSS_MASK_VssHpsWssR(bxInstruction_c *i)
226 {
227   BxPackedXmmRegister op1 = BX_READ_XMM_REG(i->src1());
228 
229   if (BX_SCALAR_ELEMENT_MASK(i->opmask())) {
230     float32 op2 = BX_READ_XMM_REG_LO_DWORD(i->src2());
231 
232     float_status_t status = mxcsr_to_softfloat_status_word(MXCSR);
233     softfloat_status_word_rc_override(status, i);
234     op1.xmm32u(0) = float32_sqrt(op2, status);
235     check_exceptionsSSE(get_exception_flags(status));
236   }
237   else {
238     if (i->isZeroMasking())
239       op1.xmm32u(0) = 0;
240     else
241       op1.xmm32u(0) = BX_READ_XMM_REG_LO_DWORD(i->dst());
242   }
243 
244   BX_WRITE_XMM_REG_CLEAR_HIGH(i->dst(), op1);
245   BX_NEXT_INSTR(i);
246 }
247 
VSQRTSD_MASK_VsdHpdWsdR(bxInstruction_c * i)248 void BX_CPP_AttrRegparmN(1) BX_CPU_C::VSQRTSD_MASK_VsdHpdWsdR(bxInstruction_c *i)
249 {
250   BxPackedXmmRegister op1 = BX_READ_XMM_REG(i->src1());
251 
252   if (BX_SCALAR_ELEMENT_MASK(i->opmask())) {
253     float64 op2 = BX_READ_XMM_REG_LO_QWORD(i->src2());
254 
255     float_status_t status = mxcsr_to_softfloat_status_word(MXCSR);
256     softfloat_status_word_rc_override(status, i);
257     op1.xmm64u(0) = float64_sqrt(op2, status);
258     check_exceptionsSSE(get_exception_flags(status));
259   }
260   else {
261     if (i->isZeroMasking())
262       op1.xmm64u(0) = 0;
263     else
264       op1.xmm64u(0) = BX_READ_XMM_REG_LO_QWORD(i->dst());
265   }
266 
267   BX_WRITE_XMM_REG_CLEAR_HIGH(i->dst(), op1);
268   BX_NEXT_INSTR(i);
269 }
270 
271 // compare
272 
273 extern float32_compare_method avx_compare32[32];
274 extern float64_compare_method avx_compare64[32];
275 
VCMPPS_MASK_KGwHpsWpsIbR(bxInstruction_c * i)276 void BX_CPP_AttrRegparmN(1) BX_CPU_C::VCMPPS_MASK_KGwHpsWpsIbR(bxInstruction_c *i)
277 {
278   BxPackedAvxRegister op1 = BX_READ_AVX_REG(i->src1()), op2 = BX_READ_AVX_REG(i->src2());
279   unsigned num_elements = DWORD_ELEMENTS(i->getVL());
280 
281   Bit32u opmask = i->opmask() ? BX_READ_16BIT_OPMASK(i->opmask()) : (Bit32u) -1;
282   Bit32u result = 0;
283 
284   float_status_t status = mxcsr_to_softfloat_status_word(MXCSR);
285   softfloat_status_word_rc_override(status, i);
286   int ib = i->Ib() & 0x1F;
287 
288   for (unsigned n=0, mask = 0x1; n < num_elements; n++, mask <<= 1) {
289     if (opmask & mask) {
290       if (avx_compare32[ib](op1.vmm32u(n), op2.vmm32u(n), status)) result |= mask;
291     }
292   }
293 
294   check_exceptionsSSE(get_exception_flags(status));
295   BX_WRITE_OPMASK(i->dst(), result);
296 
297   BX_NEXT_INSTR(i);
298 }
299 
VCMPPD_MASK_KGbHpdWpdIbR(bxInstruction_c * i)300 void BX_CPP_AttrRegparmN(1) BX_CPU_C::VCMPPD_MASK_KGbHpdWpdIbR(bxInstruction_c *i)
301 {
302   BxPackedAvxRegister op1 = BX_READ_AVX_REG(i->src1()), op2 = BX_READ_AVX_REG(i->src2());
303   unsigned num_elements = QWORD_ELEMENTS(i->getVL());
304 
305   Bit32u opmask = i->opmask() ? BX_READ_8BIT_OPMASK(i->opmask()) : (Bit32u) -1;
306   Bit32u result = 0;
307 
308   float_status_t status = mxcsr_to_softfloat_status_word(MXCSR);
309   softfloat_status_word_rc_override(status, i);
310   int ib = i->Ib() & 0x1F;
311 
312   for (unsigned n=0, mask = 0x1; n < num_elements; n++, mask <<= 1) {
313     if (opmask & mask) {
314       if (avx_compare64[ib](op1.vmm64u(n), op2.vmm64u(n), status)) result |= mask;
315     }
316   }
317 
318   check_exceptionsSSE(get_exception_flags(status));
319   BX_WRITE_OPMASK(i->dst(), result);
320 
321   BX_NEXT_INSTR(i);
322 }
323 
VCMPSD_MASK_KGbHsdWsdIbR(bxInstruction_c * i)324 void BX_CPP_AttrRegparmN(1) BX_CPU_C::VCMPSD_MASK_KGbHsdWsdIbR(bxInstruction_c *i)
325 {
326   Bit32u result = 0;
327 
328   if (! i->opmask() || BX_SCALAR_ELEMENT_MASK(i->opmask())) {
329     float64 op1 = BX_READ_XMM_REG_LO_QWORD(i->src1());
330     float64 op2 = BX_READ_XMM_REG_LO_QWORD(i->src2());
331 
332     float_status_t status = mxcsr_to_softfloat_status_word(MXCSR);
333     softfloat_status_word_rc_override(status, i);
334     if (avx_compare64[i->Ib() & 0x1F](op1, op2, status)) result = 1;
335     check_exceptionsSSE(get_exception_flags(status));
336   }
337 
338   BX_WRITE_OPMASK(i->dst(), result);
339   BX_NEXT_INSTR(i);
340 }
341 
VCMPSS_MASK_KGbHssWssIbR(bxInstruction_c * i)342 void BX_CPP_AttrRegparmN(1) BX_CPU_C::VCMPSS_MASK_KGbHssWssIbR(bxInstruction_c *i)
343 {
344   Bit32u result = 0;
345 
346   if (! i->opmask() || BX_SCALAR_ELEMENT_MASK(i->opmask())) {
347     float32 op1 = BX_READ_XMM_REG_LO_DWORD(i->src1());
348     float32 op2 = BX_READ_XMM_REG_LO_DWORD(i->src2());
349 
350     float_status_t status = mxcsr_to_softfloat_status_word(MXCSR);
351     softfloat_status_word_rc_override(status, i);
352     if (avx_compare32[i->Ib() & 0x1F](op1, op2, status)) result = 1;
353     check_exceptionsSSE(get_exception_flags(status));
354   }
355 
356   BX_WRITE_OPMASK(i->dst(), result);
357   BX_NEXT_INSTR(i);
358 }
359 
360 // fixup
361 
// Classification tokens for VFIXUPIMM: the token number selects the 4-bit
// response field inside the 32-bit op2 table (bits [4*token+3 : 4*token]).
enum {
  BX_FIXUPIMM_QNAN_TOKEN = 0,
  BX_FIXUPIMM_SNAN_TOKEN = 1,
  BX_FIXUPIMM_ZERO_VALUE_TOKEN = 2,
  BX_FIXUPIMM_POS_ONE_VALUE_TOKEN = 3,
  BX_FIXUPIMM_NEG_INF_TOKEN = 4,
  BX_FIXUPIMM_POS_INF_TOKEN = 5,
  BX_FIXUPIMM_NEG_VALUE_TOKEN = 6,
  BX_FIXUPIMM_POS_VALUE_TOKEN = 7
};
372 
373 #include "fpu/softfloat-specialize.h"
374 
// Constants returned by some VFIXUPIMM response-table entries,
// given as raw IEEE-754 encodings.
const float32 float32_value_90      = 0x42b40000; // 90.0f
const float32 float32_pi_half       = 0x3fc90fdb; // pi/2 (rounded to float32)
const float32 float32_positive_half = 0x3f000000; // 0.5f

const float64 float64_value_90      = BX_CONST64(0x4056800000000000); // 90.0
const float64 float64_pi_half       = BX_CONST64(0x3ff921fb54442d18); // pi/2 (rounded to float64)
const float64 float64_positive_half = BX_CONST64(0x3fe0000000000000); // 0.5
382 
// VFIXUPIMM helper (single precision): classify op1, optionally raise the
// invalid / divide-by-zero flags selected by imm8, then produce the value
// chosen by the 4-bit response field of the op2 table that corresponds to
// the detected token.
//   dst    - current destination element (returned when the response is 0)
//   op1    - source element being fixed up
//   op2    - 32-bit response table, 4 bits per token
//   imm8   - fault enable mask for #IE / #ZE reporting
//   status - softfloat status word; DAZ is taken from it and flags are
//            accumulated into it
float32 float32_fixupimm(float32 dst, float32 op1, Bit32u op2, unsigned imm8, float_status_t &status)
{
  // apply denormals-are-zeros before classification
  float32 tmp_op1 = op1;
  if (get_denormals_are_zeros(status))
    tmp_op1 = float32_denormal_to_zero(op1);

  float_class_t op1_class = float32_class(tmp_op1);
  int sign = float32_sign(tmp_op1);
  unsigned token = 0, ie_fault_mask = 0, divz_fault_mask = 0;

  // map the classification to a token and to the imm8 bits that would
  // enable fault reporting for this class
  switch(op1_class)
  {
    case float_zero:
      token = BX_FIXUPIMM_ZERO_VALUE_TOKEN;
      divz_fault_mask = 0x01;
        ie_fault_mask = 0x02;
      break;

    case float_negative_inf:
      token = BX_FIXUPIMM_NEG_INF_TOKEN;
      ie_fault_mask = 0x20;
      break;

    case float_positive_inf:
      token = BX_FIXUPIMM_POS_INF_TOKEN;
      ie_fault_mask = 0x80;
      break;

    case float_SNaN:
      token = BX_FIXUPIMM_SNAN_TOKEN;
      ie_fault_mask = 0x10;
      break;

    case float_QNaN:
      token = BX_FIXUPIMM_QNAN_TOKEN;
      break;

    case float_denormal:
    case float_normalized:
      if (tmp_op1 == float32_positive_one) {
        token = BX_FIXUPIMM_POS_ONE_VALUE_TOKEN;
        divz_fault_mask = 0x04;
          ie_fault_mask = 0x08;
      }
      else {
        if (sign) {
          token = BX_FIXUPIMM_NEG_VALUE_TOKEN;
          ie_fault_mask = 0x40;
        }
        else {
          token = BX_FIXUPIMM_POS_VALUE_TOKEN;
        }
      }
      break;

    default:
        break;
  }

  if (imm8 & ie_fault_mask)
    float_raise(status, float_flag_invalid);

  if (imm8 & divz_fault_mask)
    float_raise(status, float_flag_divbyzero);

  // access response table, each response is encoded with 4-bit value in the op2
  unsigned token_response = (op2 >> (token*4)) & 0xf;

  switch(token_response) {
  case 0x1: // apply DAZ to the op1 value
    op1 = tmp_op1;
    break;
  case 0x2: op1 = convert_to_QNaN(tmp_op1); break;
  case 0x3: op1 = float32_default_nan; break;
  case 0x4: op1 = float32_negative_inf; break;
  case 0x5: op1 = float32_positive_inf; break;
  case 0x6:
    op1 = sign ? float32_negative_inf : float32_positive_inf;
    break;
  case 0x7: op1 = float32_negative_zero; break;
  case 0x8: op1 = float32_positive_zero; break;
  case 0x9: op1 = float32_negative_one; break;
  case 0xA: op1 = float32_positive_one; break;
  case 0xB: op1 = float32_positive_half; break;
  case 0xC: op1 = float32_value_90; break;
  case 0xD: op1 = float32_pi_half; break;
  case 0xE: op1 = float32_max_float; break;
  case 0xF: op1 = float32_min_float; break;
  default: // response 0: preserve the destination value
    op1 = dst; break;
  }

  return op1;
}
477 
// VFIXUPIMM helper (double precision): same token/response scheme as
// float32_fixupimm.  Note op2 is still a 32-bit response table (the low
// dword of the src2 element).
//   dst    - current destination element (returned when the response is 0)
//   op1    - source element being fixed up
//   op2    - 32-bit response table, 4 bits per token
//   imm8   - fault enable mask for #IE / #ZE reporting
//   status - softfloat status word; DAZ is taken from it and flags are
//            accumulated into it
float64 float64_fixupimm(float64 dst, float64 op1, Bit32u op2, unsigned imm8, float_status_t &status)
{
  // apply denormals-are-zeros before classification
  float64 tmp_op1 = op1;
  if (get_denormals_are_zeros(status))
    tmp_op1 = float64_denormal_to_zero(op1);

  float_class_t op1_class = float64_class(tmp_op1);
  int sign = float64_sign(tmp_op1);
  unsigned token = 0, ie_fault_mask = 0, divz_fault_mask = 0;

  // map the classification to a token and to the imm8 bits that would
  // enable fault reporting for this class
  switch(op1_class)
  {
    case float_zero:
      token = BX_FIXUPIMM_ZERO_VALUE_TOKEN;
      divz_fault_mask = 0x01;
        ie_fault_mask = 0x02;
      break;

    case float_negative_inf:
      token = BX_FIXUPIMM_NEG_INF_TOKEN;
      ie_fault_mask = 0x20;
      break;

    case float_positive_inf:
      token = BX_FIXUPIMM_POS_INF_TOKEN;
      ie_fault_mask = 0x80;
      break;

    case float_SNaN:
      token = BX_FIXUPIMM_SNAN_TOKEN;
      ie_fault_mask = 0x10;
      break;

    case float_QNaN:
      token = BX_FIXUPIMM_QNAN_TOKEN;
      break;

    case float_denormal:
    case float_normalized:
      if (tmp_op1 == float64_positive_one) {
        token = BX_FIXUPIMM_POS_ONE_VALUE_TOKEN;
        divz_fault_mask = 0x04;
          ie_fault_mask = 0x08;
      }
      else {
        if (sign) {
          token = BX_FIXUPIMM_NEG_VALUE_TOKEN;
          ie_fault_mask = 0x40;
        }
        else {
          token = BX_FIXUPIMM_POS_VALUE_TOKEN;
        }
      }
      break;

    default:
        break;
  }

  if (imm8 & ie_fault_mask)
    float_raise(status, float_flag_invalid);

  if (imm8 & divz_fault_mask)
    float_raise(status, float_flag_divbyzero);

  // access response table, each response is encoded with 4-bit value in the op2
  unsigned token_response = (op2 >> (token*4)) & 0xf;

  switch(token_response) {
  case 0x1: // apply DAZ to the op1 value
    op1 = tmp_op1;
    break;
  case 0x2: op1 = convert_to_QNaN(tmp_op1); break;
  case 0x3: op1 = float64_default_nan; break;
  case 0x4: op1 = float64_negative_inf; break;
  case 0x5: op1 = float64_positive_inf; break;
  case 0x6:
    op1 = sign ? float64_negative_inf : float64_positive_inf;
    break;
  case 0x7: op1 = float64_negative_zero; break;
  case 0x8: op1 = float64_positive_zero; break;
  case 0x9: op1 = float64_negative_one; break;
  case 0xA: op1 = float64_positive_one; break;
  case 0xB: op1 = float64_positive_half; break;
  case 0xC: op1 = float64_value_90; break;
  case 0xD: op1 = float64_pi_half; break;
  case 0xE: op1 = float64_max_float; break;
  case 0xF: op1 = float64_min_float; break;
  default: // response 0: preserve the destination value
    op1 = dst; break;
  }

  return op1;
}
572 
VFIXUPIMMSS_MASK_VssHssWssIbR(bxInstruction_c * i)573 void BX_CPP_AttrRegparmN(1) BX_CPU_C::VFIXUPIMMSS_MASK_VssHssWssIbR(bxInstruction_c *i)
574 {
575   BxPackedXmmRegister op1 = BX_READ_XMM_REG(i->src1());
576   Bit32u op_dst = BX_READ_XMM_REG_LO_DWORD(i->dst());
577 
578   if (i->opmask() == 0 || BX_SCALAR_ELEMENT_MASK(i->opmask())) {
579     Bit32u op2 = BX_READ_XMM_REG_LO_DWORD(i->src2());
580 
581     float_status_t status = mxcsr_to_softfloat_status_word(MXCSR);
582     softfloat_status_word_rc_override(status, i);
583     op1.xmm32u(0) = float32_fixupimm(op_dst, op1.xmm32u(0), op2, i->Ib(), status);
584     check_exceptionsSSE(get_exception_flags(status));
585   }
586   else {
587     if (i->isZeroMasking())
588       op1.xmm32u(0) = 0;
589     else
590       op1.xmm32u(0) = op_dst;
591   }
592 
593   BX_WRITE_XMM_REG_CLEAR_HIGH(i->dst(), op1);
594   BX_NEXT_INSTR(i);
595 }
596 
VFIXUPIMMSD_MASK_VsdHsdWsdIbR(bxInstruction_c * i)597 void BX_CPP_AttrRegparmN(1) BX_CPU_C::VFIXUPIMMSD_MASK_VsdHsdWsdIbR(bxInstruction_c *i)
598 {
599   BxPackedXmmRegister op1 = BX_READ_XMM_REG(i->src1());
600   Bit64u op_dst = BX_READ_XMM_REG_LO_QWORD(i->dst());
601 
602   if (i->opmask() == 0 || BX_SCALAR_ELEMENT_MASK(i->opmask())) {
603     Bit32u op2 = (Bit32u) BX_READ_XMM_REG_LO_QWORD(i->src2());
604 
605     float_status_t status = mxcsr_to_softfloat_status_word(MXCSR);
606     softfloat_status_word_rc_override(status, i);
607     op1.xmm64u(0) = float64_fixupimm(op_dst, op1.xmm64u(0), op2, i->Ib(), status);
608     check_exceptionsSSE(get_exception_flags(status));
609   }
610   else {
611     if (i->isZeroMasking())
612       op1.xmm64u(0) = 0;
613     else
614       op1.xmm64u(0) = op_dst;
615   }
616 
617   BX_WRITE_XMM_REG_CLEAR_HIGH(i->dst(), op1);
618   BX_NEXT_INSTR(i);
619 }
620 
VFIXUPIMMPS_VpsHpsWpsIbR(bxInstruction_c * i)621 void BX_CPP_AttrRegparmN(1) BX_CPU_C::VFIXUPIMMPS_VpsHpsWpsIbR(bxInstruction_c *i)
622 {
623   BxPackedAvxRegister op1 = BX_READ_AVX_REG(i->src1()), op2 = BX_READ_AVX_REG(i->src2()), dst = BX_READ_AVX_REG(i->dst());
624   unsigned len = i->getVL();
625 
626   float_status_t status = mxcsr_to_softfloat_status_word(MXCSR);
627   softfloat_status_word_rc_override(status, i);
628 
629   for (unsigned n=0; n < DWORD_ELEMENTS(len); n++) {
630     op1.vmm32u(n) = float32_fixupimm(dst.vmm32u(n), op1.vmm32u(n), op2.vmm32u(n), i->Ib(), status);
631   }
632 
633   check_exceptionsSSE(get_exception_flags(status));
634 
635   BX_WRITE_AVX_REGZ(i->dst(), op1, len);
636   BX_NEXT_INSTR(i);
637 }
638 
VFIXUPIMMPS_MASK_VpsHpsWpsIbR(bxInstruction_c * i)639 void BX_CPP_AttrRegparmN(1) BX_CPU_C::VFIXUPIMMPS_MASK_VpsHpsWpsIbR(bxInstruction_c *i)
640 {
641   BxPackedAvxRegister op1 = BX_READ_AVX_REG(i->src1()), op2 = BX_READ_AVX_REG(i->src2()), dst = BX_READ_AVX_REG(i->dst());
642   Bit32u mask = BX_READ_16BIT_OPMASK(i->opmask());
643   unsigned len = i->getVL();
644 
645   float_status_t status = mxcsr_to_softfloat_status_word(MXCSR);
646   softfloat_status_word_rc_override(status, i);
647 
648   for (unsigned n=0, tmp_mask = mask; n < DWORD_ELEMENTS(len); n++, tmp_mask >>= 1) {
649     if (tmp_mask & 0x1)
650       op1.vmm32u(n) = float32_fixupimm(dst.vmm32u(n), op1.vmm32u(n), op2.vmm32u(n), i->Ib(), status);
651     else
652       op1.vmm32u(n) = 0;
653   }
654 
655   check_exceptionsSSE(get_exception_flags(status));
656 
657   if (! i->isZeroMasking()) {
658     for (unsigned n=0; n < len; n++, mask >>= 4)
659       xmm_blendps(&BX_READ_AVX_REG_LANE(i->dst(), n), &op1.vmm128(n), mask);
660 
661     BX_CLEAR_AVX_REGZ(i->dst(), len);
662   }
663   else {
664     BX_WRITE_AVX_REGZ(i->dst(), op1, len);
665   }
666 
667   BX_NEXT_INSTR(i);
668 }
669 
VFIXUPIMMPD_VpdHpdWpdIbR(bxInstruction_c * i)670 void BX_CPP_AttrRegparmN(1) BX_CPU_C::VFIXUPIMMPD_VpdHpdWpdIbR(bxInstruction_c *i)
671 {
672   BxPackedAvxRegister op1 = BX_READ_AVX_REG(i->src1()), op2 = BX_READ_AVX_REG(i->src2()), dst = BX_READ_AVX_REG(i->dst());
673   unsigned len = i->getVL();
674 
675   float_status_t status = mxcsr_to_softfloat_status_word(MXCSR);
676   softfloat_status_word_rc_override(status, i);
677 
678   for (unsigned n=0; n < QWORD_ELEMENTS(len); n++) {
679     op1.vmm64u(n) = float64_fixupimm(dst.vmm64u(n), op1.vmm64u(n), (Bit32u) op2.vmm64u(n), i->Ib(), status);
680   }
681 
682   check_exceptionsSSE(get_exception_flags(status));
683 
684   BX_WRITE_AVX_REGZ(i->dst(), op1, len);
685   BX_NEXT_INSTR(i);
686 }
687 
VFIXUPIMMPD_MASK_VpdHpdWpdIbR(bxInstruction_c * i)688 void BX_CPP_AttrRegparmN(1) BX_CPU_C::VFIXUPIMMPD_MASK_VpdHpdWpdIbR(bxInstruction_c *i)
689 {
690   BxPackedAvxRegister op1 = BX_READ_AVX_REG(i->src1()), op2 = BX_READ_AVX_REG(i->src2()), dst = BX_READ_AVX_REG(i->dst());
691   Bit32u mask = BX_READ_8BIT_OPMASK(i->opmask());
692   unsigned len = i->getVL();
693 
694   float_status_t status = mxcsr_to_softfloat_status_word(MXCSR);
695   softfloat_status_word_rc_override(status, i);
696 
697   for (unsigned n=0, tmp_mask = mask; n < QWORD_ELEMENTS(len); n++, tmp_mask >>= 1) {
698     if (tmp_mask & 0x1)
699       op1.vmm64u(n) = float64_fixupimm(dst.vmm64u(n), op1.vmm64u(n), (Bit32u) op2.vmm64u(n), i->Ib(), status);
700     else
701       op1.vmm64u(n) = 0;
702   }
703 
704   check_exceptionsSSE(get_exception_flags(status));
705 
706   if (! i->isZeroMasking()) {
707     for (unsigned n=0; n < len; n++, mask >>= 2)
708       xmm_blendpd(&BX_READ_AVX_REG_LANE(i->dst(), n), &op1.vmm128(n), mask);
709 
710     BX_CLEAR_AVX_REGZ(i->dst(), len);
711   }
712   else {
713     BX_WRITE_AVX_REGZ(i->dst(), op1, len);
714   }
715 
716   BX_NEXT_INSTR(i);
717 }
718 
719 // fpclass
720 
// Test a classified floating point value against the VFPCLASS selector imm8.
// Selector bits: 0 - QNaN, 1 - +0, 2 - -0, 3 - +INF, 4 - -INF,
//                5 - denormal, 6 - negative finite, 7 - SNaN.
// Returns non-zero if the value belongs to any selected category.
static int fpclass(float_class_t op_class, int sign, int selector)
{
  return ((op_class == float_QNaN) && (selector & 0x01) != 0) || // QNaN
         ((op_class == float_zero) && ! sign && (selector & 0x02) != 0) || // positive zero
         ((op_class == float_zero) && sign && (selector & 0x04) != 0) || // negative zero
         ((op_class == float_positive_inf) && (selector & 0x08) != 0) || // positive inf
         ((op_class == float_negative_inf) && (selector & 0x10) != 0) || // negative inf
         ((op_class == float_denormal) && (selector & 0x20) != 0) || // denormal
         ((op_class == float_denormal || op_class == float_normalized) && sign && (selector & 0x40) != 0) || // negative finite
         ((op_class == float_SNaN) && (selector & 0x80) != 0); // SNaN
}
732 
float32_fpclass(float32 op,int selector,int daz)733 static BX_CPP_INLINE int float32_fpclass(float32 op, int selector, int daz)
734 {
735   if (daz)
736     op = float32_denormal_to_zero(op);
737 
738   return fpclass(float32_class(op), float32_sign(op), selector);
739 }
740 
float64_fpclass(float64 op,int selector,int daz)741 static BX_CPP_INLINE int float64_fpclass(float64 op, int selector, int daz)
742 {
743   if (daz)
744     op = float64_denormal_to_zero(op);
745 
746   return fpclass(float64_class(op), float64_sign(op), selector);
747 }
748 
VFPCLASSPS_MASK_KGwWpsIbR(bxInstruction_c * i)749 void BX_CPP_AttrRegparmN(1) BX_CPU_C::VFPCLASSPS_MASK_KGwWpsIbR(bxInstruction_c *i)
750 {
751   BxPackedAvxRegister op = BX_READ_AVX_REG(i->src());
752   unsigned num_elements = DWORD_ELEMENTS(i->getVL());
753 
754   Bit32u opmask = i->opmask() ? BX_READ_16BIT_OPMASK(i->opmask()) : (Bit32u) -1;
755   Bit32u result = 0;
756   int selector = i->Ib(), daz = MXCSR.get_DAZ();
757 
758   for (unsigned n=0, mask = 0x1; n < num_elements; n++, mask <<= 1) {
759     if (opmask & mask) {
760       if (float32_fpclass(op.vmm32u(n), selector, daz)) result |= mask;
761     }
762   }
763 
764   BX_WRITE_OPMASK(i->dst(), result);
765   BX_NEXT_INSTR(i);
766 }
767 
VFPCLASSPD_MASK_KGbWpdIbR(bxInstruction_c * i)768 void BX_CPP_AttrRegparmN(1) BX_CPU_C::VFPCLASSPD_MASK_KGbWpdIbR(bxInstruction_c *i)
769 {
770   BxPackedAvxRegister op = BX_READ_AVX_REG(i->src());
771   unsigned num_elements = QWORD_ELEMENTS(i->getVL());
772 
773   Bit32u opmask = i->opmask() ? BX_READ_8BIT_OPMASK(i->opmask()) : (Bit32u) -1;
774   Bit32u result = 0;
775   int selector = i->Ib(), daz = MXCSR.get_DAZ();
776 
777   for (unsigned n=0, mask = 0x1; n < num_elements; n++, mask <<= 1) {
778     if (opmask & mask) {
779       if (float64_fpclass(op.vmm64u(n), selector, daz)) result |= mask;
780     }
781   }
782 
783   BX_WRITE_OPMASK(i->dst(), result);
784   BX_NEXT_INSTR(i);
785 }
786 
VFPCLASSSS_MASK_KGbWssIbR(bxInstruction_c * i)787 void BX_CPP_AttrRegparmN(1) BX_CPU_C::VFPCLASSSS_MASK_KGbWssIbR(bxInstruction_c *i)
788 {
789   if (! i->opmask() || BX_SCALAR_ELEMENT_MASK(i->opmask())) {
790     BX_WRITE_OPMASK(i->dst(), float32_fpclass(BX_READ_XMM_REG_LO_DWORD(i->src()), i->Ib(), MXCSR.get_DAZ()));
791   }
792   else {
793     BX_WRITE_OPMASK(i->dst(), 0);
794   }
795 
796   BX_NEXT_INSTR(i);
797 }
798 
VFPCLASSSD_MASK_KGbWsdIbR(bxInstruction_c * i)799 void BX_CPP_AttrRegparmN(1) BX_CPU_C::VFPCLASSSD_MASK_KGbWsdIbR(bxInstruction_c *i)
800 {
801   if (! i->opmask() || BX_SCALAR_ELEMENT_MASK(i->opmask())) {
802     BX_WRITE_OPMASK(i->dst(), float64_fpclass(BX_READ_XMM_REG_LO_QWORD(i->src()), i->Ib(), MXCSR.get_DAZ()));
803   }
804   else {
805     BX_WRITE_OPMASK(i->dst(), 0);
806   }
807 
808   BX_NEXT_INSTR(i);
809 }
810 
811 // getexp
812 
VGETEXPPS_MASK_VpsWpsR(bxInstruction_c * i)813 void BX_CPP_AttrRegparmN(1) BX_CPU_C::VGETEXPPS_MASK_VpsWpsR(bxInstruction_c *i)
814 {
815   BxPackedAvxRegister op = BX_READ_AVX_REG(i->src());
816   Bit32u mask = i->opmask() ? BX_READ_16BIT_OPMASK(i->opmask()) : (Bit32u) -1;
817   unsigned len = i->getVL();
818 
819   float_status_t status = mxcsr_to_softfloat_status_word(MXCSR);
820   softfloat_status_word_rc_override(status, i);
821 
822   for (unsigned n=0, tmp_mask = mask; n < len; n++, tmp_mask >>= 4)
823     xmm_getexpps_mask(&op.vmm128(n), status, tmp_mask);
824 
825   check_exceptionsSSE(get_exception_flags(status));
826 
827   if (! i->isZeroMasking()) {
828     for (unsigned n=0; n < len; n++, mask >>= 4)
829       xmm_blendps(&BX_READ_AVX_REG_LANE(i->dst(), n), &op.vmm128(n), mask);
830     BX_CLEAR_AVX_REGZ(i->dst(), len);
831   }
832   else {
833     BX_WRITE_AVX_REGZ(i->dst(), op, len);
834   }
835 
836   BX_NEXT_INSTR(i);
837 }
838 
VGETEXPPD_MASK_VpdWpdR(bxInstruction_c * i)839 void BX_CPP_AttrRegparmN(1) BX_CPU_C::VGETEXPPD_MASK_VpdWpdR(bxInstruction_c *i)
840 {
841   BxPackedAvxRegister op = BX_READ_AVX_REG(i->src());
842   Bit32u mask = i->opmask() ? BX_READ_8BIT_OPMASK(i->opmask()) : (Bit32u) -1;
843   unsigned len = i->getVL();
844 
845   float_status_t status = mxcsr_to_softfloat_status_word(MXCSR);
846   softfloat_status_word_rc_override(status, i);
847 
848   for (unsigned n=0, tmp_mask = mask; n < len; n++, tmp_mask >>= 2)
849     xmm_getexppd_mask(&op.vmm128(n), status, tmp_mask);
850 
851   check_exceptionsSSE(get_exception_flags(status));
852 
853   if (! i->isZeroMasking()) {
854     for (unsigned n=0; n < len; n++, mask >>= 2)
855       xmm_blendpd(&BX_READ_AVX_REG_LANE(i->dst(), n), &op.vmm128(n), mask);
856     BX_CLEAR_AVX_REGZ(i->dst(), len);
857   }
858   else {
859     BX_WRITE_AVX_REGZ(i->dst(), op, len);
860   }
861 
862   BX_NEXT_INSTR(i);
863 }
864 
VGETEXPSS_MASK_VssHpsWssR(bxInstruction_c * i)865 void BX_CPP_AttrRegparmN(1) BX_CPU_C::VGETEXPSS_MASK_VssHpsWssR(bxInstruction_c *i)
866 {
867   BxPackedXmmRegister op1 = BX_READ_XMM_REG(i->src1());
868 
869   if (! i->opmask() || BX_SCALAR_ELEMENT_MASK(i->opmask())) {
870     float32 op2 = BX_READ_XMM_REG_LO_DWORD(i->src2());
871 
872     float_status_t status = mxcsr_to_softfloat_status_word(MXCSR);
873     softfloat_status_word_rc_override(status, i);
874     op1.xmm32u(0) = float32_getexp(op2, status);
875     check_exceptionsSSE(get_exception_flags(status));
876   }
877   else {
878     if (i->isZeroMasking())
879       op1.xmm32u(0) = 0;
880     else
881       op1.xmm32u(0) = BX_READ_XMM_REG_LO_DWORD(i->dst());
882   }
883 
884   BX_WRITE_XMM_REG_CLEAR_HIGH(i->dst(), op1);
885   BX_NEXT_INSTR(i);
886 }
887 
VGETEXPSD_MASK_VsdHpdWsdR(bxInstruction_c * i)888 void BX_CPP_AttrRegparmN(1) BX_CPU_C::VGETEXPSD_MASK_VsdHpdWsdR(bxInstruction_c *i)
889 {
890   BxPackedXmmRegister op1 = BX_READ_XMM_REG(i->src1());
891 
892   if (! i->opmask() || BX_SCALAR_ELEMENT_MASK(i->opmask())) {
893     float64 op2 = BX_READ_XMM_REG_LO_QWORD(i->src2());
894 
895     float_status_t status = mxcsr_to_softfloat_status_word(MXCSR);
896     softfloat_status_word_rc_override(status, i);
897     op1.xmm64u(0) = float64_getexp(op2, status);
898     check_exceptionsSSE(get_exception_flags(status));
899   }
900   else {
901     if (i->isZeroMasking())
902       op1.xmm64u(0) = 0;
903     else
904       op1.xmm64u(0) = BX_READ_XMM_REG_LO_QWORD(i->dst());
905   }
906 
907   BX_WRITE_XMM_REG_CLEAR_HIGH(i->dst(), op1);
908   BX_NEXT_INSTR(i);
909 }
910 
911 // getmant
912 
VGETMANTSS_MASK_VssHpsWssIbR(bxInstruction_c * i)913 void BX_CPP_AttrRegparmN(1) BX_CPU_C::VGETMANTSS_MASK_VssHpsWssIbR(bxInstruction_c *i)
914 {
915   BxPackedXmmRegister op1 = BX_READ_XMM_REG(i->src1());
916 
917   if (! i->opmask() || BX_SCALAR_ELEMENT_MASK(i->opmask())) {
918     float32 op2 = BX_READ_XMM_REG_LO_DWORD(i->src2());
919 
920     int sign_ctrl = (i->Ib() >> 2) & 0x3;
921     int interv = i->Ib() & 0x3;
922 
923     float_status_t status = mxcsr_to_softfloat_status_word(MXCSR);
924     softfloat_status_word_rc_override(status, i);
925     op1.xmm32u(0) = float32_getmant(op2, status, sign_ctrl, interv);
926     check_exceptionsSSE(get_exception_flags(status));
927   }
928   else {
929     if (i->isZeroMasking())
930       op1.xmm32u(0) = 0;
931     else
932       op1.xmm32u(0) = BX_READ_XMM_REG_LO_DWORD(i->dst());
933   }
934 
935   BX_WRITE_XMM_REG_CLEAR_HIGH(i->dst(), op1);
936   BX_NEXT_INSTR(i);
937 }
938 
VGETMANTSD_MASK_VsdHpdWsdIbR(bxInstruction_c * i)939 void BX_CPP_AttrRegparmN(1) BX_CPU_C::VGETMANTSD_MASK_VsdHpdWsdIbR(bxInstruction_c *i)
940 {
941   BxPackedXmmRegister op1 = BX_READ_XMM_REG(i->src1());
942 
943   if (! i->opmask() || BX_SCALAR_ELEMENT_MASK(i->opmask())) {
944     float64 op2 = BX_READ_XMM_REG_LO_QWORD(i->src2());
945 
946     int sign_ctrl = (i->Ib() >> 2) & 0x3;
947     int interv = i->Ib() & 0x3;
948 
949     float_status_t status = mxcsr_to_softfloat_status_word(MXCSR);
950     softfloat_status_word_rc_override(status, i);
951     op1.xmm64u(0) = float64_getmant(op2, status, sign_ctrl, interv);
952     check_exceptionsSSE(get_exception_flags(status));
953   }
954   else {
955     if (i->isZeroMasking())
956       op1.xmm64u(0) = 0;
957     else
958       op1.xmm64u(0) = BX_READ_XMM_REG_LO_QWORD(i->dst());
959   }
960 
961   BX_WRITE_XMM_REG_CLEAR_HIGH(i->dst(), op1);
962   BX_NEXT_INSTR(i);
963 }
964 
VGETMANTPS_MASK_VpsWpsIbR(bxInstruction_c * i)965 void BX_CPP_AttrRegparmN(1) BX_CPU_C::VGETMANTPS_MASK_VpsWpsIbR(bxInstruction_c *i)
966 {
967   BxPackedAvxRegister op = BX_READ_AVX_REG(i->src());
968   Bit32u opmask = i->opmask() ? BX_READ_16BIT_OPMASK(i->opmask()) : (Bit32u) -1;
969   unsigned len = i->getVL();
970 
971   float_status_t status = mxcsr_to_softfloat_status_word(MXCSR);
972   softfloat_status_word_rc_override(status, i);
973 
974   int sign_ctrl = (i->Ib() >> 2) & 0x3;
975   int interv = i->Ib() & 0x3;
976 
977   for (unsigned n=0, mask = 0x1; n < DWORD_ELEMENTS(len); n++, mask <<= 1) {
978     if (opmask & mask)
979       op.vmm32u(n) = float32_getmant(op.vmm32u(n), status, sign_ctrl, interv);
980     else
981       op.vmm32u(n) = 0;
982   }
983 
984   check_exceptionsSSE(get_exception_flags(status));
985 
986   if (! i->isZeroMasking()) {
987     for (unsigned n=0; n < len; n++, opmask >>= 4)
988       xmm_blendps(&BX_READ_AVX_REG_LANE(i->dst(), n), &op.vmm128(n), opmask);
989     BX_CLEAR_AVX_REGZ(i->dst(), len);
990   }
991   else {
992     BX_WRITE_AVX_REGZ(i->dst(), op, len);
993   }
994 
995   BX_NEXT_INSTR(i);
996 }
997 
VGETMANTPD_MASK_VpdWpdIbR(bxInstruction_c * i)998 void BX_CPP_AttrRegparmN(1) BX_CPU_C::VGETMANTPD_MASK_VpdWpdIbR(bxInstruction_c *i)
999 {
1000   BxPackedAvxRegister op = BX_READ_AVX_REG(i->src());
1001   Bit32u opmask = i->opmask() ? BX_READ_8BIT_OPMASK(i->opmask()) : (Bit32u) -1;
1002   unsigned len = i->getVL();
1003 
1004   float_status_t status = mxcsr_to_softfloat_status_word(MXCSR);
1005   softfloat_status_word_rc_override(status, i);
1006 
1007   int sign_ctrl = (i->Ib() >> 2) & 0x3;
1008   int interv = i->Ib() & 0x3;
1009 
1010   for (unsigned n=0, mask = 0x1; n < QWORD_ELEMENTS(len); n++, mask <<= 1) {
1011     if (opmask & mask)
1012       op.vmm64u(n) = float64_getmant(op.vmm64u(n), status, sign_ctrl, interv);
1013     else
1014       op.vmm64u(n) = 0;
1015   }
1016 
1017   check_exceptionsSSE(get_exception_flags(status));
1018 
1019   if (! i->isZeroMasking()) {
1020     for (unsigned n=0; n < len; n++, opmask >>= 2)
1021       xmm_blendpd(&BX_READ_AVX_REG_LANE(i->dst(), n), &op.vmm128(n), opmask);
1022     BX_CLEAR_AVX_REGZ(i->dst(), len);
1023   }
1024   else {
1025     BX_WRITE_AVX_REGZ(i->dst(), op, len);
1026   }
1027 
1028   BX_NEXT_INSTR(i);
1029 }
1030 
1031 // rndscale
1032 
VRNDSCALEPS_MASK_VpsWpsIbR(bxInstruction_c * i)1033 void BX_CPP_AttrRegparmN(1) BX_CPU_C::VRNDSCALEPS_MASK_VpsWpsIbR(bxInstruction_c *i)
1034 {
1035   BxPackedAvxRegister op = BX_READ_AVX_REG(i->src());
1036   Bit32u opmask = i->opmask() ? BX_READ_16BIT_OPMASK(i->opmask()) : (Bit32u) -1;
1037   unsigned len = i->getVL();
1038 
1039   float_status_t status = mxcsr_to_softfloat_status_word(MXCSR);
1040   softfloat_status_word_rc_override(status, i);
1041 
1042   Bit8u control = i->Ib(), scale = control >> 4;
1043 
1044   // override MXCSR rounding mode with control coming from imm8
1045   if ((control & 0x4) == 0)
1046     status.float_rounding_mode = control & 0x3;
1047   // ignore precision exception result
1048   if (control & 0x8)
1049     status.float_suppress_exception |= float_flag_inexact;
1050 
1051   for (unsigned n=0, mask = 0x1; n < DWORD_ELEMENTS(len); n++, mask <<= 1) {
1052     if (opmask & mask)
1053       op.vmm32u(n) = float32_round_to_int(op.vmm32u(n), scale, status);
1054     else
1055       op.vmm32u(n) = 0;
1056   }
1057 
1058   check_exceptionsSSE(get_exception_flags(status));
1059 
1060   if (! i->isZeroMasking()) {
1061     for (unsigned n=0; n < len; n++, opmask >>= 4)
1062       xmm_blendps(&BX_READ_AVX_REG_LANE(i->dst(), n), &op.vmm128(n), opmask);
1063     BX_CLEAR_AVX_REGZ(i->dst(), len);
1064   }
1065   else {
1066     BX_WRITE_AVX_REGZ(i->dst(), op, len);
1067   }
1068 
1069   BX_NEXT_INSTR(i);
1070 }
1071 
VRNDSCALESS_MASK_VssHpsWssIbR(bxInstruction_c * i)1072 void BX_CPP_AttrRegparmN(1) BX_CPU_C::VRNDSCALESS_MASK_VssHpsWssIbR(bxInstruction_c *i)
1073 {
1074   BxPackedXmmRegister op1 = BX_READ_XMM_REG(i->src1());
1075 
1076   if (! i->opmask() || BX_SCALAR_ELEMENT_MASK(i->opmask())) {
1077     float32 op2 = BX_READ_XMM_REG_LO_DWORD(i->src2());
1078 
1079     Bit8u control = i->Ib(), scale = control >> 4;
1080 
1081     float_status_t status = mxcsr_to_softfloat_status_word(MXCSR);
1082     softfloat_status_word_rc_override(status, i);
1083 
1084     // override MXCSR rounding mode with control coming from imm8
1085     if ((control & 0x4) == 0)
1086       status.float_rounding_mode = control & 0x3;
1087     // ignore precision exception result
1088     if (control & 0x8)
1089       status.float_suppress_exception |= float_flag_inexact;
1090 
1091     op1.xmm32u(0) = float32_round_to_int(op2, scale, status);
1092 
1093     check_exceptionsSSE(get_exception_flags(status));
1094   }
1095   else {
1096     if (i->isZeroMasking())
1097       op1.xmm32u(0) = 0;
1098     else
1099       op1.xmm32u(0) = BX_READ_XMM_REG_LO_DWORD(i->dst());
1100   }
1101 
1102   BX_WRITE_XMM_REG_CLEAR_HIGH(i->dst(), op1);
1103   BX_NEXT_INSTR(i);
1104 }
1105 
VRNDSCALEPD_MASK_VpdWpdIbR(bxInstruction_c * i)1106 void BX_CPP_AttrRegparmN(1) BX_CPU_C::VRNDSCALEPD_MASK_VpdWpdIbR(bxInstruction_c *i)
1107 {
1108   BxPackedAvxRegister op = BX_READ_AVX_REG(i->src());
1109   Bit32u opmask = i->opmask() ? BX_READ_8BIT_OPMASK(i->opmask()) : (Bit32u) -1;
1110   unsigned len = i->getVL();
1111 
1112   float_status_t status = mxcsr_to_softfloat_status_word(MXCSR);
1113   softfloat_status_word_rc_override(status, i);
1114 
1115   Bit8u control = i->Ib(), scale = control >> 4;
1116 
1117   // override MXCSR rounding mode with control coming from imm8
1118   if ((control & 0x4) == 0)
1119     status.float_rounding_mode = control & 0x3;
1120   // ignore precision exception result
1121   if (control & 0x8)
1122     status.float_suppress_exception |= float_flag_inexact;
1123 
1124   for (unsigned n=0, mask = 0x1; n < QWORD_ELEMENTS(len); n++, mask <<= 1) {
1125     if (opmask & mask)
1126       op.vmm64u(n) = float64_round_to_int(op.vmm64u(n), scale, status);
1127     else
1128       op.vmm64u(n) = 0;
1129   }
1130 
1131   check_exceptionsSSE(get_exception_flags(status));
1132 
1133   if (! i->isZeroMasking()) {
1134     for (unsigned n=0; n < len; n++, opmask >>= 2)
1135       xmm_blendpd(&BX_READ_AVX_REG_LANE(i->dst(), n), &op.vmm128(n), opmask);
1136     BX_CLEAR_AVX_REGZ(i->dst(), len);
1137   }
1138   else {
1139     BX_WRITE_AVX_REGZ(i->dst(), op, len);
1140   }
1141 
1142   BX_NEXT_INSTR(i);
1143 }
1144 
VRNDSCALESD_MASK_VsdHpdWsdIbR(bxInstruction_c * i)1145 void BX_CPP_AttrRegparmN(1) BX_CPU_C::VRNDSCALESD_MASK_VsdHpdWsdIbR(bxInstruction_c *i)
1146 {
1147   BxPackedXmmRegister op1 = BX_READ_XMM_REG(i->src1());
1148 
1149   if (! i->opmask() || BX_SCALAR_ELEMENT_MASK(i->opmask())) {
1150     float64 op2 = BX_READ_XMM_REG_LO_QWORD(i->src2());
1151 
1152     Bit8u control = i->Ib(), scale = control >> 4;
1153 
1154     float_status_t status = mxcsr_to_softfloat_status_word(MXCSR);
1155     softfloat_status_word_rc_override(status, i);
1156 
1157     // override MXCSR rounding mode with control coming from imm8
1158     if ((control & 0x4) == 0)
1159       status.float_rounding_mode = control & 0x3;
1160     // ignore precision exception result
1161     if (control & 0x8)
1162       status.float_suppress_exception |= float_flag_inexact;
1163 
1164     op1.xmm64u(0) = float64_round_to_int(op2, scale, status);
1165 
1166     check_exceptionsSSE(get_exception_flags(status));
1167   }
1168   else {
1169     if (i->isZeroMasking())
1170       op1.xmm64u(0) = 0;
1171     else
1172       op1.xmm64u(0) = BX_READ_XMM_REG_LO_QWORD(i->dst());
1173   }
1174 
1175   BX_WRITE_XMM_REG_CLEAR_HIGH(i->dst(), op1);
1176   BX_NEXT_INSTR(i);
1177 }
1178 
1179 // scalef
1180 
VSCALEFPS_VpsHpsWpsR(bxInstruction_c * i)1181 void BX_CPP_AttrRegparmN(1) BX_CPU_C::VSCALEFPS_VpsHpsWpsR(bxInstruction_c *i)
1182 {
1183   BxPackedAvxRegister op1 = BX_READ_AVX_REG(i->src1()), op2 = BX_READ_AVX_REG(i->src2());
1184   unsigned len = i->getVL();
1185 
1186   float_status_t status = mxcsr_to_softfloat_status_word(MXCSR);
1187   softfloat_status_word_rc_override(status, i);
1188 
1189   for (unsigned n=0; n < len; n++) {
1190     xmm_scalefps(&op1.vmm128(n), &op2.vmm128(n), status);
1191   }
1192 
1193   check_exceptionsSSE(get_exception_flags(status));
1194 
1195   BX_WRITE_AVX_REGZ(i->dst(), op1, len);
1196 
1197   BX_NEXT_INSTR(i);
1198 }
1199 
VSCALEFPD_VpdHpdWpdR(bxInstruction_c * i)1200 void BX_CPP_AttrRegparmN(1) BX_CPU_C::VSCALEFPD_VpdHpdWpdR(bxInstruction_c *i)
1201 {
1202   BxPackedAvxRegister op1 = BX_READ_AVX_REG(i->src1()), op2 = BX_READ_AVX_REG(i->src2());
1203   unsigned len = i->getVL();
1204 
1205   float_status_t status = mxcsr_to_softfloat_status_word(MXCSR);
1206   softfloat_status_word_rc_override(status, i);
1207 
1208   for (unsigned n=0; n < len; n++) {
1209     xmm_scalefpd(&op1.vmm128(n), &op2.vmm128(n), status);
1210   }
1211 
1212   check_exceptionsSSE(get_exception_flags(status));
1213 
1214   BX_WRITE_AVX_REGZ(i->dst(), op1, len);
1215 
1216   BX_NEXT_INSTR(i);
1217 }
1218 
VSCALEFSS_VssHpsWssR(bxInstruction_c * i)1219 void BX_CPP_AttrRegparmN(1) BX_CPU_C::VSCALEFSS_VssHpsWssR(bxInstruction_c *i)
1220 {
1221   BxPackedXmmRegister op1 = BX_READ_XMM_REG(i->src1());
1222   float32 op2 = BX_READ_XMM_REG_LO_DWORD(i->src2());
1223 
1224   float_status_t status = mxcsr_to_softfloat_status_word(MXCSR);
1225   softfloat_status_word_rc_override(status, i);
1226 
1227   op1.xmm32u(0) = float32_scalef(op1.xmm32u(0), op2, status);
1228 
1229   check_exceptionsSSE(get_exception_flags(status));
1230   BX_WRITE_XMM_REG_CLEAR_HIGH(i->dst(), op1);
1231 
1232   BX_NEXT_INSTR(i);
1233 }
1234 
VSCALEFSD_VsdHpdWsdR(bxInstruction_c * i)1235 void BX_CPP_AttrRegparmN(1) BX_CPU_C::VSCALEFSD_VsdHpdWsdR(bxInstruction_c *i)
1236 {
1237   BxPackedXmmRegister op1 = BX_READ_XMM_REG(i->src1());
1238   float64 op2 = BX_READ_XMM_REG_LO_QWORD(i->src2());
1239 
1240   float_status_t status = mxcsr_to_softfloat_status_word(MXCSR);
1241   softfloat_status_word_rc_override(status, i);
1242 
1243   op1.xmm64u(0) = float64_scalef(op1.xmm64u(0), op2, status);
1244 
1245   check_exceptionsSSE(get_exception_flags(status));
1246   BX_WRITE_XMM_REG_CLEAR_HIGH(i->dst(), op1);
1247 
1248   BX_NEXT_INSTR(i);
1249 }
1250 
1251 // range
1252 
// VRANGEPS helper: min/max of (a, b) selected by opselect[1:0], then the sign
// of the result is adjusted per sign_ctrl (unless either input is an SNaN).
static BX_CPP_INLINE float32 float32_range(float32 a, float32 b, int opselect, int sign_ctrl, float_status_t &status)
{
  float32 minmax = float32_minmax(a, b, opselect & 0x1, (opselect >> 1) & 0x1, status);

  if (! float32_is_signaling_nan(a) && ! float32_is_signaling_nan(b)) {
    if (sign_ctrl == 0) {
      minmax = (minmax & ~0x80000000) | (a & 0x80000000); // keep sign of a
    }
    else if (sign_ctrl == 2) {
      minmax &= ~0x80000000; // zero out the sign bit
    }
    else if (sign_ctrl == 3) {
      minmax |=  0x80000000; // set the sign bit
    }
    // else (sign_ctrl == 1) preserve the sign of compare result
  }

  return minmax;
}
1272 
// VRANGEPD helper: min/max of (a, b) selected by opselect[1:0], then the sign
// of the result is adjusted per sign_ctrl (unless either input is an SNaN).
static BX_CPP_INLINE float64 float64_range(float64 a, float64 b, int opselect, int sign_ctrl, float_status_t &status)
{
  float64 minmax = float64_minmax(a, b, opselect & 0x1, (opselect >> 1) & 0x1, status);

  if (! float64_is_signaling_nan(a) && ! float64_is_signaling_nan(b)) {
    if (sign_ctrl == 0) {
      minmax = (minmax & ~BX_CONST64(0x8000000000000000)) | (a & BX_CONST64(0x8000000000000000)); // keep sign of a
    }
    else if (sign_ctrl == 2) {
      minmax &= ~BX_CONST64(0x8000000000000000); // zero out the sign bit
    }
    else if (sign_ctrl == 3) {
      minmax |=  BX_CONST64(0x8000000000000000); // set the sign bit
    }
    // else (sign_ctrl == 1) preserve the sign of compare result
  }

  return minmax;
}
1292 
VRANGEPS_MASK_VpsHpsWpsIbR(bxInstruction_c * i)1293 void BX_CPP_AttrRegparmN(1) BX_CPU_C::VRANGEPS_MASK_VpsHpsWpsIbR(bxInstruction_c *i)
1294 {
1295   BxPackedAvxRegister op1 = BX_READ_AVX_REG(i->src1()), op2 = BX_READ_AVX_REG(i->src2());
1296   Bit32u opmask = i->opmask() ? BX_READ_16BIT_OPMASK(i->opmask()) : (Bit32u) -1;
1297   unsigned len = i->getVL();
1298 
1299   float_status_t status = mxcsr_to_softfloat_status_word(MXCSR);
1300   softfloat_status_word_rc_override(status, i);
1301 
1302   int sign_ctrl = (i->Ib() >> 2) & 0x3;
1303   int opselect = i->Ib() & 0x3;
1304 
1305   for (unsigned n=0, mask = 0x1; n < DWORD_ELEMENTS(len); n++, mask <<= 1) {
1306     if (opmask & mask)
1307       op1.vmm32u(n) = float32_range(op1.vmm32u(n), op2.vmm32u(n), opselect, sign_ctrl, status);
1308     else
1309       op1.vmm32u(n) = 0;
1310   }
1311 
1312   check_exceptionsSSE(get_exception_flags(status));
1313 
1314   if (! i->isZeroMasking()) {
1315     for (unsigned n=0; n < len; n++, opmask >>= 4)
1316       xmm_blendps(&BX_READ_AVX_REG_LANE(i->dst(), n), &op1.vmm128(n), opmask);
1317     BX_CLEAR_AVX_REGZ(i->dst(), len);
1318   }
1319   else {
1320     BX_WRITE_AVX_REGZ(i->dst(), op1, len);
1321   }
1322 
1323   BX_NEXT_INSTR(i);
1324 }
1325 
VRANGEPD_MASK_VpdHpdWpdIbR(bxInstruction_c * i)1326 void BX_CPP_AttrRegparmN(1) BX_CPU_C::VRANGEPD_MASK_VpdHpdWpdIbR(bxInstruction_c *i)
1327 {
1328   BxPackedAvxRegister op1 = BX_READ_AVX_REG(i->src1()), op2 = BX_READ_AVX_REG(i->src2());
1329   Bit32u opmask = i->opmask() ? BX_READ_8BIT_OPMASK(i->opmask()) : (Bit32u) -1;
1330   unsigned len = i->getVL();
1331 
1332   float_status_t status = mxcsr_to_softfloat_status_word(MXCSR);
1333   softfloat_status_word_rc_override(status, i);
1334 
1335   int sign_ctrl = (i->Ib() >> 2) & 0x3;
1336   int opselect = i->Ib() & 0x3;
1337 
1338   for (unsigned n=0, mask = 0x1; n < QWORD_ELEMENTS(len); n++, mask <<= 1) {
1339     if (opmask & mask)
1340       op1.vmm64u(n) = float64_range(op1.vmm64u(n), op2.vmm64u(n), opselect, sign_ctrl, status);
1341     else
1342       op1.vmm64u(n) = 0;
1343   }
1344 
1345   check_exceptionsSSE(get_exception_flags(status));
1346 
1347   if (! i->isZeroMasking()) {
1348     for (unsigned n=0; n < len; n++, opmask >>= 2)
1349       xmm_blendpd(&BX_READ_AVX_REG_LANE(i->dst(), n), &op1.vmm128(n), opmask);
1350     BX_CLEAR_AVX_REGZ(i->dst(), len);
1351   }
1352   else {
1353     BX_WRITE_AVX_REGZ(i->dst(), op1, len);
1354   }
1355 
1356   BX_NEXT_INSTR(i);
1357 }
1358 
VRANGESS_MASK_VssHpsWssIbR(bxInstruction_c * i)1359 void BX_CPP_AttrRegparmN(1) BX_CPU_C::VRANGESS_MASK_VssHpsWssIbR(bxInstruction_c *i)
1360 {
1361   BxPackedXmmRegister op1 = BX_READ_XMM_REG(i->src1());
1362 
1363   if (! i->opmask() || BX_SCALAR_ELEMENT_MASK(i->opmask())) {
1364     float32 op2 = BX_READ_XMM_REG_LO_DWORD(i->src2());
1365 
1366     int sign_ctrl = (i->Ib() >> 2) & 0x3;
1367     int opselect = i->Ib() & 0x3;
1368 
1369     float_status_t status = mxcsr_to_softfloat_status_word(MXCSR);
1370     softfloat_status_word_rc_override(status, i);
1371     op1.xmm32u(0) = float32_range(op1.xmm32u(0), op2, opselect, sign_ctrl, status);
1372     check_exceptionsSSE(get_exception_flags(status));
1373   }
1374   else {
1375     if (i->isZeroMasking())
1376       op1.xmm32u(0) = 0;
1377     else
1378       op1.xmm32u(0) = BX_READ_XMM_REG_LO_DWORD(i->dst());
1379   }
1380 
1381   BX_WRITE_XMM_REG_CLEAR_HIGH(i->dst(), op1);
1382   BX_NEXT_INSTR(i);
1383 }
1384 
VRANGESD_MASK_VsdHpdWsdIbR(bxInstruction_c * i)1385 void BX_CPP_AttrRegparmN(1) BX_CPU_C::VRANGESD_MASK_VsdHpdWsdIbR(bxInstruction_c *i)
1386 {
1387   BxPackedXmmRegister op1 = BX_READ_XMM_REG(i->src1());
1388 
1389   if (! i->opmask() || BX_SCALAR_ELEMENT_MASK(i->opmask())) {
1390     float64 op2 = BX_READ_XMM_REG_LO_QWORD(i->src2());
1391 
1392     int sign_ctrl = (i->Ib() >> 2) & 0x3;
1393     int opselect = i->Ib() & 0x3;
1394 
1395     float_status_t status = mxcsr_to_softfloat_status_word(MXCSR);
1396     softfloat_status_word_rc_override(status, i);
1397     op1.xmm64u(0) = float64_range(op1.xmm64u(0), op2, opselect, sign_ctrl, status);
1398     check_exceptionsSSE(get_exception_flags(status));
1399   }
1400   else {
1401     if (i->isZeroMasking())
1402       op1.xmm64u(0) = 0;
1403     else
1404       op1.xmm64u(0) = BX_READ_XMM_REG_LO_QWORD(i->dst());
1405   }
1406 
1407   BX_WRITE_XMM_REG_CLEAR_HIGH(i->dst(), op1);
1408   BX_NEXT_INSTR(i);
1409 }
1410 
1411 // reduce
1412 
float32_reduce(float32 a,Bit8u scale,float_status_t & status)1413 static BX_CPP_INLINE float32 float32_reduce(float32 a, Bit8u scale, float_status_t &status)
1414 {
1415   if (a == float32_negative_inf || a == float32_positive_inf)
1416     return 0;
1417 
1418   float32 tmp = float32_round_to_int(a, scale, status);
1419   return float32_sub(a, tmp, status);
1420 }
1421 
float64_reduce(float64 a,Bit8u scale,float_status_t & status)1422 static BX_CPP_INLINE float64 float64_reduce(float64 a, Bit8u scale, float_status_t &status)
1423 {
1424   if (a == float64_negative_inf || a == float64_positive_inf)
1425     return 0;
1426 
1427   float64 tmp = float64_round_to_int(a, scale, status);
1428   return float64_sub(a, tmp, status);
1429 }
1430 
VREDUCEPS_MASK_VpsWpsIbR(bxInstruction_c * i)1431 void BX_CPP_AttrRegparmN(1) BX_CPU_C::VREDUCEPS_MASK_VpsWpsIbR(bxInstruction_c *i)
1432 {
1433   BxPackedAvxRegister op = BX_READ_AVX_REG(i->src());
1434   Bit32u opmask = i->opmask() ? BX_READ_16BIT_OPMASK(i->opmask()) : (Bit32u) -1;
1435   unsigned len = i->getVL();
1436 
1437   float_status_t status = mxcsr_to_softfloat_status_word(MXCSR);
1438   softfloat_status_word_rc_override(status, i);
1439 
1440   Bit8u control = i->Ib(), scale = control >> 4;
1441 
1442   // override MXCSR rounding mode with control coming from imm8
1443   if ((control & 0x4) == 0)
1444     status.float_rounding_mode = control & 0x3;
1445   // ignore precision exception result
1446   if (control & 0x8)
1447     status.float_suppress_exception |= float_flag_inexact;
1448 
1449   for (unsigned n=0, mask = 0x1; n < DWORD_ELEMENTS(len); n++, mask <<= 1) {
1450     if (opmask & mask)
1451       op.vmm32u(n) = float32_reduce(op.vmm32u(n), scale, status);
1452     else
1453       op.vmm32u(n) = 0;
1454   }
1455 
1456   check_exceptionsSSE(get_exception_flags(status));
1457 
1458   if (! i->isZeroMasking()) {
1459     for (unsigned n=0; n < len; n++, opmask >>= 4)
1460       xmm_blendps(&BX_READ_AVX_REG_LANE(i->dst(), n), &op.vmm128(n), opmask);
1461     BX_CLEAR_AVX_REGZ(i->dst(), len);
1462   }
1463   else {
1464     BX_WRITE_AVX_REGZ(i->dst(), op, len);
1465   }
1466 
1467   BX_NEXT_INSTR(i);
1468 }
1469 
VREDUCESS_MASK_VssHpsWssIbR(bxInstruction_c * i)1470 void BX_CPP_AttrRegparmN(1) BX_CPU_C::VREDUCESS_MASK_VssHpsWssIbR(bxInstruction_c *i)
1471 {
1472   BxPackedXmmRegister op1 = BX_READ_XMM_REG(i->src1());
1473 
1474   if (! i->opmask() || BX_SCALAR_ELEMENT_MASK(i->opmask())) {
1475     float32 op2 = BX_READ_XMM_REG_LO_DWORD(i->src2());
1476 
1477     Bit8u control = i->Ib(), scale = control >> 4;
1478 
1479     float_status_t status = mxcsr_to_softfloat_status_word(MXCSR);
1480     softfloat_status_word_rc_override(status, i);
1481 
1482     // override MXCSR rounding mode with control coming from imm8
1483     if ((control & 0x4) == 0)
1484       status.float_rounding_mode = control & 0x3;
1485     // ignore precision exception result
1486     if (control & 0x8)
1487       status.float_suppress_exception |= float_flag_inexact;
1488 
1489     op1.xmm32u(0) = float32_reduce(op2, scale, status);
1490 
1491     check_exceptionsSSE(get_exception_flags(status));
1492   }
1493   else {
1494     if (i->isZeroMasking())
1495       op1.xmm32u(0) = 0;
1496     else
1497       op1.xmm32u(0) = BX_READ_XMM_REG_LO_DWORD(i->dst());
1498   }
1499 
1500   BX_WRITE_XMM_REG_CLEAR_HIGH(i->dst(), op1);
1501   BX_NEXT_INSTR(i);
1502 }
1503 
VREDUCEPD_MASK_VpdWpdIbR(bxInstruction_c * i)1504 void BX_CPP_AttrRegparmN(1) BX_CPU_C::VREDUCEPD_MASK_VpdWpdIbR(bxInstruction_c *i)
1505 {
1506   BxPackedAvxRegister op = BX_READ_AVX_REG(i->src());
1507   Bit32u opmask = i->opmask() ? BX_READ_8BIT_OPMASK(i->opmask()) : (Bit32u) -1;
1508   unsigned len = i->getVL();
1509 
1510   float_status_t status = mxcsr_to_softfloat_status_word(MXCSR);
1511   softfloat_status_word_rc_override(status, i);
1512 
1513   Bit8u control = i->Ib(), scale = control >> 4;
1514 
1515   // override MXCSR rounding mode with control coming from imm8
1516   if ((control & 0x4) == 0)
1517     status.float_rounding_mode = control & 0x3;
1518   // ignore precision exception result
1519   if (control & 0x8)
1520     status.float_suppress_exception |= float_flag_inexact;
1521 
1522   for (unsigned n=0, mask = 0x1; n < QWORD_ELEMENTS(len); n++, mask <<= 1) {
1523     if (opmask & mask)
1524       op.vmm64u(n) = float64_reduce(op.vmm64u(n), scale, status);
1525     else
1526       op.vmm64u(n) = 0;
1527   }
1528 
1529   check_exceptionsSSE(get_exception_flags(status));
1530 
1531   if (! i->isZeroMasking()) {
1532     for (unsigned n=0; n < len; n++, opmask >>= 2)
1533       xmm_blendpd(&BX_READ_AVX_REG_LANE(i->dst(), n), &op.vmm128(n), opmask);
1534     BX_CLEAR_AVX_REGZ(i->dst(), len);
1535   }
1536   else {
1537     BX_WRITE_AVX_REGZ(i->dst(), op, len);
1538   }
1539 
1540   BX_NEXT_INSTR(i);
1541 }
1542 
VREDUCESD_MASK_VsdHpdWsdIbR(bxInstruction_c * i)1543 void BX_CPP_AttrRegparmN(1) BX_CPU_C::VREDUCESD_MASK_VsdHpdWsdIbR(bxInstruction_c *i)
1544 {
1545   BxPackedXmmRegister op1 = BX_READ_XMM_REG(i->src1());
1546 
1547   if (! i->opmask() || BX_SCALAR_ELEMENT_MASK(i->opmask())) {
1548     float64 op2 = BX_READ_XMM_REG_LO_QWORD(i->src2());
1549 
1550     Bit8u control = i->Ib(), scale = control >> 4;
1551 
1552     float_status_t status = mxcsr_to_softfloat_status_word(MXCSR);
1553     softfloat_status_word_rc_override(status, i);
1554 
1555     // override MXCSR rounding mode with control coming from imm8
1556     if ((control & 0x4) == 0)
1557       status.float_rounding_mode = control & 0x3;
1558     // ignore precision exception result
1559     if (control & 0x8)
1560       status.float_suppress_exception |= float_flag_inexact;
1561 
1562     op1.xmm64u(0) = float64_reduce(op2, scale, status);
1563 
1564     check_exceptionsSSE(get_exception_flags(status));
1565   }
1566   else {
1567     if (i->isZeroMasking())
1568       op1.xmm64u(0) = 0;
1569     else
1570       op1.xmm64u(0) = BX_READ_XMM_REG_LO_QWORD(i->dst());
1571   }
1572 
1573   BX_WRITE_XMM_REG_CLEAR_HIGH(i->dst(), op1);
1574   BX_NEXT_INSTR(i);
1575 }
1576 
1577 #endif
1578