xref: /qemu/target/i386/ops_sse.h (revision 29b62a10)
1 /*
2  *  MMX/3DNow!/SSE/SSE2/SSE3/SSSE3/SSE4/PNI support
3  *
4  *  Copyright (c) 2005 Fabrice Bellard
5  *  Copyright (c) 2008 Intel Corporation  <andrew.zaborowski@intel.com>
6  *
7  * This library is free software; you can redistribute it and/or
8  * modify it under the terms of the GNU Lesser General Public
9  * License as published by the Free Software Foundation; either
10  * version 2.1 of the License, or (at your option) any later version.
11  *
12  * This library is distributed in the hope that it will be useful,
13  * but WITHOUT ANY WARRANTY; without even the implied warranty of
14  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
15  * Lesser General Public License for more details.
16  *
17  * You should have received a copy of the GNU Lesser General Public
18  * License along with this library; if not, see <http://www.gnu.org/licenses/>.
19  */
20 
21 #include "crypto/aes.h"
22 
/*
 * Width selection.  SHIFT picks the register type (Reg), the element
 * accessors B/W/L/Q and the SUFFIX that glue() appends to helper names.
 */
#if SHIFT == 0
/* SHIFT == 0: MMXReg operands; XMM_ONLY() expands to nothing. */
#define Reg MMXReg
#define XMM_ONLY(...)
#define B(n) MMX_B(n)
#define W(n) MMX_W(n)
#define L(n) MMX_L(n)
#define Q(n) MMX_Q(n)
#define SUFFIX _mmx
#else
/* SHIFT != 0: ZMMReg operands; XMM_ONLY() passes its arguments through. */
#define Reg ZMMReg
#define XMM_ONLY(...) __VA_ARGS__
#define B(n) ZMM_B(n)
#define W(n) ZMM_W(n)
#define L(n) ZMM_L(n)
#define Q(n) ZMM_Q(n)
/* SHIFT == 1 builds the _xmm helpers, any other non-zero SHIFT the _ymm ones. */
#if SHIFT == 1
#define SUFFIX _xmm
#else
#define SUFFIX _ymm
#endif
#endif

/* LANE_WIDTH is 8 when SHIFT == 0 and 16 otherwise; PACK_WIDTH is half of it. */
#define LANE_WIDTH (SHIFT ? 16 : 8)
#define PACK_WIDTH (LANE_WIDTH / 2)
47 
#if SHIFT == 0
/*
 * Per-element shift primitives.  'x' is the element value and 'c' the
 * shift count.  The count is taken from the macro argument rather than
 * from a caller-local variable that happens to be called 'shift', so the
 * macros no longer depend on how the caller names its locals.
 *
 * FPSRAW/FPSRAL reinterpret the element as int16_t/int32_t before
 * shifting right.
 */
#define FPSRL(x, c) ((x) >> (c))
#define FPSRAW(x, c) ((int16_t)(x) >> (c))
#define FPSRAL(x, c) ((int32_t)(x) >> (c))
#define FPSLL(x, c) ((x) << (c))
#endif
54 
55 void glue(helper_psrlw, SUFFIX)(CPUX86State *env, Reg *d, Reg *s, Reg *c)
56 {
57     int shift;
58     if (c->Q(0) > 15) {
59         for (int i = 0; i < 1 << SHIFT; i++) {
60             d->Q(i) = 0;
61         }
62     } else {
63         shift = c->B(0);
64         for (int i = 0; i < 4 << SHIFT; i++) {
65             d->W(i) = FPSRL(s->W(i), shift);
66         }
67     }
68 }
69 
70 void glue(helper_psllw, SUFFIX)(CPUX86State *env, Reg *d, Reg *s, Reg *c)
71 {
72     int shift;
73     if (c->Q(0) > 15) {
74         for (int i = 0; i < 1 << SHIFT; i++) {
75             d->Q(i) = 0;
76         }
77     } else {
78         shift = c->B(0);
79         for (int i = 0; i < 4 << SHIFT; i++) {
80             d->W(i) = FPSLL(s->W(i), shift);
81         }
82     }
83 }
84 
85 void glue(helper_psraw, SUFFIX)(CPUX86State *env, Reg *d, Reg *s, Reg *c)
86 {
87     int shift;
88     if (c->Q(0) > 15) {
89         shift = 15;
90     } else {
91         shift = c->B(0);
92     }
93     for (int i = 0; i < 4 << SHIFT; i++) {
94         d->W(i) = FPSRAW(s->W(i), shift);
95     }
96 }
97 
98 void glue(helper_psrld, SUFFIX)(CPUX86State *env, Reg *d, Reg *s, Reg *c)
99 {
100     int shift;
101     if (c->Q(0) > 31) {
102         for (int i = 0; i < 1 << SHIFT; i++) {
103             d->Q(i) = 0;
104         }
105     } else {
106         shift = c->B(0);
107         for (int i = 0; i < 2 << SHIFT; i++) {
108             d->L(i) = FPSRL(s->L(i), shift);
109         }
110     }
111 }
112 
113 void glue(helper_pslld, SUFFIX)(CPUX86State *env, Reg *d, Reg *s, Reg *c)
114 {
115     int shift;
116     if (c->Q(0) > 31) {
117         for (int i = 0; i < 1 << SHIFT; i++) {
118             d->Q(i) = 0;
119         }
120     } else {
121         shift = c->B(0);
122         for (int i = 0; i < 2 << SHIFT; i++) {
123             d->L(i) = FPSLL(s->L(i), shift);
124         }
125     }
126 }
127 
128 void glue(helper_psrad, SUFFIX)(CPUX86State *env, Reg *d, Reg *s, Reg *c)
129 {
130     int shift;
131     if (c->Q(0) > 31) {
132         shift = 31;
133     } else {
134         shift = c->B(0);
135     }
136     for (int i = 0; i < 2 << SHIFT; i++) {
137         d->L(i) = FPSRAL(s->L(i), shift);
138     }
139 }
140 
141 void glue(helper_psrlq, SUFFIX)(CPUX86State *env, Reg *d, Reg *s, Reg *c)
142 {
143     int shift;
144     if (c->Q(0) > 63) {
145         for (int i = 0; i < 1 << SHIFT; i++) {
146             d->Q(i) = 0;
147         }
148     } else {
149         shift = c->B(0);
150         for (int i = 0; i < 1 << SHIFT; i++) {
151             d->Q(i) = FPSRL(s->Q(i), shift);
152         }
153     }
154 }
155 
156 void glue(helper_psllq, SUFFIX)(CPUX86State *env, Reg *d, Reg *s, Reg *c)
157 {
158     int shift;
159     if (c->Q(0) > 63) {
160         for (int i = 0; i < 1 << SHIFT; i++) {
161             d->Q(i) = 0;
162         }
163     } else {
164         shift = c->B(0);
165         for (int i = 0; i < 1 << SHIFT; i++) {
166             d->Q(i) = FPSLL(s->Q(i), shift);
167         }
168     }
169 }
170 
171 #if SHIFT >= 1
172 void glue(helper_psrldq, SUFFIX)(CPUX86State *env, Reg *d, Reg *s, Reg *c)
173 {
174     int shift, i, j;
175 
176     shift = c->L(0);
177     if (shift > 16) {
178         shift = 16;
179     }
180     for (j = 0; j < 8 << SHIFT; j += LANE_WIDTH) {
181         for (i = 0; i < 16 - shift; i++) {
182             d->B(j + i) = s->B(j + i + shift);
183         }
184         for (i = 16 - shift; i < 16; i++) {
185             d->B(j + i) = 0;
186         }
187     }
188 }
189 
190 void glue(helper_pslldq, SUFFIX)(CPUX86State *env, Reg *d, Reg *s, Reg *c)
191 {
192     int shift, i, j;
193 
194     shift = c->L(0);
195     if (shift > 16) {
196         shift = 16;
197     }
198     for (j = 0; j < 8 << SHIFT; j += LANE_WIDTH) {
199         for (i = 15; i >= shift; i--) {
200             d->B(j + i) = s->B(j + i - shift);
201         }
202         for (i = 0; i < shift; i++) {
203             d->B(j + i) = 0;
204         }
205     }
206 }
207 #endif
208 
/*
 * Define a unary element-wise helper named name##SUFFIX: for each of the
 * 'num' elements accessed through 'elem', d->elem(i) = F(s->elem(i)).
 */
#define SSE_HELPER_1(name, elem, num, F)                        \
    void glue(name, SUFFIX)(CPUX86State *env, Reg *d, Reg *s)   \
    {                                                           \
        int n = num;                                            \
        for (int i = 0; i < n; i++) {                           \
            d->elem(i) = F(s->elem(i));                         \
        }                                                       \
    }
217 
/*
 * Define a binary element-wise helper named name##SUFFIX: for each of the
 * 'num' elements accessed through 'elem',
 * d->elem(i) = F(v->elem(i), s->elem(i)).
 */
#define SSE_HELPER_2(name, elem, num, F)                        \
    void glue(name, SUFFIX)(CPUX86State *env, Reg *d, Reg *v, Reg *s)   \
    {                                                           \
        int n = num;                                            \
        for (int i = 0; i < n; i++) {                           \
            d->elem(i) = F(v->elem(i), s->elem(i));             \
        }                                                       \
    }
226 
/*
 * Convenience wrappers around SSE_HELPER_2 for the B, W, L and Q element
 * accessors, with element counts 8, 4, 2 and 1 scaled by SHIFT.
 */
#define SSE_HELPER_B(name, F)                                   \
    SSE_HELPER_2(name, B, 8 << SHIFT, F)

#define SSE_HELPER_W(name, F)                                   \
    SSE_HELPER_2(name, W, 4 << SHIFT, F)

#define SSE_HELPER_L(name, F)                                   \
    SSE_HELPER_2(name, L, 2 << SHIFT, F)

#define SSE_HELPER_Q(name, F)                                   \
    SSE_HELPER_2(name, Q, 1 << SHIFT, F)
238 
239 #if SHIFT == 0
/* Clamp x to the range [0, 255]. */
static inline int satub(int x)
{
    return x < 0 ? 0 : (x > 255 ? 255 : x);
}
250 
/* Clamp x to the range [0, 65535]. */
static inline int satuw(int x)
{
    return x < 0 ? 0 : (x > 65535 ? 65535 : x);
}
261 
/* Clamp x to the range [-128, 127]. */
static inline int satsb(int x)
{
    return x < -128 ? -128 : (x > 127 ? 127 : x);
}
272 
/* Clamp x to the range [-32768, 32767]. */
static inline int satsw(int x)
{
    return x < -32768 ? -32768 : (x > 32767 ? 32767 : x);
}
283 
/*
 * Scalar element operations plugged into the SSE_HELPER_* generators.
 * The *U* variants saturate with satub()/satuw(), the *S* variants
 * reinterpret their operands as signed before saturating with
 * satsb()/satsw().
 */
#define FADD(a, b) ((a) + (b))
#define FADDUB(a, b) satub((a) + (b))
#define FADDUW(a, b) satuw((a) + (b))
#define FADDSB(a, b) satsb((int8_t)(a) + (int8_t)(b))
#define FADDSW(a, b) satsw((int16_t)(a) + (int16_t)(b))

#define FSUB(a, b) ((a) - (b))
#define FSUBUB(a, b) satub((a) - (b))
#define FSUBUW(a, b) satuw((a) - (b))
#define FSUBSB(a, b) satsb((int8_t)(a) - (int8_t)(b))
#define FSUBSW(a, b) satsw((int16_t)(a) - (int16_t)(b))
/* The whole expansion is parenthesized so it is safe inside expressions. */
#define FMINUB(a, b) (((a) < (b)) ? (a) : (b))
#define FMINSW(a, b) (((int16_t)(a) < (int16_t)(b)) ? (a) : (b))
#define FMAXUB(a, b) (((a) > (b)) ? (a) : (b))
#define FMAXSW(a, b) (((int16_t)(a) > (int16_t)(b)) ? (a) : (b))

/* High 16 bits of the 16x16 product; FMULHRW adds 0x8000 first to round. */
#define FMULHRW(a, b) (((int16_t)(a) * (int16_t)(b) + 0x8000) >> 16)
/* Multiply as uint32_t: two promoted ints could overflow (0xffff * 0xffff). */
#define FMULHUW(a, b) ((uint32_t)(a) * (uint32_t)(b) >> 16)
#define FMULHW(a, b) ((int16_t)(a) * (int16_t)(b) >> 16)

/* Average rounded up: (a + b + 1) / 2. */
#define FAVG(a, b) (((a) + (b) + 1) >> 1)
305 #endif
306 
/* Per-16-bit-element helpers built from FMULHUW and FMULHW. */
SSE_HELPER_W(helper_pmulhuw, FMULHUW)
SSE_HELPER_W(helper_pmulhw, FMULHW)
309 
310 #if SHIFT == 0
311 void glue(helper_pmulhrw, SUFFIX)(CPUX86State *env, Reg *d, Reg *s)
312 {
313     d->W(0) = FMULHRW(d->W(0), s->W(0));
314     d->W(1) = FMULHRW(d->W(1), s->W(1));
315     d->W(2) = FMULHRW(d->W(2), s->W(2));
316     d->W(3) = FMULHRW(d->W(3), s->W(3));
317 }
318 #endif
319 
/* Per-byte and per-16-bit-element averaging helpers built from FAVG. */
SSE_HELPER_B(helper_pavgb, FAVG)
SSE_HELPER_W(helper_pavgw, FAVG)
322 
323 void glue(helper_pmuludq, SUFFIX)(CPUX86State *env, Reg *d, Reg *v, Reg *s)
324 {
325     int i;
326 
327     for (i = 0; i < (1 << SHIFT); i++) {
328         d->Q(i) = (uint64_t)s->L(i * 2) * (uint64_t)v->L(i * 2);
329     }
330 }
331 
332 void glue(helper_pmaddwd, SUFFIX)(CPUX86State *env, Reg *d, Reg *v, Reg *s)
333 {
334     int i;
335 
336     for (i = 0; i < (2 << SHIFT); i++) {
337         d->L(i) = (int16_t)s->W(2 * i) * (int16_t)v->W(2 * i) +
338             (int16_t)s->W(2 * i + 1) * (int16_t)v->W(2 * i + 1);
339     }
340 }
341 
342 #if SHIFT == 0
/* Absolute value of a (plain negation for negative inputs). */
static inline int abs1(int a)
{
    return a < 0 ? -a : a;
}
351 #endif
352 void glue(helper_psadbw, SUFFIX)(CPUX86State *env, Reg *d, Reg *v, Reg *s)
353 {
354     int i;
355 
356     for (i = 0; i < (1 << SHIFT); i++) {
357         unsigned int val = 0;
358         val += abs1(v->B(8 * i + 0) - s->B(8 * i + 0));
359         val += abs1(v->B(8 * i + 1) - s->B(8 * i + 1));
360         val += abs1(v->B(8 * i + 2) - s->B(8 * i + 2));
361         val += abs1(v->B(8 * i + 3) - s->B(8 * i + 3));
362         val += abs1(v->B(8 * i + 4) - s->B(8 * i + 4));
363         val += abs1(v->B(8 * i + 5) - s->B(8 * i + 5));
364         val += abs1(v->B(8 * i + 6) - s->B(8 * i + 6));
365         val += abs1(v->B(8 * i + 7) - s->B(8 * i + 7));
366         d->Q(i) = val;
367     }
368 }
369 
370 #if SHIFT < 2
371 void glue(helper_maskmov, SUFFIX)(CPUX86State *env, Reg *d, Reg *s,
372                                   target_ulong a0)
373 {
374     int i;
375 
376     for (i = 0; i < (8 << SHIFT); i++) {
377         if (s->B(i) & 0x80) {
378             cpu_stb_data_ra(env, a0 + i, d->B(i), GETPC());
379         }
380     }
381 }
382 #endif
383 
/*
 * Shuffle four elements accessed through F.  The two low 2-bit fields of
 * 'order' pick elements from 'a', the two high fields pick from 'b', all
 * relative to 'offset'.  All four values are read into r0..r3 before any
 * store to d.  The macro uses 'order', 'd' and r0..r3 from the enclosing
 * scope.
 */
#define SHUFFLE4(F, a, b, offset) do {      \
    r0 = a->F((order & 3) + offset);        \
    r1 = a->F(((order >> 2) & 3) + offset); \
    r2 = b->F(((order >> 4) & 3) + offset); \
    r3 = b->F(((order >> 6) & 3) + offset); \
    d->F(offset) = r0;                      \
    d->F(offset + 1) = r1;                  \
    d->F(offset + 2) = r2;                  \
    d->F(offset + 3) = r3;                  \
    } while (0)
394 
395 #if SHIFT == 0
396 void glue(helper_pshufw, SUFFIX)(Reg *d, Reg *s, int order)
397 {
398     uint16_t r0, r1, r2, r3;
399 
400     SHUFFLE4(W, s, s, 0);
401 }
402 #else
403 void glue(helper_shufps, SUFFIX)(Reg *d, Reg *v, Reg *s, int order)
404 {
405     uint32_t r0, r1, r2, r3;
406     int i;
407 
408     for (i = 0; i < 2 << SHIFT; i += 4) {
409         SHUFFLE4(L, v, s, i);
410     }
411 }
412 
413 void glue(helper_shufpd, SUFFIX)(Reg *d, Reg *v, Reg *s, int order)
414 {
415     uint64_t r0, r1;
416     int i;
417 
418     for (i = 0; i < 1 << SHIFT; i += 2) {
419         r0 = v->Q(((order & 1) & 1) + i);
420         r1 = s->Q(((order >> 1) & 1) + i);
421         d->Q(i) = r0;
422         d->Q(i + 1) = r1;
423         order >>= 2;
424     }
425 }
426 
427 void glue(helper_pshufd, SUFFIX)(Reg *d, Reg *s, int order)
428 {
429     uint32_t r0, r1, r2, r3;
430     int i;
431 
432     for (i = 0; i < 2 << SHIFT; i += 4) {
433         SHUFFLE4(L, s, s, i);
434     }
435 }
436 
437 void glue(helper_pshuflw, SUFFIX)(Reg *d, Reg *s, int order)
438 {
439     uint16_t r0, r1, r2, r3;
440     int i, j;
441 
442     for (i = 0, j = 1; j < 1 << SHIFT; i += 8, j += 2) {
443         SHUFFLE4(W, s, s, i);
444         d->Q(j) = s->Q(j);
445     }
446 }
447 
448 void glue(helper_pshufhw, SUFFIX)(Reg *d, Reg *s, int order)
449 {
450     uint16_t r0, r1, r2, r3;
451     int i, j;
452 
453     for (i = 4, j = 0; j < 1 << SHIFT; i += 8, j += 2) {
454         d->Q(j) = s->Q(j);
455         SHUFFLE4(W, s, s, i);
456     }
457 }
458 #endif
459 
460 #if SHIFT >= 1
461 /* FPU ops */
462 /* XXX: not accurate */
463 
/*
 * Define the packed helpers helper_<name>ps and helper_<name>pd (with
 * SUFFIX appended).  F is invoked as F(32, a, b) on each ZMM_S element
 * and as F(64, a, b) on each ZMM_D element; the first argument is the
 * operand size that F pastes into the softfloat function name.
 */
#define SSE_HELPER_P(name, F)                                           \
    void glue(helper_ ## name ## ps, SUFFIX)(CPUX86State *env,          \
            Reg *d, Reg *v, Reg *s)                                     \
    {                                                                   \
        int i;                                                          \
        for (i = 0; i < 2 << SHIFT; i++) {                              \
            d->ZMM_S(i) = F(32, v->ZMM_S(i), s->ZMM_S(i));              \
        }                                                               \
    }                                                                   \
                                                                        \
    void glue(helper_ ## name ## pd, SUFFIX)(CPUX86State *env,          \
            Reg *d, Reg *v, Reg *s)                                     \
    {                                                                   \
        int i;                                                          \
        for (i = 0; i < 1 << SHIFT; i++) {                              \
            d->ZMM_D(i) = F(64, v->ZMM_D(i), s->ZMM_D(i));              \
        }                                                               \
    }
482 
/*
 * SSE_HELPER_S: when SHIFT == 1 it defines the packed helpers plus the
 * scalar helper_<name>ss / helper_<name>sd, which apply F to element 0
 * only and copy the remaining elements from v.  The scalar helpers carry
 * no SUFFIX.  For other SHIFT values it is just SSE_HELPER_P.
 */
#if SHIFT == 1

#define SSE_HELPER_S(name, F)                                           \
    SSE_HELPER_P(name, F)                                               \
                                                                        \
    void helper_ ## name ## ss(CPUX86State *env, Reg *d, Reg *v, Reg *s)\
    {                                                                   \
        int i;                                                          \
        d->ZMM_S(0) = F(32, v->ZMM_S(0), s->ZMM_S(0));                  \
        for (i = 1; i < 2 << SHIFT; i++) {                              \
            d->ZMM_L(i) = v->ZMM_L(i);                                  \
        }                                                               \
    }                                                                   \
                                                                        \
    void helper_ ## name ## sd(CPUX86State *env, Reg *d, Reg *v, Reg *s)\
    {                                                                   \
        int i;                                                          \
        d->ZMM_D(0) = F(64, v->ZMM_D(0), s->ZMM_D(0));                  \
        for (i = 1; i < 1 << SHIFT; i++) {                              \
            d->ZMM_Q(i) = v->ZMM_Q(i);                                  \
        }                                                               \
    }

#else

#define SSE_HELPER_S(name, F) SSE_HELPER_P(name, F)

#endif
511 
/*
 * Floating-point operations for SSE_HELPER_S.  'size' (32 or 64) is pasted
 * into the softfloat function name; all of them use env->sse_status from
 * the enclosing helper.
 */
#define FPU_ADD(size, a, b) float ## size ## _add(a, b, &env->sse_status)
#define FPU_SUB(size, a, b) float ## size ## _sub(a, b, &env->sse_status)
#define FPU_MUL(size, a, b) float ## size ## _mul(a, b, &env->sse_status)
#define FPU_DIV(size, a, b) float ## size ## _div(a, b, &env->sse_status)

/* Note that the choice of comparison op here is important to get the
 * special cases right: for min and max Intel specifies that (-0,0),
 * (NaN, anything) and (anything, NaN) return the second argument.
 */
#define FPU_MIN(size, a, b)                                     \
    (float ## size ## _lt(a, b, &env->sse_status) ? (a) : (b))
#define FPU_MAX(size, a, b)                                     \
    (float ## size ## _lt(b, a, &env->sse_status) ? (a) : (b))

/* Instantiate the add/sub/mul/div/min/max helper families. */
SSE_HELPER_S(add, FPU_ADD)
SSE_HELPER_S(sub, FPU_SUB)
SSE_HELPER_S(mul, FPU_MUL)
SSE_HELPER_S(div, FPU_DIV)
SSE_HELPER_S(min, FPU_MIN)
SSE_HELPER_S(max, FPU_MAX)
532 
533 void glue(helper_sqrtps, SUFFIX)(CPUX86State *env, Reg *d, Reg *s)
534 {
535     int i;
536     for (i = 0; i < 2 << SHIFT; i++) {
537         d->ZMM_S(i) = float32_sqrt(s->ZMM_S(i), &env->sse_status);
538     }
539 }
540 
541 void glue(helper_sqrtpd, SUFFIX)(CPUX86State *env, Reg *d, Reg *s)
542 {
543     int i;
544     for (i = 0; i < 1 << SHIFT; i++) {
545         d->ZMM_D(i) = float64_sqrt(s->ZMM_D(i), &env->sse_status);
546     }
547 }
548 
549 #if SHIFT == 1
550 void helper_sqrtss(CPUX86State *env, Reg *d, Reg *v, Reg *s)
551 {
552     int i;
553     d->ZMM_S(0) = float32_sqrt(s->ZMM_S(0), &env->sse_status);
554     for (i = 1; i < 2 << SHIFT; i++) {
555         d->ZMM_L(i) = v->ZMM_L(i);
556     }
557 }
558 
559 void helper_sqrtsd(CPUX86State *env, Reg *d, Reg *v, Reg *s)
560 {
561     int i;
562     d->ZMM_D(0) = float64_sqrt(s->ZMM_D(0), &env->sse_status);
563     for (i = 1; i < 1 << SHIFT; i++) {
564         d->ZMM_Q(i) = v->ZMM_Q(i);
565     }
566 }
567 #endif
568 
569 /* float to float conversions */
570 void glue(helper_cvtps2pd, SUFFIX)(CPUX86State *env, Reg *d, Reg *s)
571 {
572     int i;
573     for (i = 1 << SHIFT; --i >= 0; ) {
574         d->ZMM_D(i) = float32_to_float64(s->ZMM_S(i), &env->sse_status);
575     }
576 }
577 
578 void glue(helper_cvtpd2ps, SUFFIX)(CPUX86State *env, Reg *d, Reg *s)
579 {
580     int i;
581     for (i = 0; i < 1 << SHIFT; i++) {
582          d->ZMM_S(i) = float64_to_float32(s->ZMM_D(i), &env->sse_status);
583     }
584     for (i >>= 1; i < 1 << SHIFT; i++) {
585          d->Q(i) = 0;
586     }
587 }
588 
589 #if SHIFT >= 1
590 void glue(helper_cvtph2ps, SUFFIX)(CPUX86State *env, Reg *d, Reg *s)
591 {
592     int i;
593 
594     for (i = 2 << SHIFT; --i >= 0; ) {
595          d->ZMM_S(i) = float16_to_float32(s->ZMM_H(i), true, &env->sse_status);
596     }
597 }
598 
599 void glue(helper_cvtps2ph, SUFFIX)(CPUX86State *env, Reg *d, Reg *s, int mode)
600 {
601     int i;
602     FloatRoundMode prev_rounding_mode = env->sse_status.float_rounding_mode;
603     if (!(mode & (1 << 2))) {
604         set_x86_rounding_mode(mode & 3, &env->sse_status);
605     }
606 
607     for (i = 0; i < 2 << SHIFT; i++) {
608         d->ZMM_H(i) = float32_to_float16(s->ZMM_S(i), true, &env->sse_status);
609     }
610     for (i >>= 2; i < 1 << SHIFT; i++) {
611         d->Q(i) = 0;
612     }
613 
614     env->sse_status.float_rounding_mode = prev_rounding_mode;
615 }
616 #endif
617 
618 #if SHIFT == 1
619 void helper_cvtss2sd(CPUX86State *env, Reg *d, Reg *v, Reg *s)
620 {
621     int i;
622     d->ZMM_D(0) = float32_to_float64(s->ZMM_S(0), &env->sse_status);
623     for (i = 1; i < 1 << SHIFT; i++) {
624         d->ZMM_Q(i) = v->ZMM_Q(i);
625     }
626 }
627 
628 void helper_cvtsd2ss(CPUX86State *env, Reg *d, Reg *v, Reg *s)
629 {
630     int i;
631     d->ZMM_S(0) = float64_to_float32(s->ZMM_D(0), &env->sse_status);
632     for (i = 1; i < 2 << SHIFT; i++) {
633         d->ZMM_L(i) = v->ZMM_L(i);
634     }
635 }
636 #endif
637 
638 /* integer to float */
639 void glue(helper_cvtdq2ps, SUFFIX)(CPUX86State *env, Reg *d, Reg *s)
640 {
641     int i;
642     for (i = 0; i < 2 << SHIFT; i++) {
643         d->ZMM_S(i) = int32_to_float32(s->ZMM_L(i), &env->sse_status);
644     }
645 }
646 
647 void glue(helper_cvtdq2pd, SUFFIX)(CPUX86State *env, Reg *d, Reg *s)
648 {
649     int i;
650     for (i = 1 << SHIFT; --i >= 0; ) {
651         int32_t l = s->ZMM_L(i);
652         d->ZMM_D(i) = int32_to_float64(l, &env->sse_status);
653     }
654 }
655 
656 #if SHIFT == 1
657 void helper_cvtpi2ps(CPUX86State *env, ZMMReg *d, MMXReg *s)
658 {
659     d->ZMM_S(0) = int32_to_float32(s->MMX_L(0), &env->sse_status);
660     d->ZMM_S(1) = int32_to_float32(s->MMX_L(1), &env->sse_status);
661 }
662 
663 void helper_cvtpi2pd(CPUX86State *env, ZMMReg *d, MMXReg *s)
664 {
665     d->ZMM_D(0) = int32_to_float64(s->MMX_L(0), &env->sse_status);
666     d->ZMM_D(1) = int32_to_float64(s->MMX_L(1), &env->sse_status);
667 }
668 
669 void helper_cvtsi2ss(CPUX86State *env, ZMMReg *d, uint32_t val)
670 {
671     d->ZMM_S(0) = int32_to_float32(val, &env->sse_status);
672 }
673 
674 void helper_cvtsi2sd(CPUX86State *env, ZMMReg *d, uint32_t val)
675 {
676     d->ZMM_D(0) = int32_to_float64(val, &env->sse_status);
677 }
678 
679 #ifdef TARGET_X86_64
680 void helper_cvtsq2ss(CPUX86State *env, ZMMReg *d, uint64_t val)
681 {
682     d->ZMM_S(0) = int64_to_float32(val, &env->sse_status);
683 }
684 
685 void helper_cvtsq2sd(CPUX86State *env, ZMMReg *d, uint64_t val)
686 {
687     d->ZMM_D(0) = int64_to_float64(val, &env->sse_status);
688 }
689 #endif
690 
691 #endif
692 
693 /* float to integer */
694 
695 #if SHIFT == 1
/*
 * x86 mandates that we return the indefinite integer value for the result
 * of any float-to-integer conversion that raises the 'invalid' exception.
 * Wrap the softfloat functions to get this behaviour.
 *
 * Each wrapper x86_<FN> clears the accumulated exception flags, runs FN,
 * substitutes INDEFVALUE if the call raised float_flag_invalid, and then
 * merges the new flags back into the previously accumulated ones.
 */
#define WRAP_FLOATCONV(RETTYPE, FN, FLOATTYPE, INDEFVALUE)              \
    static inline RETTYPE x86_##FN(FLOATTYPE a, float_status *s)        \
    {                                                                   \
        int oldflags, newflags;                                         \
        RETTYPE r;                                                      \
                                                                        \
        oldflags = get_float_exception_flags(s);                        \
        set_float_exception_flags(0, s);                                \
        r = FN(a, s);                                                   \
        newflags = get_float_exception_flags(s);                        \
        if (newflags & float_flag_invalid) {                            \
            r = INDEFVALUE;                                             \
        }                                                               \
        set_float_exception_flags(newflags | oldflags, s);              \
        return r;                                                       \
    }

/* 32-bit results use INT32_MIN as the indefinite value, 64-bit INT64_MIN. */
WRAP_FLOATCONV(int32_t, float32_to_int32, float32, INT32_MIN)
WRAP_FLOATCONV(int32_t, float32_to_int32_round_to_zero, float32, INT32_MIN)
WRAP_FLOATCONV(int32_t, float64_to_int32, float64, INT32_MIN)
WRAP_FLOATCONV(int32_t, float64_to_int32_round_to_zero, float64, INT32_MIN)
WRAP_FLOATCONV(int64_t, float32_to_int64, float32, INT64_MIN)
WRAP_FLOATCONV(int64_t, float32_to_int64_round_to_zero, float32, INT64_MIN)
WRAP_FLOATCONV(int64_t, float64_to_int64, float64, INT64_MIN)
WRAP_FLOATCONV(int64_t, float64_to_int64_round_to_zero, float64, INT64_MIN)
726 #endif
727 
728 void glue(helper_cvtps2dq, SUFFIX)(CPUX86State *env, ZMMReg *d, ZMMReg *s)
729 {
730     int i;
731     for (i = 0; i < 2 << SHIFT; i++) {
732         d->ZMM_L(i) = x86_float32_to_int32(s->ZMM_S(i), &env->sse_status);
733     }
734 }
735 
736 void glue(helper_cvtpd2dq, SUFFIX)(CPUX86State *env, ZMMReg *d, ZMMReg *s)
737 {
738     int i;
739     for (i = 0; i < 1 << SHIFT; i++) {
740         d->ZMM_L(i) = x86_float64_to_int32(s->ZMM_D(i), &env->sse_status);
741     }
742     for (i >>= 1; i < 1 << SHIFT; i++) {
743          d->Q(i) = 0;
744     }
745 }
746 
747 #if SHIFT == 1
748 void helper_cvtps2pi(CPUX86State *env, MMXReg *d, ZMMReg *s)
749 {
750     d->MMX_L(0) = x86_float32_to_int32(s->ZMM_S(0), &env->sse_status);
751     d->MMX_L(1) = x86_float32_to_int32(s->ZMM_S(1), &env->sse_status);
752 }
753 
754 void helper_cvtpd2pi(CPUX86State *env, MMXReg *d, ZMMReg *s)
755 {
756     d->MMX_L(0) = x86_float64_to_int32(s->ZMM_D(0), &env->sse_status);
757     d->MMX_L(1) = x86_float64_to_int32(s->ZMM_D(1), &env->sse_status);
758 }
759 
760 int32_t helper_cvtss2si(CPUX86State *env, ZMMReg *s)
761 {
762     return x86_float32_to_int32(s->ZMM_S(0), &env->sse_status);
763 }
764 
765 int32_t helper_cvtsd2si(CPUX86State *env, ZMMReg *s)
766 {
767     return x86_float64_to_int32(s->ZMM_D(0), &env->sse_status);
768 }
769 
770 #ifdef TARGET_X86_64
771 int64_t helper_cvtss2sq(CPUX86State *env, ZMMReg *s)
772 {
773     return x86_float32_to_int64(s->ZMM_S(0), &env->sse_status);
774 }
775 
776 int64_t helper_cvtsd2sq(CPUX86State *env, ZMMReg *s)
777 {
778     return x86_float64_to_int64(s->ZMM_D(0), &env->sse_status);
779 }
780 #endif
781 #endif
782 
783 /* float to integer truncated */
784 void glue(helper_cvttps2dq, SUFFIX)(CPUX86State *env, ZMMReg *d, ZMMReg *s)
785 {
786     int i;
787     for (i = 0; i < 2 << SHIFT; i++) {
788         d->ZMM_L(i) = x86_float32_to_int32_round_to_zero(s->ZMM_S(i),
789                                                          &env->sse_status);
790     }
791 }
792 
793 void glue(helper_cvttpd2dq, SUFFIX)(CPUX86State *env, ZMMReg *d, ZMMReg *s)
794 {
795     int i;
796     for (i = 0; i < 1 << SHIFT; i++) {
797         d->ZMM_L(i) = x86_float64_to_int32_round_to_zero(s->ZMM_D(i),
798                                                          &env->sse_status);
799     }
800     for (i >>= 1; i < 1 << SHIFT; i++) {
801          d->Q(i) = 0;
802     }
803 }
804 
805 #if SHIFT == 1
806 void helper_cvttps2pi(CPUX86State *env, MMXReg *d, ZMMReg *s)
807 {
808     d->MMX_L(0) = x86_float32_to_int32_round_to_zero(s->ZMM_S(0), &env->sse_status);
809     d->MMX_L(1) = x86_float32_to_int32_round_to_zero(s->ZMM_S(1), &env->sse_status);
810 }
811 
812 void helper_cvttpd2pi(CPUX86State *env, MMXReg *d, ZMMReg *s)
813 {
814     d->MMX_L(0) = x86_float64_to_int32_round_to_zero(s->ZMM_D(0), &env->sse_status);
815     d->MMX_L(1) = x86_float64_to_int32_round_to_zero(s->ZMM_D(1), &env->sse_status);
816 }
817 
818 int32_t helper_cvttss2si(CPUX86State *env, ZMMReg *s)
819 {
820     return x86_float32_to_int32_round_to_zero(s->ZMM_S(0), &env->sse_status);
821 }
822 
823 int32_t helper_cvttsd2si(CPUX86State *env, ZMMReg *s)
824 {
825     return x86_float64_to_int32_round_to_zero(s->ZMM_D(0), &env->sse_status);
826 }
827 
828 #ifdef TARGET_X86_64
829 int64_t helper_cvttss2sq(CPUX86State *env, ZMMReg *s)
830 {
831     return x86_float32_to_int64_round_to_zero(s->ZMM_S(0), &env->sse_status);
832 }
833 
834 int64_t helper_cvttsd2sq(CPUX86State *env, ZMMReg *s)
835 {
836     return x86_float64_to_int64_round_to_zero(s->ZMM_D(0), &env->sse_status);
837 }
838 #endif
839 #endif
840 
841 void glue(helper_rsqrtps, SUFFIX)(CPUX86State *env, ZMMReg *d, ZMMReg *s)
842 {
843     uint8_t old_flags = get_float_exception_flags(&env->sse_status);
844     int i;
845     for (i = 0; i < 2 << SHIFT; i++) {
846         d->ZMM_S(i) = float32_div(float32_one,
847                                   float32_sqrt(s->ZMM_S(i), &env->sse_status),
848                                   &env->sse_status);
849     }
850     set_float_exception_flags(old_flags, &env->sse_status);
851 }
852 
853 #if SHIFT == 1
854 void helper_rsqrtss(CPUX86State *env, ZMMReg *d, ZMMReg *v, ZMMReg *s)
855 {
856     uint8_t old_flags = get_float_exception_flags(&env->sse_status);
857     int i;
858     d->ZMM_S(0) = float32_div(float32_one,
859                               float32_sqrt(s->ZMM_S(0), &env->sse_status),
860                               &env->sse_status);
861     set_float_exception_flags(old_flags, &env->sse_status);
862     for (i = 1; i < 2 << SHIFT; i++) {
863         d->ZMM_L(i) = v->ZMM_L(i);
864     }
865 }
866 #endif
867 
868 void glue(helper_rcpps, SUFFIX)(CPUX86State *env, ZMMReg *d, ZMMReg *s)
869 {
870     uint8_t old_flags = get_float_exception_flags(&env->sse_status);
871     int i;
872     for (i = 0; i < 2 << SHIFT; i++) {
873         d->ZMM_S(i) = float32_div(float32_one, s->ZMM_S(i), &env->sse_status);
874     }
875     set_float_exception_flags(old_flags, &env->sse_status);
876 }
877 
878 #if SHIFT == 1
879 void helper_rcpss(CPUX86State *env, ZMMReg *d, ZMMReg *v, ZMMReg *s)
880 {
881     uint8_t old_flags = get_float_exception_flags(&env->sse_status);
882     int i;
883     d->ZMM_S(0) = float32_div(float32_one, s->ZMM_S(0), &env->sse_status);
884     for (i = 1; i < 2 << SHIFT; i++) {
885         d->ZMM_L(i) = v->ZMM_L(i);
886     }
887     set_float_exception_flags(old_flags, &env->sse_status);
888 }
889 #endif
890 
891 #if SHIFT == 1
/*
 * Extract a bit field from src: shift right by 'shift' and keep the low
 * 'len' bits.  A length of 0 keeps all 64 bits.
 */
static inline uint64_t helper_extrq(uint64_t src, int shift, int len)
{
    uint64_t mask = ~(uint64_t)0;

    if (len != 0) {
        mask = (1ULL << len) - 1;
    }
    return (src >> shift) & mask;
}
903 
904 void helper_extrq_r(CPUX86State *env, ZMMReg *d, ZMMReg *s)
905 {
906     d->ZMM_Q(0) = helper_extrq(d->ZMM_Q(0), s->ZMM_B(1) & 63, s->ZMM_B(0) & 63);
907 }
908 
909 void helper_extrq_i(CPUX86State *env, ZMMReg *d, int index, int length)
910 {
911     d->ZMM_Q(0) = helper_extrq(d->ZMM_Q(0), index, length);
912 }
913 
/*
 * Insert the low 'len' bits of src into dest at bit position 'shift',
 * leaving the other bits of dest untouched.  A length of 0 means all
 * 64 bits.
 */
static inline uint64_t helper_insertq(uint64_t dest, uint64_t src, int shift, int len)
{
    uint64_t mask = ~(uint64_t)0;
    uint64_t kept, field;

    if (len != 0) {
        mask = (1ULL << len) - 1;
    }
    kept = dest & ~(mask << shift);
    field = (src & mask) << shift;
    return kept | field;
}
925 
926 void helper_insertq_r(CPUX86State *env, ZMMReg *d, ZMMReg *s)
927 {
928     d->ZMM_Q(0) = helper_insertq(d->ZMM_Q(0), s->ZMM_Q(0), s->ZMM_B(9) & 63, s->ZMM_B(8) & 63);
929 }
930 
931 void helper_insertq_i(CPUX86State *env, ZMMReg *d, ZMMReg *s, int index, int length)
932 {
933     d->ZMM_Q(0) = helper_insertq(d->ZMM_Q(0), s->ZMM_Q(0), index, length);
934 }
935 #endif
936 
/*
 * Define a horizontal single-precision helper.  Within each group of
 * LANE_WIDTH / 4 elements, F is applied to adjacent pairs of v (filling
 * the low two results) and then to adjacent pairs of s (filling the high
 * two).  Results are collected in r[] and copied to d only at the end, so
 * all inputs are read before d is written.
 */
#define SSE_HELPER_HPS(name, F)  \
void glue(helper_ ## name, SUFFIX)(CPUX86State *env, Reg *d, Reg *v, Reg *s) \
{                                                                 \
    float32 r[2 << SHIFT];                                        \
    int i, j, k;                                                  \
    for (k = 0; k < 2 << SHIFT; k += LANE_WIDTH / 4) {            \
        for (i = j = 0; j < 4; i++, j += 2) {                     \
            r[i + k] = F(v->ZMM_S(j + k), v->ZMM_S(j + k + 1), &env->sse_status); \
        }                                                         \
        for (j = 0; j < 4; i++, j += 2) {                         \
            r[i + k] = F(s->ZMM_S(j + k), s->ZMM_S(j + k + 1), &env->sse_status); \
        }                                                         \
    }                                                             \
    for (i = 0; i < 2 << SHIFT; i++) {                            \
        d->ZMM_S(i) = r[i];                                       \
    }                                                             \
}

/* Horizontal add and subtract on single-precision elements. */
SSE_HELPER_HPS(haddps, float32_add)
SSE_HELPER_HPS(hsubps, float32_sub)
957 
958 #define SSE_HELPER_HPD(name, F)  \
959 void glue(helper_ ## name, SUFFIX)(CPUX86State *env, Reg *d, Reg *v, Reg *s) \
960 {                                                                 \
961     float64 r[1 << SHIFT];                                        \
962     int i, j, k;                                                  \
963     for (k = 0; k < 1 << SHIFT; k += LANE_WIDTH / 8) {            \
964         for (i = j = 0; j < 2; i++, j += 2) {                     \
965             r[i + k] = F(v->ZMM_D(j + k), v->ZMM_D(j + k + 1), &env->sse_status); \
966         }                                                         \
967         for (j = 0; j < 2; i++, j += 2) {                         \
968             r[i + k] = F(s->ZMM_D(j + k), s->ZMM_D(j + k + 1), &env->sse_status); \
969         }                                                         \
970     }                                                             \
971     for (i = 0; i < 1 << SHIFT; i++) {                            \
972         d->ZMM_D(i) = r[i];                                       \
973     }                                                             \
974 }
975 
976 SSE_HELPER_HPD(haddpd, float64_add)
977 SSE_HELPER_HPD(hsubpd, float64_sub)
978 
979 void glue(helper_addsubps, SUFFIX)(CPUX86State *env, Reg *d, Reg *v, Reg *s)
980 {
981     int i;
982     for (i = 0; i < 2 << SHIFT; i += 2) {
983         d->ZMM_S(i) = float32_sub(v->ZMM_S(i), s->ZMM_S(i), &env->sse_status);
984         d->ZMM_S(i+1) = float32_add(v->ZMM_S(i+1), s->ZMM_S(i+1), &env->sse_status);
985     }
986 }
987 
988 void glue(helper_addsubpd, SUFFIX)(CPUX86State *env, Reg *d, Reg *v, Reg *s)
989 {
990     int i;
991     for (i = 0; i < 1 << SHIFT; i += 2) {
992         d->ZMM_D(i) = float64_sub(v->ZMM_D(i), s->ZMM_D(i), &env->sse_status);
993         d->ZMM_D(i+1) = float64_add(v->ZMM_D(i+1), s->ZMM_D(i+1), &env->sse_status);
994     }
995 }
996 
/*
 * Packed compare generator.  F(size, a, b) performs the comparison and C
 * turns its result into a truth value; each destination element is set to
 * all-ones when C holds and to zero otherwise.  One function is emitted for
 * single precision (ps) and one for double precision (pd).
 */
#define SSE_HELPER_CMP_P(name, F, C)                                    \
    void glue(helper_ ## name ## ps, SUFFIX)(CPUX86State *env,          \
                                             Reg *d, Reg *v, Reg *s)    \
    {                                                                   \
        int n;                                                          \
        for (n = 0; n < 2 << SHIFT; n++) {                              \
            if (C(F(32, v->ZMM_S(n), s->ZMM_S(n)))) {                   \
                d->ZMM_L(n) = -1;                                       \
            } else {                                                    \
                d->ZMM_L(n) = 0;                                        \
            }                                                           \
        }                                                               \
    }                                                                   \
                                                                        \
    void glue(helper_ ## name ## pd, SUFFIX)(CPUX86State *env,          \
                                             Reg *d, Reg *v, Reg *s)    \
    {                                                                   \
        int n;                                                          \
        for (n = 0; n < 1 << SHIFT; n++) {                              \
            if (C(F(64, v->ZMM_D(n), s->ZMM_D(n)))) {                   \
                d->ZMM_Q(n) = -1;                                       \
            } else {                                                    \
                d->ZMM_Q(n) = 0;                                        \
            }                                                           \
        }                                                               \
    }
1015 
1016 #if SHIFT == 1
/*
 * Full compare generator used when SHIFT == 1: emits the packed helpers via
 * SSE_HELPER_CMP_P plus the scalar ss/sd helpers.  The scalar helpers write
 * the comparison mask into element 0 and copy the remaining elements of the
 * destination from @v.
 */
#define SSE_HELPER_CMP(name, F, C)                                          \
    SSE_HELPER_CMP_P(name, F, C)                                            \
    void helper_ ## name ## ss(CPUX86State *env, Reg *d, Reg *v, Reg *s)    \
    {                                                                       \
        int n;                                                              \
        if (C(F(32, v->ZMM_S(0), s->ZMM_S(0)))) {                           \
            d->ZMM_L(0) = -1;                                               \
        } else {                                                            \
            d->ZMM_L(0) = 0;                                                \
        }                                                                   \
        for (n = 1; n < 2 << SHIFT; n++) {                                  \
            d->ZMM_L(n) = v->ZMM_L(n);                                      \
        }                                                                   \
    }                                                                       \
                                                                            \
    void helper_ ## name ## sd(CPUX86State *env, Reg *d, Reg *v, Reg *s)    \
    {                                                                       \
        int n;                                                              \
        if (C(F(64, v->ZMM_D(0), s->ZMM_D(0)))) {                           \
            d->ZMM_Q(0) = -1;                                               \
        } else {                                                            \
            d->ZMM_Q(0) = 0;                                                \
        }                                                                   \
        for (n = 1; n < 1 << SHIFT; n++) {                                  \
            d->ZMM_Q(n) = v->ZMM_Q(n);                                      \
        }                                                                   \
    }
1036 
1037 static inline bool FPU_EQU(FloatRelation x)
1038 {
1039     return (x == float_relation_equal || x == float_relation_unordered);
1040 }
1041 static inline bool FPU_GE(FloatRelation x)
1042 {
1043     return (x == float_relation_equal || x == float_relation_greater);
1044 }
/*
 * Predicates over a FloatRelation value.  FPU_LE relies on the "less"
 * relation comparing numerically below "equal" -- NOTE(review): confirm
 * against the FloatRelation definition.
 */
#define FPU_EQ(x) ((x) == float_relation_equal)
#define FPU_LT(x) ((x) == float_relation_less)
#define FPU_LE(x) ((x) <= float_relation_equal)
#define FPU_GT(x) ((x) == float_relation_greater)
#define FPU_UNORD(x) ((x) == float_relation_unordered)
/*
 * Always false, but the argument is still evaluated so that the comparison
 * is actually performed (it matters when an operand is a signalling NaN).
 */
#define FPU_FALSE(x) ((x) == float_relation_equal && 0)

/* Comparison via the floatN_compare_quiet() routine for the given size. */
#define FPU_CMPQ(size, a, b) \
    float ## size ## _compare_quiet(a, b, &env->sse_status)
/* Comparison via the floatN_compare() routine for the given size. */
#define FPU_CMPS(size, a, b) \
    float ## size ## _compare(a, b, &env->sse_status)
1057 
1058 #else
/* For other SHIFT values only the packed helpers are generated. */
#define SSE_HELPER_CMP(name, cmp, pred) SSE_HELPER_CMP_P(name, cmp, pred)
1060 #endif
1061 
1062 SSE_HELPER_CMP(cmpeq, FPU_CMPQ, FPU_EQ)
1063 SSE_HELPER_CMP(cmplt, FPU_CMPS, FPU_LT)
1064 SSE_HELPER_CMP(cmple, FPU_CMPS, FPU_LE)
1065 SSE_HELPER_CMP(cmpunord, FPU_CMPQ,  FPU_UNORD)
1066 SSE_HELPER_CMP(cmpneq, FPU_CMPQ, !FPU_EQ)
1067 SSE_HELPER_CMP(cmpnlt, FPU_CMPS, !FPU_LT)
1068 SSE_HELPER_CMP(cmpnle, FPU_CMPS, !FPU_LE)
1069 SSE_HELPER_CMP(cmpord, FPU_CMPQ, !FPU_UNORD)
1070 
1071 SSE_HELPER_CMP(cmpequ, FPU_CMPQ, FPU_EQU)
1072 SSE_HELPER_CMP(cmpnge, FPU_CMPS, !FPU_GE)
1073 SSE_HELPER_CMP(cmpngt, FPU_CMPS, !FPU_GT)
1074 SSE_HELPER_CMP(cmpfalse, FPU_CMPQ,  FPU_FALSE)
1075 SSE_HELPER_CMP(cmpnequ, FPU_CMPQ, !FPU_EQU)
1076 SSE_HELPER_CMP(cmpge, FPU_CMPS, FPU_GE)
1077 SSE_HELPER_CMP(cmpgt, FPU_CMPS, FPU_GT)
1078 SSE_HELPER_CMP(cmptrue, FPU_CMPQ,  !FPU_FALSE)
1079 
1080 SSE_HELPER_CMP(cmpeqs, FPU_CMPS, FPU_EQ)
1081 SSE_HELPER_CMP(cmpltq, FPU_CMPQ, FPU_LT)
1082 SSE_HELPER_CMP(cmpleq, FPU_CMPQ, FPU_LE)
1083 SSE_HELPER_CMP(cmpunords, FPU_CMPS,  FPU_UNORD)
1084 SSE_HELPER_CMP(cmpneqq, FPU_CMPS, !FPU_EQ)
1085 SSE_HELPER_CMP(cmpnltq, FPU_CMPQ, !FPU_LT)
1086 SSE_HELPER_CMP(cmpnleq, FPU_CMPQ, !FPU_LE)
1087 SSE_HELPER_CMP(cmpords, FPU_CMPS, !FPU_UNORD)
1088 
1089 SSE_HELPER_CMP(cmpequs, FPU_CMPS, FPU_EQU)
1090 SSE_HELPER_CMP(cmpngeq, FPU_CMPQ, !FPU_GE)
1091 SSE_HELPER_CMP(cmpngtq, FPU_CMPQ, !FPU_GT)
1092 SSE_HELPER_CMP(cmpfalses, FPU_CMPS,  FPU_FALSE)
1093 SSE_HELPER_CMP(cmpnequs, FPU_CMPS, !FPU_EQU)
1094 SSE_HELPER_CMP(cmpgeq, FPU_CMPQ, FPU_GE)
1095 SSE_HELPER_CMP(cmpgtq, FPU_CMPQ, FPU_GT)
1096 SSE_HELPER_CMP(cmptrues, FPU_CMPS,  !FPU_FALSE)
1097 
1098 #undef SSE_HELPER_CMP
1099 
1100 #if SHIFT == 1
1101 static const int comis_eflags[4] = {CC_C, CC_Z, 0, CC_Z | CC_P | CC_C};
1102 
1103 void helper_ucomiss(CPUX86State *env, Reg *d, Reg *s)
1104 {
1105     FloatRelation ret;
1106     float32 s0, s1;
1107 
1108     s0 = d->ZMM_S(0);
1109     s1 = s->ZMM_S(0);
1110     ret = float32_compare_quiet(s0, s1, &env->sse_status);
1111     CC_SRC = comis_eflags[ret + 1];
1112 }
1113 
1114 void helper_comiss(CPUX86State *env, Reg *d, Reg *s)
1115 {
1116     FloatRelation ret;
1117     float32 s0, s1;
1118 
1119     s0 = d->ZMM_S(0);
1120     s1 = s->ZMM_S(0);
1121     ret = float32_compare(s0, s1, &env->sse_status);
1122     CC_SRC = comis_eflags[ret + 1];
1123 }
1124 
1125 void helper_ucomisd(CPUX86State *env, Reg *d, Reg *s)
1126 {
1127     FloatRelation ret;
1128     float64 d0, d1;
1129 
1130     d0 = d->ZMM_D(0);
1131     d1 = s->ZMM_D(0);
1132     ret = float64_compare_quiet(d0, d1, &env->sse_status);
1133     CC_SRC = comis_eflags[ret + 1];
1134 }
1135 
1136 void helper_comisd(CPUX86State *env, Reg *d, Reg *s)
1137 {
1138     FloatRelation ret;
1139     float64 d0, d1;
1140 
1141     d0 = d->ZMM_D(0);
1142     d1 = s->ZMM_D(0);
1143     ret = float64_compare(d0, d1, &env->sse_status);
1144     CC_SRC = comis_eflags[ret + 1];
1145 }
1146 #endif
1147 
1148 uint32_t glue(helper_movmskps, SUFFIX)(CPUX86State *env, Reg *s)
1149 {
1150     uint32_t mask;
1151     int i;
1152 
1153     mask = 0;
1154     for (i = 0; i < 2 << SHIFT; i++) {
1155         mask |= (s->ZMM_L(i) >> (31 - i)) & (1 << i);
1156     }
1157     return mask;
1158 }
1159 
1160 uint32_t glue(helper_movmskpd, SUFFIX)(CPUX86State *env, Reg *s)
1161 {
1162     uint32_t mask;
1163     int i;
1164 
1165     mask = 0;
1166     for (i = 0; i < 1 << SHIFT; i++) {
1167         mask |= (s->ZMM_Q(i) >> (63 - i)) & (1 << i);
1168     }
1169     return mask;
1170 }
1171 
1172 #endif
1173 
1174 #define PACK_HELPER_B(name, F) \
1175 void glue(helper_pack ## name, SUFFIX)(CPUX86State *env,      \
1176         Reg *d, Reg *v, Reg *s)                               \
1177 {                                                             \
1178     uint8_t r[PACK_WIDTH * 2];                                \
1179     int j, k;                                                 \
1180     for (j = 0; j < 4 << SHIFT; j += PACK_WIDTH) {            \
1181         for (k = 0; k < PACK_WIDTH; k++) {                    \
1182             r[k] = F((int16_t)v->W(j + k));                   \
1183         }                                                     \
1184         for (k = 0; k < PACK_WIDTH; k++) {                    \
1185             r[PACK_WIDTH + k] = F((int16_t)s->W(j + k));      \
1186         }                                                     \
1187         for (k = 0; k < PACK_WIDTH * 2; k++) {                \
1188             d->B(2 * j + k) = r[k];                           \
1189         }                                                     \
1190     }                                                         \
1191 }
1192 
1193 PACK_HELPER_B(sswb, satsb)
1194 PACK_HELPER_B(uswb, satub)
1195 
1196 void glue(helper_packssdw, SUFFIX)(CPUX86State *env, Reg *d, Reg *v, Reg *s)
1197 {
1198     uint16_t r[PACK_WIDTH];
1199     int j, k;
1200 
1201     for (j = 0; j < 2 << SHIFT; j += PACK_WIDTH / 2) {
1202         for (k = 0; k < PACK_WIDTH / 2; k++) {
1203             r[k] = satsw(v->L(j + k));
1204         }
1205         for (k = 0; k < PACK_WIDTH / 2; k++) {
1206             r[PACK_WIDTH / 2 + k] = satsw(s->L(j + k));
1207         }
1208         for (k = 0; k < PACK_WIDTH; k++) {
1209             d->W(2 * j + k) = r[k];
1210         }
1211     }
1212 }
1213 
1214 #define UNPCK_OP(base_name, base)                                       \
1215                                                                         \
1216     void glue(helper_punpck ## base_name ## bw, SUFFIX)(CPUX86State *env,\
1217                                                 Reg *d, Reg *v, Reg *s) \
1218     {                                                                   \
1219         uint8_t r[PACK_WIDTH * 2];                                      \
1220         int j, i;                                                       \
1221                                                                         \
1222         for (j = 0; j < 8 << SHIFT; ) {                                 \
1223             int k = j + base * PACK_WIDTH;                              \
1224             for (i = 0; i < PACK_WIDTH; i++) {                          \
1225                 r[2 * i] = v->B(k + i);                                 \
1226                 r[2 * i + 1] = s->B(k + i);                             \
1227             }                                                           \
1228             for (i = 0; i < PACK_WIDTH * 2; i++, j++) {                 \
1229                 d->B(j) = r[i];                                         \
1230             }                                                           \
1231         }                                                               \
1232     }                                                                   \
1233                                                                         \
1234     void glue(helper_punpck ## base_name ## wd, SUFFIX)(CPUX86State *env,\
1235                                                 Reg *d, Reg *v, Reg *s) \
1236     {                                                                   \
1237         uint16_t r[PACK_WIDTH];                                         \
1238         int j, i;                                                       \
1239                                                                         \
1240         for (j = 0; j < 4 << SHIFT; ) {                                 \
1241             int k = j + base * PACK_WIDTH / 2;                          \
1242             for (i = 0; i < PACK_WIDTH / 2; i++) {                      \
1243                 r[2 * i] = v->W(k + i);                                 \
1244                 r[2 * i + 1] = s->W(k + i);                             \
1245             }                                                           \
1246             for (i = 0; i < PACK_WIDTH; i++, j++) {                     \
1247                 d->W(j) = r[i];                                         \
1248             }                                                           \
1249         }                                                               \
1250     }                                                                   \
1251                                                                         \
1252     void glue(helper_punpck ## base_name ## dq, SUFFIX)(CPUX86State *env,\
1253                                                 Reg *d, Reg *v, Reg *s) \
1254     {                                                                   \
1255         uint32_t r[PACK_WIDTH / 2];                                     \
1256         int j, i;                                                       \
1257                                                                         \
1258         for (j = 0; j < 2 << SHIFT; ) {                                 \
1259             int k = j + base * PACK_WIDTH / 4;                          \
1260             for (i = 0; i < PACK_WIDTH / 4; i++) {                      \
1261                 r[2 * i] = v->L(k + i);                                 \
1262                 r[2 * i + 1] = s->L(k + i);                             \
1263             }                                                           \
1264             for (i = 0; i < PACK_WIDTH / 2; i++, j++) {                 \
1265                 d->L(j) = r[i];                                         \
1266             }                                                           \
1267         }                                                               \
1268     }                                                                   \
1269                                                                         \
1270     XMM_ONLY(                                                           \
1271              void glue(helper_punpck ## base_name ## qdq, SUFFIX)(      \
1272                         CPUX86State *env, Reg *d, Reg *v, Reg *s)       \
1273              {                                                          \
1274                  uint64_t r[2];                                         \
1275                  int i;                                                 \
1276                                                                         \
1277                  for (i = 0; i < 1 << SHIFT; i += 2) {                  \
1278                      r[0] = v->Q(base + i);                             \
1279                      r[1] = s->Q(base + i);                             \
1280                      d->Q(i) = r[0];                                    \
1281                      d->Q(i + 1) = r[1];                                \
1282                  }                                                      \
1283              }                                                          \
1284                                                                         )
1285 
1286 UNPCK_OP(l, 0)
1287 UNPCK_OP(h, 1)
1288 
1289 #undef PACK_WIDTH
1290 #undef PACK_HELPER_B
1291 #undef UNPCK_OP
1292 
1293 
1294 /* 3DNow! float ops */
1295 #if SHIFT == 0
1296 void helper_pi2fd(CPUX86State *env, MMXReg *d, MMXReg *s)
1297 {
1298     d->MMX_S(0) = int32_to_float32(s->MMX_L(0), &env->mmx_status);
1299     d->MMX_S(1) = int32_to_float32(s->MMX_L(1), &env->mmx_status);
1300 }
1301 
1302 void helper_pi2fw(CPUX86State *env, MMXReg *d, MMXReg *s)
1303 {
1304     d->MMX_S(0) = int32_to_float32((int16_t)s->MMX_W(0), &env->mmx_status);
1305     d->MMX_S(1) = int32_to_float32((int16_t)s->MMX_W(2), &env->mmx_status);
1306 }
1307 
1308 void helper_pf2id(CPUX86State *env, MMXReg *d, MMXReg *s)
1309 {
1310     d->MMX_L(0) = float32_to_int32_round_to_zero(s->MMX_S(0), &env->mmx_status);
1311     d->MMX_L(1) = float32_to_int32_round_to_zero(s->MMX_S(1), &env->mmx_status);
1312 }
1313 
1314 void helper_pf2iw(CPUX86State *env, MMXReg *d, MMXReg *s)
1315 {
1316     d->MMX_L(0) = satsw(float32_to_int32_round_to_zero(s->MMX_S(0),
1317                                                        &env->mmx_status));
1318     d->MMX_L(1) = satsw(float32_to_int32_round_to_zero(s->MMX_S(1),
1319                                                        &env->mmx_status));
1320 }
1321 
1322 void helper_pfacc(CPUX86State *env, MMXReg *d, MMXReg *s)
1323 {
1324     float32 r;
1325 
1326     r = float32_add(d->MMX_S(0), d->MMX_S(1), &env->mmx_status);
1327     d->MMX_S(1) = float32_add(s->MMX_S(0), s->MMX_S(1), &env->mmx_status);
1328     d->MMX_S(0) = r;
1329 }
1330 
1331 void helper_pfadd(CPUX86State *env, MMXReg *d, MMXReg *s)
1332 {
1333     d->MMX_S(0) = float32_add(d->MMX_S(0), s->MMX_S(0), &env->mmx_status);
1334     d->MMX_S(1) = float32_add(d->MMX_S(1), s->MMX_S(1), &env->mmx_status);
1335 }
1336 
1337 void helper_pfcmpeq(CPUX86State *env, MMXReg *d, MMXReg *s)
1338 {
1339     d->MMX_L(0) = float32_eq_quiet(d->MMX_S(0), s->MMX_S(0),
1340                                    &env->mmx_status) ? -1 : 0;
1341     d->MMX_L(1) = float32_eq_quiet(d->MMX_S(1), s->MMX_S(1),
1342                                    &env->mmx_status) ? -1 : 0;
1343 }
1344 
1345 void helper_pfcmpge(CPUX86State *env, MMXReg *d, MMXReg *s)
1346 {
1347     d->MMX_L(0) = float32_le(s->MMX_S(0), d->MMX_S(0),
1348                              &env->mmx_status) ? -1 : 0;
1349     d->MMX_L(1) = float32_le(s->MMX_S(1), d->MMX_S(1),
1350                              &env->mmx_status) ? -1 : 0;
1351 }
1352 
1353 void helper_pfcmpgt(CPUX86State *env, MMXReg *d, MMXReg *s)
1354 {
1355     d->MMX_L(0) = float32_lt(s->MMX_S(0), d->MMX_S(0),
1356                              &env->mmx_status) ? -1 : 0;
1357     d->MMX_L(1) = float32_lt(s->MMX_S(1), d->MMX_S(1),
1358                              &env->mmx_status) ? -1 : 0;
1359 }
1360 
1361 void helper_pfmax(CPUX86State *env, MMXReg *d, MMXReg *s)
1362 {
1363     if (float32_lt(d->MMX_S(0), s->MMX_S(0), &env->mmx_status)) {
1364         d->MMX_S(0) = s->MMX_S(0);
1365     }
1366     if (float32_lt(d->MMX_S(1), s->MMX_S(1), &env->mmx_status)) {
1367         d->MMX_S(1) = s->MMX_S(1);
1368     }
1369 }
1370 
1371 void helper_pfmin(CPUX86State *env, MMXReg *d, MMXReg *s)
1372 {
1373     if (float32_lt(s->MMX_S(0), d->MMX_S(0), &env->mmx_status)) {
1374         d->MMX_S(0) = s->MMX_S(0);
1375     }
1376     if (float32_lt(s->MMX_S(1), d->MMX_S(1), &env->mmx_status)) {
1377         d->MMX_S(1) = s->MMX_S(1);
1378     }
1379 }
1380 
1381 void helper_pfmul(CPUX86State *env, MMXReg *d, MMXReg *s)
1382 {
1383     d->MMX_S(0) = float32_mul(d->MMX_S(0), s->MMX_S(0), &env->mmx_status);
1384     d->MMX_S(1) = float32_mul(d->MMX_S(1), s->MMX_S(1), &env->mmx_status);
1385 }
1386 
1387 void helper_pfnacc(CPUX86State *env, MMXReg *d, MMXReg *s)
1388 {
1389     float32 r;
1390 
1391     r = float32_sub(d->MMX_S(0), d->MMX_S(1), &env->mmx_status);
1392     d->MMX_S(1) = float32_sub(s->MMX_S(0), s->MMX_S(1), &env->mmx_status);
1393     d->MMX_S(0) = r;
1394 }
1395 
1396 void helper_pfpnacc(CPUX86State *env, MMXReg *d, MMXReg *s)
1397 {
1398     float32 r;
1399 
1400     r = float32_sub(d->MMX_S(0), d->MMX_S(1), &env->mmx_status);
1401     d->MMX_S(1) = float32_add(s->MMX_S(0), s->MMX_S(1), &env->mmx_status);
1402     d->MMX_S(0) = r;
1403 }
1404 
1405 void helper_pfrcp(CPUX86State *env, MMXReg *d, MMXReg *s)
1406 {
1407     d->MMX_S(0) = float32_div(float32_one, s->MMX_S(0), &env->mmx_status);
1408     d->MMX_S(1) = d->MMX_S(0);
1409 }
1410 
1411 void helper_pfrsqrt(CPUX86State *env, MMXReg *d, MMXReg *s)
1412 {
1413     d->MMX_L(1) = s->MMX_L(0) & 0x7fffffff;
1414     d->MMX_S(1) = float32_div(float32_one,
1415                               float32_sqrt(d->MMX_S(1), &env->mmx_status),
1416                               &env->mmx_status);
1417     d->MMX_L(1) |= s->MMX_L(0) & 0x80000000;
1418     d->MMX_L(0) = d->MMX_L(1);
1419 }
1420 
1421 void helper_pfsub(CPUX86State *env, MMXReg *d, MMXReg *s)
1422 {
1423     d->MMX_S(0) = float32_sub(d->MMX_S(0), s->MMX_S(0), &env->mmx_status);
1424     d->MMX_S(1) = float32_sub(d->MMX_S(1), s->MMX_S(1), &env->mmx_status);
1425 }
1426 
1427 void helper_pfsubr(CPUX86State *env, MMXReg *d, MMXReg *s)
1428 {
1429     d->MMX_S(0) = float32_sub(s->MMX_S(0), d->MMX_S(0), &env->mmx_status);
1430     d->MMX_S(1) = float32_sub(s->MMX_S(1), d->MMX_S(1), &env->mmx_status);
1431 }
1432 
1433 void helper_pswapd(CPUX86State *env, MMXReg *d, MMXReg *s)
1434 {
1435     uint32_t r;
1436 
1437     r = s->MMX_L(0);
1438     d->MMX_L(0) = s->MMX_L(1);
1439     d->MMX_L(1) = r;
1440 }
1441 #endif
1442 
1443 /* SSSE3 op helpers */
1444 void glue(helper_pshufb, SUFFIX)(CPUX86State *env, Reg *d, Reg *v, Reg *s)
1445 {
1446     int i;
1447 #if SHIFT == 0
1448     uint8_t r[8];
1449 
1450     for (i = 0; i < 8; i++) {
1451         r[i] = (s->B(i) & 0x80) ? 0 : (v->B(s->B(i) & 7));
1452     }
1453     for (i = 0; i < 8; i++) {
1454         d->B(i) = r[i];
1455     }
1456 #else
1457     uint8_t r[8 << SHIFT];
1458 
1459     for (i = 0; i < 8 << SHIFT; i++) {
1460         int j = i & ~0xf;
1461         r[i] = (s->B(i) & 0x80) ? 0 : v->B(j | (s->B(i) & 0xf));
1462     }
1463     for (i = 0; i < 8 << SHIFT; i++) {
1464         d->B(i) = r[i];
1465     }
1466 #endif
1467 }
1468 
1469 #define SSE_HELPER_HW(name, F)  \
1470 void glue(helper_ ## name, SUFFIX)(CPUX86State *env, Reg *d, Reg *v, Reg *s) \
1471 {                                                          \
1472     uint16_t r[4 << SHIFT];                                \
1473     int i, j, k;                                           \
1474     for (k = 0; k < 4 << SHIFT; k += LANE_WIDTH / 2) {     \
1475         for (i = j = 0; j < LANE_WIDTH / 2; i++, j += 2) { \
1476             r[i + k] = F(v->W(j + k), v->W(j + k + 1));    \
1477         }                                                  \
1478         for (j = 0; j < LANE_WIDTH / 2; i++, j += 2) {     \
1479             r[i + k] = F(s->W(j + k), s->W(j + k + 1));    \
1480         }                                                  \
1481     }                                                      \
1482     for (i = 0; i < 4 << SHIFT; i++) {                     \
1483         d->W(i) = r[i];                                    \
1484     }                                                      \
1485 }
1486 
1487 #define SSE_HELPER_HL(name, F)  \
1488 void glue(helper_ ## name, SUFFIX)(CPUX86State *env, Reg *d, Reg *v, Reg *s) \
1489 {                                                          \
1490     uint32_t r[2 << SHIFT];                                \
1491     int i, j, k;                                           \
1492     for (k = 0; k < 2 << SHIFT; k += LANE_WIDTH / 4) {     \
1493         for (i = j = 0; j < LANE_WIDTH / 4; i++, j += 2) { \
1494             r[i + k] = F(v->L(j + k), v->L(j + k + 1));    \
1495         }                                                  \
1496         for (j = 0; j < LANE_WIDTH / 4; i++, j += 2) {     \
1497             r[i + k] = F(s->L(j + k), s->L(j + k + 1));    \
1498         }                                                  \
1499     }                                                      \
1500     for (i = 0; i < 2 << SHIFT; i++) {                     \
1501         d->L(i) = r[i];                                    \
1502     }                                                      \
1503 }
1504 
1505 SSE_HELPER_HW(phaddw, FADD)
1506 SSE_HELPER_HW(phsubw, FSUB)
1507 SSE_HELPER_HW(phaddsw, FADDSW)
1508 SSE_HELPER_HW(phsubsw, FSUBSW)
1509 SSE_HELPER_HL(phaddd, FADD)
1510 SSE_HELPER_HL(phsubd, FSUB)
1511 
1512 #undef SSE_HELPER_HW
1513 #undef SSE_HELPER_HL
1514 
1515 void glue(helper_pmaddubsw, SUFFIX)(CPUX86State *env, Reg *d, Reg *v, Reg *s)
1516 {
1517     int i;
1518     for (i = 0; i < 4 << SHIFT; i++) {
1519         d->W(i) = satsw((int8_t)s->B(i * 2) * (uint8_t)v->B(i * 2) +
1520                         (int8_t)s->B(i * 2 + 1) * (uint8_t)v->B(i * 2 + 1));
1521     }
1522 }
1523 
1524 #define FMULHRSW(d, s) (((int16_t) d * (int16_t)s + 0x4000) >> 15)
1525 SSE_HELPER_W(helper_pmulhrsw, FMULHRSW)
1526 
1527 #define FSIGNB(d, s) (s <= INT8_MAX  ? s ? d : 0 : -(int8_t)d)
1528 #define FSIGNW(d, s) (s <= INT16_MAX ? s ? d : 0 : -(int16_t)d)
1529 #define FSIGNL(d, s) (s <= INT32_MAX ? s ? d : 0 : -(int32_t)d)
1530 SSE_HELPER_B(helper_psignb, FSIGNB)
1531 SSE_HELPER_W(helper_psignw, FSIGNW)
1532 SSE_HELPER_L(helper_psignd, FSIGNL)
1533 
1534 void glue(helper_palignr, SUFFIX)(CPUX86State *env, Reg *d, Reg *v, Reg *s,
1535                                   uint32_t imm)
1536 {
1537     int i;
1538 
1539     /* XXX could be checked during translation */
1540     if (imm >= (SHIFT ? 32 : 16)) {
1541         for (i = 0; i < (1 << SHIFT); i++) {
1542             d->Q(i) = 0;
1543         }
1544     } else {
1545         int shift = imm * 8;
1546 #define SHR(v, i) (i < 64 && i > -64 ? i > 0 ? v >> (i) : (v << -(i)) : 0)
1547 #if SHIFT == 0
1548         d->Q(0) = SHR(s->Q(0), shift - 0) |
1549             SHR(v->Q(0), shift -  64);
1550 #else
1551         for (i = 0; i < (1 << SHIFT); i += 2) {
1552             uint64_t r0, r1;
1553 
1554             r0 = SHR(s->Q(i), shift - 0) |
1555                  SHR(s->Q(i + 1), shift -  64) |
1556                  SHR(v->Q(i), shift - 128) |
1557                  SHR(v->Q(i + 1), shift - 192);
1558             r1 = SHR(s->Q(i), shift + 64) |
1559                  SHR(s->Q(i + 1), shift -   0) |
1560                  SHR(v->Q(i), shift -  64) |
1561                  SHR(v->Q(i + 1), shift - 128);
1562             d->Q(i) = r0;
1563             d->Q(i + 1) = r1;
1564         }
1565 #endif
1566 #undef SHR
1567     }
1568 }
1569 
1570 #if SHIFT >= 1
1571 
/*
 * Generator for element-wise helpers taking a fourth register operand @m:
 * each of the first @num elements of @d is set to F(v[n], s[n], m[n]).
 */
#define SSE_HELPER_V(name, elem, num, F)                                \
    void glue(name, SUFFIX)(CPUX86State *env, Reg *d, Reg *v, Reg *s,   \
                            Reg *m)                                     \
    {                                                                   \
        int n;                                                          \
        for (n = 0; n < num; n++) {                                     \
            d->elem(n) = F(v->elem(n), s->elem(n), m->elem(n));         \
        }                                                               \
    }
1581 
/*
 * Generator for element-wise helpers controlled by an immediate: element n
 * of @d is set to F(v[n], s[n], bit), where bit is bit (n & 7) of @imm.
 */
#define SSE_HELPER_I(name, elem, num, F)                                \
    void glue(name, SUFFIX)(CPUX86State *env, Reg *d, Reg *v, Reg *s,   \
                            uint32_t imm)                               \
    {                                                                   \
        int n;                                                          \
        for (n = 0; n < num; n++) {                                     \
            d->elem(n) = F(v->elem(n), s->elem(n),                      \
                           (imm >> (n & 7)) & 1);                       \
        }                                                               \
    }
1592 
1593 /* SSE4.1 op helpers */
1594 #define FBLENDVB(v, s, m) ((m & 0x80) ? s : v)
1595 #define FBLENDVPS(v, s, m) ((m & 0x80000000) ? s : v)
1596 #define FBLENDVPD(v, s, m) ((m & 0x8000000000000000LL) ? s : v)
1597 SSE_HELPER_V(helper_pblendvb, B, 8 << SHIFT, FBLENDVB)
1598 SSE_HELPER_V(helper_blendvps, L, 2 << SHIFT, FBLENDVPS)
1599 SSE_HELPER_V(helper_blendvpd, Q, 1 << SHIFT, FBLENDVPD)
1600 
1601 void glue(helper_ptest, SUFFIX)(CPUX86State *env, Reg *d, Reg *s)
1602 {
1603     uint64_t zf = 0, cf = 0;
1604     int i;
1605 
1606     for (i = 0; i < 1 << SHIFT; i++) {
1607         zf |= (s->Q(i) &  d->Q(i));
1608         cf |= (s->Q(i) & ~d->Q(i));
1609     }
1610     CC_SRC = (zf ? 0 : CC_Z) | (cf ? 0 : CC_C);
1611 }
1612 
/* Source selectors: duplicate the even (L/Q) or odd (L) element of a pair. */
#define FMOVSLDUP(n) (s->L((n) & ~1))
#define FMOVSHDUP(n) (s->L((n) | 1))
#define FMOVDLDUP(n) (s->Q((n) & ~1))

/*
 * Generator for unary element-wise helpers: d[n] = F(n) for each of the
 * first @num elements.  Elements are written from the highest index down
 * to 0; the widening moves read narrower source elements at the same index,
 * so this order matters if @d and @s are the same register.
 */
#define SSE_HELPER_F(name, elem, num, F)                        \
    void glue(name, SUFFIX)(CPUX86State *env, Reg *d, Reg *s)   \
    {                                                           \
        int idx = num;                                          \
        while (--idx >= 0) {                                    \
            d->elem(idx) = F(idx);                              \
        }                                                       \
    }

#if SHIFT > 0
/* Sign-extending moves. */
SSE_HELPER_F(helper_pmovsxbw, W, 4 << SHIFT, (int8_t) s->B)
SSE_HELPER_F(helper_pmovsxbd, L, 2 << SHIFT, (int8_t) s->B)
SSE_HELPER_F(helper_pmovsxbq, Q, 1 << SHIFT, (int8_t) s->B)
SSE_HELPER_F(helper_pmovsxwd, L, 2 << SHIFT, (int16_t) s->W)
SSE_HELPER_F(helper_pmovsxwq, Q, 1 << SHIFT, (int16_t) s->W)
SSE_HELPER_F(helper_pmovsxdq, Q, 1 << SHIFT, (int32_t) s->L)
/* Zero-extending moves. */
SSE_HELPER_F(helper_pmovzxbw, W, 4 << SHIFT, s->B)
SSE_HELPER_F(helper_pmovzxbd, L, 2 << SHIFT, s->B)
SSE_HELPER_F(helper_pmovzxbq, Q, 1 << SHIFT, s->B)
SSE_HELPER_F(helper_pmovzxwd, L, 2 << SHIFT, s->W)
SSE_HELPER_F(helper_pmovzxwq, Q, 1 << SHIFT, s->W)
SSE_HELPER_F(helper_pmovzxdq, Q, 1 << SHIFT, s->L)
/* Element-duplicating moves. */
SSE_HELPER_F(helper_pmovsldup, L, 2 << SHIFT, FMOVSLDUP)
SSE_HELPER_F(helper_pmovshdup, L, 2 << SHIFT, FMOVSHDUP)
SSE_HELPER_F(helper_pmovdldup, Q, 1 << SHIFT, FMOVDLDUP)
#endif
1643 
1644 void glue(helper_pmuldq, SUFFIX)(CPUX86State *env, Reg *d, Reg *v, Reg *s)
1645 {
1646     int i;
1647 
1648     for (i = 0; i < 1 << SHIFT; i++) {
1649         d->Q(i) = (int64_t)(int32_t) v->L(2 * i) * (int32_t) s->L(2 * i);
1650     }
1651 }
1652 
1653 void glue(helper_packusdw, SUFFIX)(CPUX86State *env, Reg *d, Reg *v, Reg *s)
1654 {
1655     uint16_t r[8];
1656     int i, j, k;
1657 
1658     for (i = 0, j = 0; i <= 2 << SHIFT; i += 8, j += 4) {
1659         r[0] = satuw(v->L(j));
1660         r[1] = satuw(v->L(j + 1));
1661         r[2] = satuw(v->L(j + 2));
1662         r[3] = satuw(v->L(j + 3));
1663         r[4] = satuw(s->L(j));
1664         r[5] = satuw(s->L(j + 1));
1665         r[6] = satuw(s->L(j + 2));
1666         r[7] = satuw(s->L(j + 3));
1667         for (k = 0; k < 8; k++) {
1668             d->W(i + k) = r[k];
1669         }
1670     }
1671 }
1672 
1673 #if SHIFT == 1
1674 void glue(helper_phminposuw, SUFFIX)(CPUX86State *env, Reg *d, Reg *s)
1675 {
1676     int idx = 0;
1677 
1678     if (s->W(1) < s->W(idx)) {
1679         idx = 1;
1680     }
1681     if (s->W(2) < s->W(idx)) {
1682         idx = 2;
1683     }
1684     if (s->W(3) < s->W(idx)) {
1685         idx = 3;
1686     }
1687     if (s->W(4) < s->W(idx)) {
1688         idx = 4;
1689     }
1690     if (s->W(5) < s->W(idx)) {
1691         idx = 5;
1692     }
1693     if (s->W(6) < s->W(idx)) {
1694         idx = 6;
1695     }
1696     if (s->W(7) < s->W(idx)) {
1697         idx = 7;
1698     }
1699 
1700     d->W(0) = s->W(idx);
1701     d->W(1) = idx;
1702     d->L(1) = 0;
1703     d->Q(1) = 0;
1704 }
1705 #endif
1706 
1707 void glue(helper_roundps, SUFFIX)(CPUX86State *env, Reg *d, Reg *s,
1708                                   uint32_t mode)
1709 {
1710     uint8_t old_flags = get_float_exception_flags(&env->sse_status);
1711     signed char prev_rounding_mode;
1712     int i;
1713 
1714     prev_rounding_mode = env->sse_status.float_rounding_mode;
1715     if (!(mode & (1 << 2))) {
1716         set_x86_rounding_mode(mode & 3, &env->sse_status);
1717     }
1718 
1719     for (i = 0; i < 2 << SHIFT; i++) {
1720         d->ZMM_S(i) = float32_round_to_int(s->ZMM_S(i), &env->sse_status);
1721     }
1722 
1723     if (mode & (1 << 3) && !(old_flags & float_flag_inexact)) {
1724         set_float_exception_flags(get_float_exception_flags(&env->sse_status) &
1725                                   ~float_flag_inexact,
1726                                   &env->sse_status);
1727     }
1728     env->sse_status.float_rounding_mode = prev_rounding_mode;
1729 }
1730 
1731 void glue(helper_roundpd, SUFFIX)(CPUX86State *env, Reg *d, Reg *s,
1732                                   uint32_t mode)
1733 {
1734     uint8_t old_flags = get_float_exception_flags(&env->sse_status);
1735     signed char prev_rounding_mode;
1736     int i;
1737 
1738     prev_rounding_mode = env->sse_status.float_rounding_mode;
1739     if (!(mode & (1 << 2))) {
1740         set_x86_rounding_mode(mode & 3, &env->sse_status);
1741     }
1742 
1743     for (i = 0; i < 1 << SHIFT; i++) {
1744         d->ZMM_D(i) = float64_round_to_int(s->ZMM_D(i), &env->sse_status);
1745     }
1746 
1747     if (mode & (1 << 3) && !(old_flags & float_flag_inexact)) {
1748         set_float_exception_flags(get_float_exception_flags(&env->sse_status) &
1749                                   ~float_flag_inexact,
1750                                   &env->sse_status);
1751     }
1752     env->sse_status.float_rounding_mode = prev_rounding_mode;
1753 }
1754 
1755 #if SHIFT == 1
1756 void glue(helper_roundss, SUFFIX)(CPUX86State *env, Reg *d, Reg *v, Reg *s,
1757                                   uint32_t mode)
1758 {
1759     uint8_t old_flags = get_float_exception_flags(&env->sse_status);
1760     signed char prev_rounding_mode;
1761     int i;
1762 
1763     prev_rounding_mode = env->sse_status.float_rounding_mode;
1764     if (!(mode & (1 << 2))) {
1765         set_x86_rounding_mode(mode & 3, &env->sse_status);
1766     }
1767 
1768     d->ZMM_S(0) = float32_round_to_int(s->ZMM_S(0), &env->sse_status);
1769     for (i = 1; i < 2 << SHIFT; i++) {
1770         d->ZMM_L(i) = v->ZMM_L(i);
1771     }
1772 
1773     if (mode & (1 << 3) && !(old_flags & float_flag_inexact)) {
1774         set_float_exception_flags(get_float_exception_flags(&env->sse_status) &
1775                                   ~float_flag_inexact,
1776                                   &env->sse_status);
1777     }
1778     env->sse_status.float_rounding_mode = prev_rounding_mode;
1779 }
1780 
1781 void glue(helper_roundsd, SUFFIX)(CPUX86State *env, Reg *d, Reg *v, Reg *s,
1782                                   uint32_t mode)
1783 {
1784     uint8_t old_flags = get_float_exception_flags(&env->sse_status);
1785     signed char prev_rounding_mode;
1786     int i;
1787 
1788     prev_rounding_mode = env->sse_status.float_rounding_mode;
1789     if (!(mode & (1 << 2))) {
1790         set_x86_rounding_mode(mode & 3, &env->sse_status);
1791     }
1792 
1793     d->ZMM_D(0) = float64_round_to_int(s->ZMM_D(0), &env->sse_status);
1794     for (i = 1; i < 1 << SHIFT; i++) {
1795         d->ZMM_Q(i) = v->ZMM_Q(i);
1796     }
1797 
1798     if (mode & (1 << 3) && !(old_flags & float_flag_inexact)) {
1799         set_float_exception_flags(get_float_exception_flags(&env->sse_status) &
1800                                   ~float_flag_inexact,
1801                                   &env->sse_status);
1802     }
1803     env->sse_status.float_rounding_mode = prev_rounding_mode;
1804 }
1805 #endif
1806 
1807 #define FBLENDP(v, s, m) (m ? s : v)
1808 SSE_HELPER_I(helper_blendps, L, 2 << SHIFT, FBLENDP)
1809 SSE_HELPER_I(helper_blendpd, Q, 1 << SHIFT, FBLENDP)
1810 SSE_HELPER_I(helper_pblendw, W, 4 << SHIFT, FBLENDP)
1811 
1812 void glue(helper_dpps, SUFFIX)(CPUX86State *env, Reg *d, Reg *v, Reg *s,
1813                                uint32_t mask)
1814 {
1815     float32 prod1, prod2, temp2, temp3, temp4;
1816     int i;
1817 
1818     for (i = 0; i < 2 << SHIFT; i += 4) {
1819         /*
1820          * We must evaluate (A+B)+(C+D), not ((A+B)+C)+D
1821          * to correctly round the intermediate results
1822          */
1823         if (mask & (1 << 4)) {
1824             prod1 = float32_mul(v->ZMM_S(i), s->ZMM_S(i), &env->sse_status);
1825         } else {
1826             prod1 = float32_zero;
1827         }
1828         if (mask & (1 << 5)) {
1829             prod2 = float32_mul(v->ZMM_S(i+1), s->ZMM_S(i+1), &env->sse_status);
1830         } else {
1831             prod2 = float32_zero;
1832         }
1833         temp2 = float32_add(prod1, prod2, &env->sse_status);
1834         if (mask & (1 << 6)) {
1835             prod1 = float32_mul(v->ZMM_S(i+2), s->ZMM_S(i+2), &env->sse_status);
1836         } else {
1837             prod1 = float32_zero;
1838         }
1839         if (mask & (1 << 7)) {
1840             prod2 = float32_mul(v->ZMM_S(i+3), s->ZMM_S(i+3), &env->sse_status);
1841         } else {
1842             prod2 = float32_zero;
1843         }
1844         temp3 = float32_add(prod1, prod2, &env->sse_status);
1845         temp4 = float32_add(temp2, temp3, &env->sse_status);
1846 
1847         d->ZMM_S(i) = (mask & (1 << 0)) ? temp4 : float32_zero;
1848         d->ZMM_S(i+1) = (mask & (1 << 1)) ? temp4 : float32_zero;
1849         d->ZMM_S(i+2) = (mask & (1 << 2)) ? temp4 : float32_zero;
1850         d->ZMM_S(i+3) = (mask & (1 << 3)) ? temp4 : float32_zero;
1851     }
1852 }
1853 
1854 #if SHIFT == 1
1855 /* Oddly, there is no ymm version of dppd */
1856 void glue(helper_dppd, SUFFIX)(CPUX86State *env,
1857                                Reg *d, Reg *v, Reg *s, uint32_t mask)
1858 {
1859     float64 prod1, prod2, temp2;
1860 
1861     if (mask & (1 << 4)) {
1862         prod1 = float64_mul(v->ZMM_D(0), s->ZMM_D(0), &env->sse_status);
1863     } else {
1864         prod1 = float64_zero;
1865     }
1866     if (mask & (1 << 5)) {
1867         prod2 = float64_mul(v->ZMM_D(1), s->ZMM_D(1), &env->sse_status);
1868     } else {
1869         prod2 = float64_zero;
1870     }
1871     temp2 = float64_add(prod1, prod2, &env->sse_status);
1872     d->ZMM_D(0) = (mask & (1 << 0)) ? temp2 : float64_zero;
1873     d->ZMM_D(1) = (mask & (1 << 1)) ? temp2 : float64_zero;
1874 }
1875 #endif
1876 
1877 void glue(helper_mpsadbw, SUFFIX)(CPUX86State *env, Reg *d, Reg *v, Reg *s,
1878                                   uint32_t offset)
1879 {
1880     int i, j;
1881     uint16_t r[8];
1882 
1883     for (j = 0; j < 4 << SHIFT; ) {
1884         int s0 = (j * 2) + ((offset & 3) << 2);
1885         int d0 = (j * 2) + ((offset & 4) << 0);
1886         for (i = 0; i < LANE_WIDTH / 2; i++, d0++) {
1887             r[i] = 0;
1888             r[i] += abs1(v->B(d0 + 0) - s->B(s0 + 0));
1889             r[i] += abs1(v->B(d0 + 1) - s->B(s0 + 1));
1890             r[i] += abs1(v->B(d0 + 2) - s->B(s0 + 2));
1891             r[i] += abs1(v->B(d0 + 3) - s->B(s0 + 3));
1892         }
1893         for (i = 0; i < LANE_WIDTH / 2; i++, j++) {
1894             d->W(j) = r[i];
1895         }
1896         offset >>= 3;
1897     }
1898 }
1899 
1900 /* SSE4.2 op helpers */
1901 #if SHIFT == 1
1902 static inline int pcmp_elen(CPUX86State *env, int reg, uint32_t ctrl)
1903 {
1904     target_long val, limit;
1905 
1906     /* Presence of REX.W is indicated by a bit higher than 7 set */
1907     if (ctrl >> 8) {
1908         val = (target_long)env->regs[reg];
1909     } else {
1910         val = (int32_t)env->regs[reg];
1911     }
1912     if (ctrl & 1) {
1913         limit = 8;
1914     } else {
1915         limit = 16;
1916     }
1917     if ((val > limit) || (val < -limit)) {
1918         return limit;
1919     }
1920     return abs1(val);
1921 }
1922 
1923 static inline int pcmp_ilen(Reg *r, uint8_t ctrl)
1924 {
1925     int val = 0;
1926 
1927     if (ctrl & 1) {
1928         while (val < 8 && r->W(val)) {
1929             val++;
1930         }
1931     } else {
1932         while (val < 16 && r->B(val)) {
1933             val++;
1934         }
1935     }
1936 
1937     return val;
1938 }
1939 
1940 static inline int pcmp_val(Reg *r, uint8_t ctrl, int i)
1941 {
1942     switch ((ctrl >> 0) & 3) {
1943     case 0:
1944         return r->B(i);
1945     case 1:
1946         return r->W(i);
1947     case 2:
1948         return (int8_t)r->B(i);
1949     case 3:
1950     default:
1951         return (int16_t)r->W(i);
1952     }
1953 }
1954 
1955 static inline unsigned pcmpxstrx(CPUX86State *env, Reg *d, Reg *s,
1956                                  uint8_t ctrl, int valids, int validd)
1957 {
1958     unsigned int res = 0;
1959     int v;
1960     int j, i;
1961     int upper = (ctrl & 1) ? 7 : 15;
1962 
1963     valids--;
1964     validd--;
1965 
1966     CC_SRC = (valids < upper ? CC_Z : 0) | (validd < upper ? CC_S : 0);
1967 
1968     switch ((ctrl >> 2) & 3) {
1969     case 0:
1970         for (j = valids; j >= 0; j--) {
1971             res <<= 1;
1972             v = pcmp_val(s, ctrl, j);
1973             for (i = validd; i >= 0; i--) {
1974                 res |= (v == pcmp_val(d, ctrl, i));
1975             }
1976         }
1977         break;
1978     case 1:
1979         for (j = valids; j >= 0; j--) {
1980             res <<= 1;
1981             v = pcmp_val(s, ctrl, j);
1982             for (i = ((validd - 1) | 1); i >= 0; i -= 2) {
1983                 res |= (pcmp_val(d, ctrl, i - 0) >= v &&
1984                         pcmp_val(d, ctrl, i - 1) <= v);
1985             }
1986         }
1987         break;
1988     case 2:
1989         res = (1 << (upper - MAX(valids, validd))) - 1;
1990         res <<= MAX(valids, validd) - MIN(valids, validd);
1991         for (i = MIN(valids, validd); i >= 0; i--) {
1992             res <<= 1;
1993             v = pcmp_val(s, ctrl, i);
1994             res |= (v == pcmp_val(d, ctrl, i));
1995         }
1996         break;
1997     case 3:
1998         if (validd == -1) {
1999             res = (2 << upper) - 1;
2000             break;
2001         }
2002         for (j = valids == upper ? valids : valids - validd; j >= 0; j--) {
2003             res <<= 1;
2004             v = 1;
2005             for (i = MIN(valids - j, validd); i >= 0; i--) {
2006                 v &= (pcmp_val(s, ctrl, i + j) == pcmp_val(d, ctrl, i));
2007             }
2008             res |= v;
2009         }
2010         break;
2011     }
2012 
2013     switch ((ctrl >> 4) & 3) {
2014     case 1:
2015         res ^= (2 << upper) - 1;
2016         break;
2017     case 3:
2018         res ^= (1 << (valids + 1)) - 1;
2019         break;
2020     }
2021 
2022     if (res) {
2023         CC_SRC |= CC_C;
2024     }
2025     if (res & 1) {
2026         CC_SRC |= CC_O;
2027     }
2028 
2029     return res;
2030 }
2031 
2032 void glue(helper_pcmpestri, SUFFIX)(CPUX86State *env, Reg *d, Reg *s,
2033                                     uint32_t ctrl)
2034 {
2035     unsigned int res = pcmpxstrx(env, d, s, ctrl,
2036                                  pcmp_elen(env, R_EDX, ctrl),
2037                                  pcmp_elen(env, R_EAX, ctrl));
2038 
2039     if (res) {
2040         env->regs[R_ECX] = (ctrl & (1 << 6)) ? 31 - clz32(res) : ctz32(res);
2041     } else {
2042         env->regs[R_ECX] = 16 >> (ctrl & (1 << 0));
2043     }
2044 }
2045 
2046 void glue(helper_pcmpestrm, SUFFIX)(CPUX86State *env, Reg *d, Reg *s,
2047                                     uint32_t ctrl)
2048 {
2049     int i;
2050     unsigned int res = pcmpxstrx(env, d, s, ctrl,
2051                                  pcmp_elen(env, R_EDX, ctrl),
2052                                  pcmp_elen(env, R_EAX, ctrl));
2053 
2054     if ((ctrl >> 6) & 1) {
2055         if (ctrl & 1) {
2056             for (i = 0; i < 8; i++, res >>= 1) {
2057                 env->xmm_regs[0].W(i) = (res & 1) ? ~0 : 0;
2058             }
2059         } else {
2060             for (i = 0; i < 16; i++, res >>= 1) {
2061                 env->xmm_regs[0].B(i) = (res & 1) ? ~0 : 0;
2062             }
2063         }
2064     } else {
2065         env->xmm_regs[0].Q(1) = 0;
2066         env->xmm_regs[0].Q(0) = res;
2067     }
2068 }
2069 
2070 void glue(helper_pcmpistri, SUFFIX)(CPUX86State *env, Reg *d, Reg *s,
2071                                     uint32_t ctrl)
2072 {
2073     unsigned int res = pcmpxstrx(env, d, s, ctrl,
2074                                  pcmp_ilen(s, ctrl),
2075                                  pcmp_ilen(d, ctrl));
2076 
2077     if (res) {
2078         env->regs[R_ECX] = (ctrl & (1 << 6)) ? 31 - clz32(res) : ctz32(res);
2079     } else {
2080         env->regs[R_ECX] = 16 >> (ctrl & (1 << 0));
2081     }
2082 }
2083 
2084 void glue(helper_pcmpistrm, SUFFIX)(CPUX86State *env, Reg *d, Reg *s,
2085                                     uint32_t ctrl)
2086 {
2087     int i;
2088     unsigned int res = pcmpxstrx(env, d, s, ctrl,
2089                                  pcmp_ilen(s, ctrl),
2090                                  pcmp_ilen(d, ctrl));
2091 
2092     if ((ctrl >> 6) & 1) {
2093         if (ctrl & 1) {
2094             for (i = 0; i < 8; i++, res >>= 1) {
2095                 env->xmm_regs[0].W(i) = (res & 1) ? ~0 : 0;
2096             }
2097         } else {
2098             for (i = 0; i < 16; i++, res >>= 1) {
2099                 env->xmm_regs[0].B(i) = (res & 1) ? ~0 : 0;
2100             }
2101         }
2102     } else {
2103         env->xmm_regs[0].Q(1) = 0;
2104         env->xmm_regs[0].Q(0) = res;
2105     }
2106 }
2107 
2108 #define CRCPOLY        0x1edc6f41
2109 #define CRCPOLY_BITREV 0x82f63b78
2110 target_ulong helper_crc32(uint32_t crc1, target_ulong msg, uint32_t len)
2111 {
2112     target_ulong crc = (msg & ((target_ulong) -1 >>
2113                                (TARGET_LONG_BITS - len))) ^ crc1;
2114 
2115     while (len--) {
2116         crc = (crc >> 1) ^ ((crc & 1) ? CRCPOLY_BITREV : 0);
2117     }
2118 
2119     return crc;
2120 }
2121 
2122 #endif
2123 
2124 #if SHIFT == 1
/*
 * Carry-less (polynomial over GF(2)) multiply of two 64-bit values.
 * The 128-bit product is returned as *dest_l (low half) and *dest_h
 * (high half).
 */
static void clmulq(uint64_t *dest_l, uint64_t *dest_h,
                   uint64_t a, uint64_t b)
{
    uint64_t prod_lo = 0, prod_hi = 0;
    /* a shifted left by the current bit position, kept as a 128-bit pair */
    uint64_t term_lo = a, term_hi = 0;

    for (; b != 0; b >>= 1) {
        if (b & 1) {
            prod_lo ^= term_lo;
            prod_hi ^= term_hi;
        }
        term_hi = (term_hi << 1) | (term_lo >> 63);
        term_lo <<= 1;
    }

    *dest_l = prod_lo;
    *dest_h = prod_hi;
}
2147 #endif
2148 
2149 void glue(helper_pclmulqdq, SUFFIX)(CPUX86State *env, Reg *d, Reg *v, Reg *s,
2150                                     uint32_t ctrl)
2151 {
2152     uint64_t a, b;
2153     int i;
2154 
2155     for (i = 0; i < 1 << SHIFT; i += 2) {
2156         a = v->Q(((ctrl & 1) != 0) + i);
2157         b = s->Q(((ctrl & 16) != 0) + i);
2158         clmulq(&d->Q(i), &d->Q(i + 1), a, b);
2159     }
2160 }
2161 
2162 void glue(helper_aesdec, SUFFIX)(CPUX86State *env, Reg *d, Reg *v, Reg *s)
2163 {
2164     int i;
2165     Reg st = *v;
2166     Reg rk = *s;
2167 
2168     for (i = 0 ; i < 2 << SHIFT ; i++) {
2169         int j = i & 3;
2170         d->L(i) = rk.L(i) ^ bswap32(AES_Td0[st.B(AES_ishifts[4 * j + 0])] ^
2171                                     AES_Td1[st.B(AES_ishifts[4 * j + 1])] ^
2172                                     AES_Td2[st.B(AES_ishifts[4 * j + 2])] ^
2173                                     AES_Td3[st.B(AES_ishifts[4 * j + 3])]);
2174     }
2175 }
2176 
2177 void glue(helper_aesdeclast, SUFFIX)(CPUX86State *env, Reg *d, Reg *v, Reg *s)
2178 {
2179     int i;
2180     Reg st = *v;
2181     Reg rk = *s;
2182 
2183     for (i = 0; i < 8 << SHIFT; i++) {
2184         d->B(i) = rk.B(i) ^ (AES_isbox[st.B(AES_ishifts[i & 15] + (i & ~15))]);
2185     }
2186 }
2187 
2188 void glue(helper_aesenc, SUFFIX)(CPUX86State *env, Reg *d, Reg *v, Reg *s)
2189 {
2190     int i;
2191     Reg st = *v;
2192     Reg rk = *s;
2193 
2194     for (i = 0 ; i < 2 << SHIFT ; i++) {
2195         int j = i & 3;
2196         d->L(i) = rk.L(i) ^ bswap32(AES_Te0[st.B(AES_shifts[4 * j + 0])] ^
2197                                     AES_Te1[st.B(AES_shifts[4 * j + 1])] ^
2198                                     AES_Te2[st.B(AES_shifts[4 * j + 2])] ^
2199                                     AES_Te3[st.B(AES_shifts[4 * j + 3])]);
2200     }
2201 }
2202 
2203 void glue(helper_aesenclast, SUFFIX)(CPUX86State *env, Reg *d, Reg *v, Reg *s)
2204 {
2205     int i;
2206     Reg st = *v;
2207     Reg rk = *s;
2208 
2209     for (i = 0; i < 8 << SHIFT; i++) {
2210         d->B(i) = rk.B(i) ^ (AES_sbox[st.B(AES_shifts[i & 15] + (i & ~15))]);
2211     }
2212 }
2213 
2214 #if SHIFT == 1
2215 void glue(helper_aesimc, SUFFIX)(CPUX86State *env, Reg *d, Reg *s)
2216 {
2217     int i;
2218     Reg tmp = *s;
2219 
2220     for (i = 0 ; i < 4 ; i++) {
2221         d->L(i) = bswap32(AES_imc[tmp.B(4 * i + 0)][0] ^
2222                           AES_imc[tmp.B(4 * i + 1)][1] ^
2223                           AES_imc[tmp.B(4 * i + 2)][2] ^
2224                           AES_imc[tmp.B(4 * i + 3)][3]);
2225     }
2226 }
2227 
2228 void glue(helper_aeskeygenassist, SUFFIX)(CPUX86State *env, Reg *d, Reg *s,
2229                                           uint32_t ctrl)
2230 {
2231     int i;
2232     Reg tmp = *s;
2233 
2234     for (i = 0 ; i < 4 ; i++) {
2235         d->B(i) = AES_sbox[tmp.B(i + 4)];
2236         d->B(i + 8) = AES_sbox[tmp.B(i + 12)];
2237     }
2238     d->L(1) = (d->L(0) << 24 | d->L(0) >> 8) ^ ctrl;
2239     d->L(3) = (d->L(2) << 24 | d->L(2) >> 8) ^ ctrl;
2240 }
2241 #endif
2242 #endif
2243 
2244 #if SHIFT >= 1
2245 void glue(helper_vpermilpd, SUFFIX)(CPUX86State *env, Reg *d, Reg *v, Reg *s)
2246 {
2247     uint64_t r0, r1;
2248     int i;
2249 
2250     for (i = 0; i < 1 << SHIFT; i += 2) {
2251         r0 = v->Q(i + ((s->Q(i) >> 1) & 1));
2252         r1 = v->Q(i + ((s->Q(i+1) >> 1) & 1));
2253         d->Q(i) = r0;
2254         d->Q(i+1) = r1;
2255     }
2256 }
2257 
2258 void glue(helper_vpermilps, SUFFIX)(CPUX86State *env, Reg *d, Reg *v, Reg *s)
2259 {
2260     uint32_t r0, r1, r2, r3;
2261     int i;
2262 
2263     for (i = 0; i < 2 << SHIFT; i += 4) {
2264         r0 = v->L(i + (s->L(i) & 3));
2265         r1 = v->L(i + (s->L(i+1) & 3));
2266         r2 = v->L(i + (s->L(i+2) & 3));
2267         r3 = v->L(i + (s->L(i+3) & 3));
2268         d->L(i) = r0;
2269         d->L(i+1) = r1;
2270         d->L(i+2) = r2;
2271         d->L(i+3) = r3;
2272     }
2273 }
2274 
2275 void glue(helper_vpermilpd_imm, SUFFIX)(Reg *d, Reg *s, uint32_t order)
2276 {
2277     uint64_t r0, r1;
2278     int i;
2279 
2280     for (i = 0; i < 1 << SHIFT; i += 2) {
2281         r0 = s->Q(i + ((order >> 0) & 1));
2282         r1 = s->Q(i + ((order >> 1) & 1));
2283         d->Q(i) = r0;
2284         d->Q(i+1) = r1;
2285 
2286         order >>= 2;
2287     }
2288 }
2289 
2290 void glue(helper_vpermilps_imm, SUFFIX)(Reg *d, Reg *s, uint32_t order)
2291 {
2292     uint32_t r0, r1, r2, r3;
2293     int i;
2294 
2295     for (i = 0; i < 2 << SHIFT; i += 4) {
2296         r0 = s->L(i + ((order >> 0) & 3));
2297         r1 = s->L(i + ((order >> 2) & 3));
2298         r2 = s->L(i + ((order >> 4) & 3));
2299         r3 = s->L(i + ((order >> 6) & 3));
2300         d->L(i) = r0;
2301         d->L(i+1) = r1;
2302         d->L(i+2) = r2;
2303         d->L(i+3) = r3;
2304     }
2305 }
2306 
2307 #if SHIFT == 1
2308 #define FPSRLVD(x, c) (c < 32 ? ((x) >> c) : 0)
2309 #define FPSRLVQ(x, c) (c < 64 ? ((x) >> c) : 0)
2310 #define FPSRAVD(x, c) ((int32_t)(x) >> (c < 32 ? c : 31))
2311 #define FPSRAVQ(x, c) ((int64_t)(x) >> (c < 64 ? c : 63))
2312 #define FPSLLVD(x, c) (c < 32 ? ((x) << c) : 0)
2313 #define FPSLLVQ(x, c) (c < 64 ? ((x) << c) : 0)
2314 #endif
2315 
2316 SSE_HELPER_L(helper_vpsrlvd, FPSRLVD)
2317 SSE_HELPER_L(helper_vpsravd, FPSRAVD)
2318 SSE_HELPER_L(helper_vpsllvd, FPSLLVD)
2319 
2320 SSE_HELPER_Q(helper_vpsrlvq, FPSRLVQ)
2321 SSE_HELPER_Q(helper_vpsravq, FPSRAVQ)
2322 SSE_HELPER_Q(helper_vpsllvq, FPSLLVQ)
2323 
2324 void glue(helper_vtestps, SUFFIX)(CPUX86State *env, Reg *d, Reg *s)
2325 {
2326     uint32_t zf = 0, cf = 0;
2327     int i;
2328 
2329     for (i = 0; i < 2 << SHIFT; i++) {
2330         zf |= (s->L(i) &  d->L(i));
2331         cf |= (s->L(i) & ~d->L(i));
2332     }
2333     CC_SRC = ((zf >> 31) ? 0 : CC_Z) | ((cf >> 31) ? 0 : CC_C);
2334 }
2335 
2336 void glue(helper_vtestpd, SUFFIX)(CPUX86State *env, Reg *d, Reg *s)
2337 {
2338     uint64_t zf = 0, cf = 0;
2339     int i;
2340 
2341     for (i = 0; i < 1 << SHIFT; i++) {
2342         zf |= (s->Q(i) &  d->Q(i));
2343         cf |= (s->Q(i) & ~d->Q(i));
2344     }
2345     CC_SRC = ((zf >> 63) ? 0 : CC_Z) | ((cf >> 63) ? 0 : CC_C);
2346 }
2347 
2348 void glue(helper_vpmaskmovd_st, SUFFIX)(CPUX86State *env,
2349                                         Reg *v, Reg *s, target_ulong a0)
2350 {
2351     int i;
2352 
2353     for (i = 0; i < (2 << SHIFT); i++) {
2354         if (v->L(i) >> 31) {
2355             cpu_stl_data_ra(env, a0 + i * 4, s->L(i), GETPC());
2356         }
2357     }
2358 }
2359 
2360 void glue(helper_vpmaskmovq_st, SUFFIX)(CPUX86State *env,
2361                                         Reg *v, Reg *s, target_ulong a0)
2362 {
2363     int i;
2364 
2365     for (i = 0; i < (1 << SHIFT); i++) {
2366         if (v->Q(i) >> 63) {
2367             cpu_stq_data_ra(env, a0 + i * 8, s->Q(i), GETPC());
2368         }
2369     }
2370 }
2371 
2372 void glue(helper_vpmaskmovd, SUFFIX)(CPUX86State *env, Reg *d, Reg *v, Reg *s)
2373 {
2374     int i;
2375 
2376     for (i = 0; i < (2 << SHIFT); i++) {
2377         d->L(i) = (v->L(i) >> 31) ? s->L(i) : 0;
2378     }
2379 }
2380 
2381 void glue(helper_vpmaskmovq, SUFFIX)(CPUX86State *env, Reg *d, Reg *v, Reg *s)
2382 {
2383     int i;
2384 
2385     for (i = 0; i < (1 << SHIFT); i++) {
2386         d->Q(i) = (v->Q(i) >> 63) ? s->Q(i) : 0;
2387     }
2388 }
2389 
2390 void glue(helper_vpgatherdd, SUFFIX)(CPUX86State *env,
2391         Reg *d, Reg *v, Reg *s, target_ulong a0, unsigned scale)
2392 {
2393     int i;
2394     for (i = 0; i < (2 << SHIFT); i++) {
2395         if (v->L(i) >> 31) {
2396             target_ulong addr = a0
2397                 + ((target_ulong)(int32_t)s->L(i) << scale);
2398             d->L(i) = cpu_ldl_data_ra(env, addr, GETPC());
2399         }
2400         v->L(i) = 0;
2401     }
2402 }
2403 
2404 void glue(helper_vpgatherdq, SUFFIX)(CPUX86State *env,
2405         Reg *d, Reg *v, Reg *s, target_ulong a0, unsigned scale)
2406 {
2407     int i;
2408     for (i = 0; i < (1 << SHIFT); i++) {
2409         if (v->Q(i) >> 63) {
2410             target_ulong addr = a0
2411                 + ((target_ulong)(int32_t)s->L(i) << scale);
2412             d->Q(i) = cpu_ldq_data_ra(env, addr, GETPC());
2413         }
2414         v->Q(i) = 0;
2415     }
2416 }
2417 
2418 void glue(helper_vpgatherqd, SUFFIX)(CPUX86State *env,
2419         Reg *d, Reg *v, Reg *s, target_ulong a0, unsigned scale)
2420 {
2421     int i;
2422     for (i = 0; i < (1 << SHIFT); i++) {
2423         if (v->L(i) >> 31) {
2424             target_ulong addr = a0
2425                 + ((target_ulong)(int64_t)s->Q(i) << scale);
2426             d->L(i) = cpu_ldl_data_ra(env, addr, GETPC());
2427         }
2428         v->L(i) = 0;
2429     }
2430     for (i /= 2; i < 1 << SHIFT; i++) {
2431         d->Q(i) = 0;
2432         v->Q(i) = 0;
2433     }
2434 }
2435 
2436 void glue(helper_vpgatherqq, SUFFIX)(CPUX86State *env,
2437         Reg *d, Reg *v, Reg *s, target_ulong a0, unsigned scale)
2438 {
2439     int i;
2440     for (i = 0; i < (1 << SHIFT); i++) {
2441         if (v->Q(i) >> 63) {
2442             target_ulong addr = a0
2443                 + ((target_ulong)(int64_t)s->Q(i) << scale);
2444             d->Q(i) = cpu_ldq_data_ra(env, addr, GETPC());
2445         }
2446         v->Q(i) = 0;
2447     }
2448 }
2449 #endif
2450 
2451 #if SHIFT >= 2
2452 void helper_vpermdq_ymm(Reg *d, Reg *v, Reg *s, uint32_t order)
2453 {
2454     uint64_t r0, r1, r2, r3;
2455 
2456     switch (order & 3) {
2457     case 0:
2458         r0 = v->Q(0);
2459         r1 = v->Q(1);
2460         break;
2461     case 1:
2462         r0 = v->Q(2);
2463         r1 = v->Q(3);
2464         break;
2465     case 2:
2466         r0 = s->Q(0);
2467         r1 = s->Q(1);
2468         break;
2469     case 3:
2470         r0 = s->Q(2);
2471         r1 = s->Q(3);
2472         break;
2473     default: /* default case added to help the compiler to avoid warnings */
2474         g_assert_not_reached();
2475     }
2476     switch ((order >> 4) & 3) {
2477     case 0:
2478         r2 = v->Q(0);
2479         r3 = v->Q(1);
2480         break;
2481     case 1:
2482         r2 = v->Q(2);
2483         r3 = v->Q(3);
2484         break;
2485     case 2:
2486         r2 = s->Q(0);
2487         r3 = s->Q(1);
2488         break;
2489     case 3:
2490         r2 = s->Q(2);
2491         r3 = s->Q(3);
2492         break;
2493     default: /* default case added to help the compiler to avoid warnings */
2494         g_assert_not_reached();
2495     }
2496     d->Q(0) = r0;
2497     d->Q(1) = r1;
2498     d->Q(2) = r2;
2499     d->Q(3) = r3;
2500 }
2501 
2502 void helper_vpermq_ymm(Reg *d, Reg *s, uint32_t order)
2503 {
2504     uint64_t r0, r1, r2, r3;
2505     r0 = s->Q(order & 3);
2506     r1 = s->Q((order >> 2) & 3);
2507     r2 = s->Q((order >> 4) & 3);
2508     r3 = s->Q((order >> 6) & 3);
2509     d->Q(0) = r0;
2510     d->Q(1) = r1;
2511     d->Q(2) = r2;
2512     d->Q(3) = r3;
2513 }
2514 
2515 void helper_vpermd_ymm(Reg *d, Reg *v, Reg *s)
2516 {
2517     uint32_t r[8];
2518     int i;
2519 
2520     for (i = 0; i < 8; i++) {
2521         r[i] = s->L(v->L(i) & 7);
2522     }
2523     for (i = 0; i < 8; i++) {
2524         d->L(i) = r[i];
2525     }
2526 }
2527 #endif
2528 
/* FMA3 op helpers */
/*
 * SSE_HELPER_FMAS(name, elem, F): scalar form.  Defines `name`, which
 * stores F(a, b, c, flags, &env->sse_status) for element 0 only; no other
 * element of d is written.
 *
 * SSE_HELPER_FMAP(name, elem, num, F): packed form.  Defines
 * name + SUFFIX, which applies F to the first num elements; flags is
 * XORed with flip after every element, so a non-zero flip alternates the
 * flags between consecutive elements.
 *
 * NOTE(review): both macros are only defined when SHIFT == 1, yet
 * SSE_HELPER_FMAP is instantiated for every SHIFT >= 1 below; presumably
 * the SHIFT == 1 pass over this file comes first and the definition
 * persists - confirm against the including file.
 */
#if SHIFT == 1
#define SSE_HELPER_FMAS(name, elem, F)                                         \
    void name(CPUX86State *env, Reg *d, Reg *a, Reg *b, Reg *c, int flags)     \
    {                                                                          \
        d->elem(0) = F(a->elem(0), b->elem(0), c->elem(0), flags, &env->sse_status); \
    }
#define SSE_HELPER_FMAP(name, elem, num, F)                                    \
    void glue(name, SUFFIX)(CPUX86State *env, Reg *d, Reg *a, Reg *b, Reg *c,  \
                            int flags, int flip)                               \
    {                                                                          \
        int i;                                                                 \
        for (i = 0; i < num; i++) {                                            \
            d->elem(i) = F(a->elem(i), b->elem(i), c->elem(i), flags, &env->sse_status); \
            flags ^= flip;                                                     \
        }                                                                      \
    }

/* Scalar single/double helpers; no SUFFIX, so they are emitted once. */
SSE_HELPER_FMAS(helper_fma4ss,  ZMM_S, float32_muladd)
SSE_HELPER_FMAS(helper_fma4sd,  ZMM_D, float64_muladd)
#endif

#if SHIFT >= 1
/* Packed single/double helpers, one per vector width. */
SSE_HELPER_FMAP(helper_fma4ps,  ZMM_S, 2 << SHIFT, float32_muladd)
SSE_HELPER_FMAP(helper_fma4pd,  ZMM_D, 1 << SHIFT, float64_muladd)
#endif
2555 
/*
 * Drop the width-dependent macros set up at the top of the file
 * (including SHIFT itself), presumably so the includer can define a new
 * SHIFT and include this file again - confirm against the including file.
 */
#undef SSE_HELPER_S

#undef LANE_WIDTH
#undef SHIFT
#undef XMM_ONLY
#undef Reg
#undef B
#undef W
#undef L
#undef Q
#undef SUFFIX
2567