/*
 * Tiny Code Generator for QEMU
 *
 * Copyright (c) 2008 Fabrice Bellard
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 * THE SOFTWARE.
 */

#include "../tcg-ldst.c.inc"
#include "../tcg-pool.c.inc"

#ifdef CONFIG_DEBUG_TCG
static const char * const tcg_target_reg_names[TCG_TARGET_NB_REGS] = {
#if TCG_TARGET_REG_BITS == 64
    "%rax", "%rcx", "%rdx", "%rbx", "%rsp", "%rbp", "%rsi", "%rdi",
#else
    "%eax", "%ecx", "%edx", "%ebx", "%esp", "%ebp", "%esi", "%edi",
#endif
    "%r8",  "%r9",  "%r10", "%r11", "%r12", "%r13", "%r14", "%r15",
    "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7",
#if TCG_TARGET_REG_BITS == 64
    "%xmm8", "%xmm9", "%xmm10", "%xmm11",
    "%xmm12", "%xmm13", "%xmm14", "%xmm15",
#endif
};
#endif

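/*
 * Call-saved registers are listed first, so that values the allocator
 * places in them are more likely to survive across helper calls.
 */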
static const int tcg_target_reg_alloc_order[] = {
#if TCG_TARGET_REG_BITS == 64
    TCG_REG_RBP,
    TCG_REG_RBX,
    TCG_REG_R12,
    TCG_REG_R13,
    TCG_REG_R14,
    TCG_REG_R15,
    TCG_REG_R10,
    TCG_REG_R11,
    TCG_REG_R9,
    TCG_REG_R8,
    TCG_REG_RCX,
    TCG_REG_RDX,
    TCG_REG_RSI,
    TCG_REG_RDI,
    TCG_REG_RAX,
#else
    TCG_REG_EBX,
    TCG_REG_ESI,
    TCG_REG_EDI,
    TCG_REG_EBP,
    TCG_REG_ECX,
    TCG_REG_EDX,
    TCG_REG_EAX,
#endif
    TCG_REG_XMM0,
    TCG_REG_XMM1,
    TCG_REG_XMM2,
    TCG_REG_XMM3,
    TCG_REG_XMM4,
    TCG_REG_XMM5,
#ifndef _WIN64
    /* The Win64 ABI has xmm6-xmm15 as callee-saves, and we do not save
       any of them.  Therefore only allow xmm0-xmm5 to be allocated.  */
    TCG_REG_XMM6,
    TCG_REG_XMM7,
#if TCG_TARGET_REG_BITS == 64
    TCG_REG_XMM8,
    TCG_REG_XMM9,
    TCG_REG_XMM10,
    TCG_REG_XMM11,
    TCG_REG_XMM12,
    TCG_REG_XMM13,
    TCG_REG_XMM14,
    TCG_REG_XMM15,
#endif
#endif
};

static const int tcg_target_call_iarg_regs[] = {
#if TCG_TARGET_REG_BITS == 64
#if defined(_WIN64)
    TCG_REG_RCX,
    TCG_REG_RDX,
#else
    TCG_REG_RDI,
    TCG_REG_RSI,
    TCG_REG_RDX,
    TCG_REG_RCX,
#endif
    TCG_REG_R8,
    TCG_REG_R9,
#else
    /* 32-bit mode uses a stack-based calling convention (the GCC default). */
#endif
};

static TCGReg tcg_target_call_oarg_reg(TCGCallReturnKind kind, int slot)
{
    switch (kind) {
    case TCG_CALL_RET_NORMAL:
        tcg_debug_assert(slot >= 0 && slot <= 1);
        return slot ? TCG_REG_EDX : TCG_REG_EAX;
#ifdef _WIN64
    case TCG_CALL_RET_BY_VEC:
        tcg_debug_assert(slot == 0);
        return TCG_REG_XMM0;
#endif
    default:
        g_assert_not_reached();
    }
}

/* Constants we accept.  */
#define TCG_CT_CONST_S32 0x100
#define TCG_CT_CONST_U32 0x200
#define TCG_CT_CONST_I32 0x400
#define TCG_CT_CONST_WSZ 0x800
/* Registers used with the L constraint, which are the first argument
   registers on x86_64, and two arbitrary call-clobbered registers on
   i386.  */
#if TCG_TARGET_REG_BITS == 64
# define TCG_REG_L0 tcg_target_call_iarg_regs[0]
# define TCG_REG_L1 tcg_target_call_iarg_regs[1]
#else
# define TCG_REG_L0 TCG_REG_EAX
# define TCG_REG_L1 TCG_REG_EDX
#endif

#define ALL_BYTEH_REGS         0x0000000fu
#if TCG_TARGET_REG_BITS == 64
# define ALL_GENERAL_REGS      0x0000ffffu
# define ALL_VECTOR_REGS       0xffff0000u
# define ALL_BYTEL_REGS        ALL_GENERAL_REGS
#else
# define ALL_GENERAL_REGS      0x000000ffu
# define ALL_VECTOR_REGS       0x00ff0000u
# define ALL_BYTEL_REGS        ALL_BYTEH_REGS
#endif
#ifdef CONFIG_SOFTMMU
# define SOFTMMU_RESERVE_REGS  ((1 << TCG_REG_L0) | (1 << TCG_REG_L1))
#else
# define SOFTMMU_RESERVE_REGS  0
#endif

/* The host compiler should supply <cpuid.h> to enable runtime feature
   detection, as we're not going to go so far as writing our own inline
   assembly.  If it is not available, default values will be assumed.  */
#if defined(CONFIG_CPUID_H)
#include "qemu/cpuid.h"
#endif

/* For 64-bit, we always know that CMOV is available.  */
#if TCG_TARGET_REG_BITS == 64
# define have_cmov 1
#elif defined(CONFIG_CPUID_H)
static bool have_cmov;
#else
# define have_cmov 0
#endif

/* We need these symbols in tcg-target.h, and we can't properly conditionalize
   them there.  Therefore we always define the variables.  */
bool have_bmi1;
bool have_popcnt;
bool have_avx1;
bool have_avx2;
bool have_avx512bw;
bool have_avx512dq;
bool have_avx512vbmi2;
bool have_avx512vl;
bool have_movbe;

#ifdef CONFIG_CPUID_H
static bool have_bmi2;
static bool have_lzcnt;
#else
# define have_bmi2 0
# define have_lzcnt 0
#endif

static const tcg_insn_unit *tb_ret_addr;

static bool patch_reloc(tcg_insn_unit *code_ptr, int type,
                        intptr_t value, intptr_t addend)
{
    value += addend;
    switch(type) {
    case R_386_PC32:
        value -= (uintptr_t)tcg_splitwx_to_rx(code_ptr);
        if (value != (int32_t)value) {
            return false;
        }
        /* FALLTHRU */
    case R_386_32:
        tcg_patch32(code_ptr, value);
        break;
    case R_386_PC8:
        value -= (uintptr_t)tcg_splitwx_to_rx(code_ptr);
        if (value != (int8_t)value) {
            return false;
        }
        tcg_patch8(code_ptr, value);
        break;
    default:
        g_assert_not_reached();
    }
    return true;
}
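
/*
 * Illustrative example: the R_386_PC32 relocations emitted by tcg_out_jxx
 * below are registered with addend -4, because code_ptr points at the
 * 32-bit displacement field and x86 branch displacements are relative to
 * the end of that field; the patched value is thus
 * target + (-4) - field_address.
 */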

/* test if a constant matches the constraint */
static bool tcg_target_const_match(int64_t val, TCGType type, int ct)
{
    if (ct & TCG_CT_CONST) {
        return 1;
    }
    if (type == TCG_TYPE_I32) {
        if (ct & (TCG_CT_CONST_S32 | TCG_CT_CONST_U32 | TCG_CT_CONST_I32)) {
            return 1;
        }
    } else {
        if ((ct & TCG_CT_CONST_S32) && val == (int32_t)val) {
            return 1;
        }
        if ((ct & TCG_CT_CONST_U32) && val == (uint32_t)val) {
            return 1;
        }
        if ((ct & TCG_CT_CONST_I32) && ~val == (int32_t)~val) {
            return 1;
        }
    }
    if ((ct & TCG_CT_CONST_WSZ) && val == (type == TCG_TYPE_I32 ? 32 : 64)) {
        return 1;
    }
    return 0;
}
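
/*
 * Illustrative example: for a 64-bit operand, val = 0xfffffffffffff000
 * fails TCG_CT_CONST_U32 but matches TCG_CT_CONST_I32, because
 * ~val == 0xfff fits in a sign-extended 32-bit immediate, so the inverted
 * form can be used with instructions such as ANDN.
 */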

# define LOWREGMASK(x)	((x) & 7)

#define P_EXT		0x100		/* 0x0f opcode prefix */
#define P_EXT38         0x200           /* 0x0f 0x38 opcode prefix */
#define P_DATA16        0x400           /* 0x66 opcode prefix */
#define P_VEXW          0x1000          /* Set VEX.W = 1 */
#if TCG_TARGET_REG_BITS == 64
# define P_REXW         P_VEXW          /* Set REX.W = 1; match VEXW */
# define P_REXB_R       0x2000          /* REG field as byte register */
# define P_REXB_RM      0x4000          /* R/M field as byte register */
# define P_GS           0x8000          /* gs segment override */
#else
# define P_REXW		0
# define P_REXB_R	0
# define P_REXB_RM	0
# define P_GS           0
#endif
#define P_EXT3A         0x10000         /* 0x0f 0x3a opcode prefix */
#define P_SIMDF3        0x20000         /* 0xf3 opcode prefix */
#define P_SIMDF2        0x40000         /* 0xf2 opcode prefix */
#define P_VEXL          0x80000         /* Set VEX.L = 1 */
#define P_EVEX          0x100000        /* Requires EVEX encoding */

#define OPC_ARITH_EvIz	(0x81)
#define OPC_ARITH_EvIb	(0x83)
#define OPC_ARITH_GvEv	(0x03)		/* ... plus (ARITH_FOO << 3) */
#define OPC_ANDN        (0xf2 | P_EXT38)
#define OPC_ADD_GvEv	(OPC_ARITH_GvEv | (ARITH_ADD << 3))
#define OPC_AND_GvEv    (OPC_ARITH_GvEv | (ARITH_AND << 3))
#define OPC_BLENDPS     (0x0c | P_EXT3A | P_DATA16)
#define OPC_BSF         (0xbc | P_EXT)
#define OPC_BSR         (0xbd | P_EXT)
#define OPC_BSWAP	(0xc8 | P_EXT)
#define OPC_CALL_Jz	(0xe8)
#define OPC_CMOVCC      (0x40 | P_EXT)  /* ... plus condition code */
#define OPC_CMP_GvEv	(OPC_ARITH_GvEv | (ARITH_CMP << 3))
#define OPC_DEC_r32	(0x48)
#define OPC_IMUL_GvEv	(0xaf | P_EXT)
#define OPC_IMUL_GvEvIb	(0x6b)
#define OPC_IMUL_GvEvIz	(0x69)
#define OPC_INC_r32	(0x40)
#define OPC_JCC_long	(0x80 | P_EXT)	/* ... plus condition code */
#define OPC_JCC_short	(0x70)		/* ... plus condition code */
#define OPC_JMP_long	(0xe9)
#define OPC_JMP_short	(0xeb)
#define OPC_LEA         (0x8d)
#define OPC_LZCNT       (0xbd | P_EXT | P_SIMDF3)
#define OPC_MOVB_EvGv	(0x88)		/* stores, more or less */
#define OPC_MOVL_EvGv	(0x89)		/* stores, more or less */
#define OPC_MOVL_GvEv	(0x8b)		/* loads, more or less */
#define OPC_MOVB_EvIz   (0xc6)
#define OPC_MOVL_EvIz	(0xc7)
#define OPC_MOVL_Iv     (0xb8)
#define OPC_MOVBE_GyMy  (0xf0 | P_EXT38)
#define OPC_MOVBE_MyGy  (0xf1 | P_EXT38)
#define OPC_MOVD_VyEy   (0x6e | P_EXT | P_DATA16)
#define OPC_MOVD_EyVy   (0x7e | P_EXT | P_DATA16)
#define OPC_MOVDDUP     (0x12 | P_EXT | P_SIMDF2)
#define OPC_MOVDQA_VxWx (0x6f | P_EXT | P_DATA16)
#define OPC_MOVDQA_WxVx (0x7f | P_EXT | P_DATA16)
#define OPC_MOVDQU_VxWx (0x6f | P_EXT | P_SIMDF3)
#define OPC_MOVDQU_WxVx (0x7f | P_EXT | P_SIMDF3)
#define OPC_MOVQ_VqWq   (0x7e | P_EXT | P_SIMDF3)
#define OPC_MOVQ_WqVq   (0xd6 | P_EXT | P_DATA16)
#define OPC_MOVSBL	(0xbe | P_EXT)
#define OPC_MOVSWL	(0xbf | P_EXT)
#define OPC_MOVSLQ	(0x63 | P_REXW)
#define OPC_MOVZBL	(0xb6 | P_EXT)
#define OPC_MOVZWL	(0xb7 | P_EXT)
#define OPC_PABSB       (0x1c | P_EXT38 | P_DATA16)
#define OPC_PABSW       (0x1d | P_EXT38 | P_DATA16)
#define OPC_PABSD       (0x1e | P_EXT38 | P_DATA16)
#define OPC_VPABSQ      (0x1f | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX)
#define OPC_PACKSSDW    (0x6b | P_EXT | P_DATA16)
#define OPC_PACKSSWB    (0x63 | P_EXT | P_DATA16)
#define OPC_PACKUSDW    (0x2b | P_EXT38 | P_DATA16)
#define OPC_PACKUSWB    (0x67 | P_EXT | P_DATA16)
#define OPC_PADDB       (0xfc | P_EXT | P_DATA16)
#define OPC_PADDW       (0xfd | P_EXT | P_DATA16)
#define OPC_PADDD       (0xfe | P_EXT | P_DATA16)
#define OPC_PADDQ       (0xd4 | P_EXT | P_DATA16)
#define OPC_PADDSB      (0xec | P_EXT | P_DATA16)
#define OPC_PADDSW      (0xed | P_EXT | P_DATA16)
#define OPC_PADDUB      (0xdc | P_EXT | P_DATA16)
#define OPC_PADDUW      (0xdd | P_EXT | P_DATA16)
#define OPC_PAND        (0xdb | P_EXT | P_DATA16)
#define OPC_PANDN       (0xdf | P_EXT | P_DATA16)
#define OPC_PBLENDW     (0x0e | P_EXT3A | P_DATA16)
#define OPC_PCMPEQB     (0x74 | P_EXT | P_DATA16)
#define OPC_PCMPEQW     (0x75 | P_EXT | P_DATA16)
#define OPC_PCMPEQD     (0x76 | P_EXT | P_DATA16)
#define OPC_PCMPEQQ     (0x29 | P_EXT38 | P_DATA16)
#define OPC_PCMPGTB     (0x64 | P_EXT | P_DATA16)
#define OPC_PCMPGTW     (0x65 | P_EXT | P_DATA16)
#define OPC_PCMPGTD     (0x66 | P_EXT | P_DATA16)
#define OPC_PCMPGTQ     (0x37 | P_EXT38 | P_DATA16)
#define OPC_PMAXSB      (0x3c | P_EXT38 | P_DATA16)
#define OPC_PMAXSW      (0xee | P_EXT | P_DATA16)
#define OPC_PMAXSD      (0x3d | P_EXT38 | P_DATA16)
#define OPC_VPMAXSQ     (0x3d | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX)
#define OPC_PMAXUB      (0xde | P_EXT | P_DATA16)
#define OPC_PMAXUW      (0x3e | P_EXT38 | P_DATA16)
#define OPC_PMAXUD      (0x3f | P_EXT38 | P_DATA16)
#define OPC_VPMAXUQ     (0x3f | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX)
#define OPC_PMINSB      (0x38 | P_EXT38 | P_DATA16)
#define OPC_PMINSW      (0xea | P_EXT | P_DATA16)
#define OPC_PMINSD      (0x39 | P_EXT38 | P_DATA16)
#define OPC_VPMINSQ     (0x39 | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX)
#define OPC_PMINUB      (0xda | P_EXT | P_DATA16)
#define OPC_PMINUW      (0x3a | P_EXT38 | P_DATA16)
#define OPC_PMINUD      (0x3b | P_EXT38 | P_DATA16)
#define OPC_VPMINUQ     (0x3b | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX)
#define OPC_PMOVSXBW    (0x20 | P_EXT38 | P_DATA16)
#define OPC_PMOVSXWD    (0x23 | P_EXT38 | P_DATA16)
#define OPC_PMOVSXDQ    (0x25 | P_EXT38 | P_DATA16)
#define OPC_PMOVZXBW    (0x30 | P_EXT38 | P_DATA16)
#define OPC_PMOVZXWD    (0x33 | P_EXT38 | P_DATA16)
#define OPC_PMOVZXDQ    (0x35 | P_EXT38 | P_DATA16)
#define OPC_PMULLW      (0xd5 | P_EXT | P_DATA16)
#define OPC_PMULLD      (0x40 | P_EXT38 | P_DATA16)
#define OPC_VPMULLQ     (0x40 | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX)
#define OPC_POR         (0xeb | P_EXT | P_DATA16)
#define OPC_PSHUFB      (0x00 | P_EXT38 | P_DATA16)
#define OPC_PSHUFD      (0x70 | P_EXT | P_DATA16)
#define OPC_PSHUFLW     (0x70 | P_EXT | P_SIMDF2)
#define OPC_PSHUFHW     (0x70 | P_EXT | P_SIMDF3)
#define OPC_PSHIFTW_Ib  (0x71 | P_EXT | P_DATA16) /* /2 /6 /4 */
#define OPC_PSHIFTD_Ib  (0x72 | P_EXT | P_DATA16) /* /1 /2 /6 /4 */
#define OPC_PSHIFTQ_Ib  (0x73 | P_EXT | P_DATA16) /* /2 /6 /4 */
#define OPC_PSLLW       (0xf1 | P_EXT | P_DATA16)
#define OPC_PSLLD       (0xf2 | P_EXT | P_DATA16)
#define OPC_PSLLQ       (0xf3 | P_EXT | P_DATA16)
#define OPC_PSRAW       (0xe1 | P_EXT | P_DATA16)
#define OPC_PSRAD       (0xe2 | P_EXT | P_DATA16)
#define OPC_VPSRAQ      (0xe2 | P_EXT | P_DATA16 | P_VEXW | P_EVEX)
#define OPC_PSRLW       (0xd1 | P_EXT | P_DATA16)
#define OPC_PSRLD       (0xd2 | P_EXT | P_DATA16)
#define OPC_PSRLQ       (0xd3 | P_EXT | P_DATA16)
#define OPC_PSUBB       (0xf8 | P_EXT | P_DATA16)
#define OPC_PSUBW       (0xf9 | P_EXT | P_DATA16)
#define OPC_PSUBD       (0xfa | P_EXT | P_DATA16)
#define OPC_PSUBQ       (0xfb | P_EXT | P_DATA16)
#define OPC_PSUBSB      (0xe8 | P_EXT | P_DATA16)
#define OPC_PSUBSW      (0xe9 | P_EXT | P_DATA16)
#define OPC_PSUBUB      (0xd8 | P_EXT | P_DATA16)
#define OPC_PSUBUW      (0xd9 | P_EXT | P_DATA16)
#define OPC_PUNPCKLBW   (0x60 | P_EXT | P_DATA16)
#define OPC_PUNPCKLWD   (0x61 | P_EXT | P_DATA16)
#define OPC_PUNPCKLDQ   (0x62 | P_EXT | P_DATA16)
#define OPC_PUNPCKLQDQ  (0x6c | P_EXT | P_DATA16)
#define OPC_PUNPCKHBW   (0x68 | P_EXT | P_DATA16)
#define OPC_PUNPCKHWD   (0x69 | P_EXT | P_DATA16)
#define OPC_PUNPCKHDQ   (0x6a | P_EXT | P_DATA16)
#define OPC_PUNPCKHQDQ  (0x6d | P_EXT | P_DATA16)
#define OPC_PXOR        (0xef | P_EXT | P_DATA16)
#define OPC_POP_r32	(0x58)
#define OPC_POPCNT      (0xb8 | P_EXT | P_SIMDF3)
#define OPC_PUSH_r32	(0x50)
#define OPC_PUSH_Iv	(0x68)
#define OPC_PUSH_Ib	(0x6a)
#define OPC_RET		(0xc3)
#define OPC_SETCC	(0x90 | P_EXT | P_REXB_RM) /* ... plus cc */
#define OPC_SHIFT_1	(0xd1)
#define OPC_SHIFT_Ib	(0xc1)
#define OPC_SHIFT_cl	(0xd3)
#define OPC_SARX        (0xf7 | P_EXT38 | P_SIMDF3)
#define OPC_SHUFPS      (0xc6 | P_EXT)
#define OPC_SHLX        (0xf7 | P_EXT38 | P_DATA16)
#define OPC_SHRX        (0xf7 | P_EXT38 | P_SIMDF2)
#define OPC_SHRD_Ib     (0xac | P_EXT)
#define OPC_TESTL	(0x85)
#define OPC_TZCNT       (0xbc | P_EXT | P_SIMDF3)
#define OPC_UD2         (0x0b | P_EXT)
#define OPC_VPBLENDD    (0x02 | P_EXT3A | P_DATA16)
#define OPC_VPBLENDVB   (0x4c | P_EXT3A | P_DATA16)
#define OPC_VPINSRB     (0x20 | P_EXT3A | P_DATA16)
#define OPC_VPINSRW     (0xc4 | P_EXT | P_DATA16)
#define OPC_VBROADCASTSS (0x18 | P_EXT38 | P_DATA16)
#define OPC_VBROADCASTSD (0x19 | P_EXT38 | P_DATA16)
#define OPC_VPBROADCASTB (0x78 | P_EXT38 | P_DATA16)
#define OPC_VPBROADCASTW (0x79 | P_EXT38 | P_DATA16)
#define OPC_VPBROADCASTD (0x58 | P_EXT38 | P_DATA16)
#define OPC_VPBROADCASTQ (0x59 | P_EXT38 | P_DATA16)
#define OPC_VPERMQ      (0x00 | P_EXT3A | P_DATA16 | P_VEXW)
#define OPC_VPERM2I128  (0x46 | P_EXT3A | P_DATA16 | P_VEXL)
#define OPC_VPROLVD     (0x15 | P_EXT38 | P_DATA16 | P_EVEX)
#define OPC_VPROLVQ     (0x15 | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX)
#define OPC_VPRORVD     (0x14 | P_EXT38 | P_DATA16 | P_EVEX)
#define OPC_VPRORVQ     (0x14 | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX)
#define OPC_VPSHLDW     (0x70 | P_EXT3A | P_DATA16 | P_VEXW | P_EVEX)
#define OPC_VPSHLDD     (0x71 | P_EXT3A | P_DATA16 | P_EVEX)
#define OPC_VPSHLDQ     (0x71 | P_EXT3A | P_DATA16 | P_VEXW | P_EVEX)
#define OPC_VPSHLDVW    (0x70 | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX)
#define OPC_VPSHLDVD    (0x71 | P_EXT38 | P_DATA16 | P_EVEX)
#define OPC_VPSHLDVQ    (0x71 | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX)
#define OPC_VPSHRDVW    (0x72 | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX)
#define OPC_VPSHRDVD    (0x73 | P_EXT38 | P_DATA16 | P_EVEX)
#define OPC_VPSHRDVQ    (0x73 | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX)
#define OPC_VPSLLVW     (0x12 | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX)
#define OPC_VPSLLVD     (0x47 | P_EXT38 | P_DATA16)
#define OPC_VPSLLVQ     (0x47 | P_EXT38 | P_DATA16 | P_VEXW)
#define OPC_VPSRAVW     (0x11 | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX)
#define OPC_VPSRAVD     (0x46 | P_EXT38 | P_DATA16)
#define OPC_VPSRAVQ     (0x46 | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX)
#define OPC_VPSRLVW     (0x10 | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX)
#define OPC_VPSRLVD     (0x45 | P_EXT38 | P_DATA16)
#define OPC_VPSRLVQ     (0x45 | P_EXT38 | P_DATA16 | P_VEXW)
#define OPC_VPTERNLOGQ  (0x25 | P_EXT3A | P_DATA16 | P_VEXW | P_EVEX)
#define OPC_VZEROUPPER  (0x77 | P_EXT)
#define OPC_XCHG_ax_r32	(0x90)
#define OPC_XCHG_EvGv   (0x87)

#define OPC_GRP3_Eb     (0xf6)
#define OPC_GRP3_Ev     (0xf7)
#define OPC_GRP5        (0xff)
#define OPC_GRP14       (0x73 | P_EXT | P_DATA16)

/* Group 1 opcode extensions for 0x80-0x83.
   These are also used as modifiers for OPC_ARITH.  */
#define ARITH_ADD 0
#define ARITH_OR  1
#define ARITH_ADC 2
#define ARITH_SBB 3
#define ARITH_AND 4
#define ARITH_SUB 5
#define ARITH_XOR 6
#define ARITH_CMP 7

/* Group 2 opcode extensions for 0xc0, 0xc1, 0xd0-0xd3.  */
#define SHIFT_ROL 0
#define SHIFT_ROR 1
#define SHIFT_SHL 4
#define SHIFT_SHR 5
#define SHIFT_SAR 7

/* Group 3 opcode extensions for 0xf6, 0xf7.  To be used with OPC_GRP3.  */
#define EXT3_TESTi 0
#define EXT3_NOT   2
#define EXT3_NEG   3
#define EXT3_MUL   4
#define EXT3_IMUL  5
#define EXT3_DIV   6
#define EXT3_IDIV  7

/* Group 5 opcode extensions for 0xff.  To be used with OPC_GRP5.  */
#define EXT5_INC_Ev	0
#define EXT5_DEC_Ev	1
#define EXT5_CALLN_Ev	2
#define EXT5_JMPN_Ev	4

/* Condition codes to be added to OPC_JCC_{long,short}.  */
#define JCC_JMP (-1)
#define JCC_JO  0x0
#define JCC_JNO 0x1
#define JCC_JB  0x2
#define JCC_JAE 0x3
#define JCC_JE  0x4
#define JCC_JNE 0x5
#define JCC_JBE 0x6
#define JCC_JA  0x7
#define JCC_JS  0x8
#define JCC_JNS 0x9
#define JCC_JP  0xa
#define JCC_JNP 0xb
#define JCC_JL  0xc
#define JCC_JGE 0xd
#define JCC_JLE 0xe
#define JCC_JG  0xf

static const uint8_t tcg_cond_to_jcc[] = {
    [TCG_COND_EQ] = JCC_JE,
    [TCG_COND_NE] = JCC_JNE,
    [TCG_COND_LT] = JCC_JL,
    [TCG_COND_GE] = JCC_JGE,
    [TCG_COND_LE] = JCC_JLE,
    [TCG_COND_GT] = JCC_JG,
    [TCG_COND_LTU] = JCC_JB,
    [TCG_COND_GEU] = JCC_JAE,
    [TCG_COND_LEU] = JCC_JBE,
    [TCG_COND_GTU] = JCC_JA,
};

#if TCG_TARGET_REG_BITS == 64
static void tcg_out_opc(TCGContext *s, int opc, int r, int rm, int x)
{
    int rex;

    if (opc & P_GS) {
        tcg_out8(s, 0x65);
    }
    if (opc & P_DATA16) {
        /* We should never be asking for both 16 and 64-bit operation.  */
        tcg_debug_assert((opc & P_REXW) == 0);
        tcg_out8(s, 0x66);
    }
    if (opc & P_SIMDF3) {
        tcg_out8(s, 0xf3);
    } else if (opc & P_SIMDF2) {
        tcg_out8(s, 0xf2);
    }

    rex = 0;
    rex |= (opc & P_REXW) ? 0x8 : 0x0;  /* REX.W */
    rex |= (r & 8) >> 1;                /* REX.R */
    rex |= (x & 8) >> 2;                /* REX.X */
    rex |= (rm & 8) >> 3;               /* REX.B */

    /* P_REXB_{R,RM} indicates that the given register is the low byte.
       For %[abcd]l we need no REX prefix, but for %{si,di,bp,sp}l we do,
       as otherwise the encoding indicates %[abcd]h.  Note that the values
       that are ORed in merely indicate that the REX byte must be present;
       those bits get discarded in output.  */
    rex |= opc & (r >= 4 ? P_REXB_R : 0);
    rex |= opc & (rm >= 4 ? P_REXB_RM : 0);

    if (rex) {
        tcg_out8(s, (uint8_t)(rex | 0x40));
    }

    if (opc & (P_EXT | P_EXT38 | P_EXT3A)) {
        tcg_out8(s, 0x0f);
        if (opc & P_EXT38) {
            tcg_out8(s, 0x38);
        } else if (opc & P_EXT3A) {
            tcg_out8(s, 0x3a);
        }
    }

    tcg_out8(s, opc);
}
#else
static void tcg_out_opc(TCGContext *s, int opc)
{
    if (opc & P_DATA16) {
        tcg_out8(s, 0x66);
    }
    if (opc & P_SIMDF3) {
        tcg_out8(s, 0xf3);
    } else if (opc & P_SIMDF2) {
        tcg_out8(s, 0xf2);
    }
    if (opc & (P_EXT | P_EXT38 | P_EXT3A)) {
        tcg_out8(s, 0x0f);
        if (opc & P_EXT38) {
            tcg_out8(s, 0x38);
        } else if (opc & P_EXT3A) {
            tcg_out8(s, 0x3a);
        }
    }
    tcg_out8(s, opc);
}
/* Discard the register arguments to tcg_out_opc early, so as not to penalize
   the 32-bit compilation paths.  This method works with all versions of gcc,
   whereas relying on optimization may not reliably eliminate them.  */
#define tcg_out_opc(s, opc, r, rm, x)  (tcg_out_opc)(s, opc)
#endif

static void tcg_out_modrm(TCGContext *s, int opc, int r, int rm)
{
    tcg_out_opc(s, opc, r, rm, 0);
    tcg_out8(s, 0xc0 | (LOWREGMASK(r) << 3) | LOWREGMASK(rm));
}
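
/*
 * Worked example (illustrative): tcg_out_modrm(s, OPC_MOVL_GvEv + P_REXW,
 * TCG_REG_R8, TCG_REG_RAX) emits 4c 8b c0, i.e. REX 0x40|W|R = 0x4c,
 * opcode 0x8b, then ModRM 0xc0 with both register fields reduced mod 8
 * by LOWREGMASK -- "movq %rax, %r8".
 */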

static void tcg_out_vex_opc(TCGContext *s, int opc, int r, int v,
                            int rm, int index)
{
    int tmp;

    /* Use the two byte form if possible, which cannot encode
       VEX.W, VEX.B, VEX.X, or an m-mmmm field other than P_EXT.  */
    if ((opc & (P_EXT | P_EXT38 | P_EXT3A | P_VEXW)) == P_EXT
        && ((rm | index) & 8) == 0) {
        /* Two byte VEX prefix.  */
        tcg_out8(s, 0xc5);

        tmp = (r & 8 ? 0 : 0x80);              /* VEX.R */
    } else {
        /* Three byte VEX prefix.  */
        tcg_out8(s, 0xc4);

        /* VEX.m-mmmm */
        if (opc & P_EXT3A) {
            tmp = 3;
        } else if (opc & P_EXT38) {
            tmp = 2;
        } else if (opc & P_EXT) {
            tmp = 1;
        } else {
            g_assert_not_reached();
        }
        tmp |= (r & 8 ? 0 : 0x80);             /* VEX.R */
        tmp |= (index & 8 ? 0 : 0x40);         /* VEX.X */
        tmp |= (rm & 8 ? 0 : 0x20);            /* VEX.B */
        tcg_out8(s, tmp);

        tmp = (opc & P_VEXW ? 0x80 : 0);       /* VEX.W */
    }

    tmp |= (opc & P_VEXL ? 0x04 : 0);      /* VEX.L */
    /* VEX.pp */
    if (opc & P_DATA16) {
        tmp |= 1;                          /* 0x66 */
    } else if (opc & P_SIMDF3) {
        tmp |= 2;                          /* 0xf3 */
    } else if (opc & P_SIMDF2) {
        tmp |= 3;                          /* 0xf2 */
    }
    tmp |= (~v & 15) << 3;                 /* VEX.vvvv */
    tcg_out8(s, tmp);
    tcg_out8(s, opc);
}

static void tcg_out_evex_opc(TCGContext *s, int opc, int r, int v,
                             int rm, int index)
{
    /* The entire 4-byte evex prefix; with R' and V' set. */
    uint32_t p = 0x08041062;
    int mm, pp;

    tcg_debug_assert(have_avx512vl);

    /* EVEX.mm */
    if (opc & P_EXT3A) {
        mm = 3;
    } else if (opc & P_EXT38) {
        mm = 2;
    } else if (opc & P_EXT) {
        mm = 1;
    } else {
        g_assert_not_reached();
    }

    /* EVEX.pp */
    if (opc & P_DATA16) {
        pp = 1;                          /* 0x66 */
    } else if (opc & P_SIMDF3) {
        pp = 2;                          /* 0xf3 */
    } else if (opc & P_SIMDF2) {
        pp = 3;                          /* 0xf2 */
    } else {
        pp = 0;
    }

    p = deposit32(p, 8, 2, mm);
    p = deposit32(p, 13, 1, (rm & 8) == 0);             /* EVEX.RXB.B */
    p = deposit32(p, 14, 1, (index & 8) == 0);          /* EVEX.RXB.X */
    p = deposit32(p, 15, 1, (r & 8) == 0);              /* EVEX.RXB.R */
    p = deposit32(p, 16, 2, pp);
    p = deposit32(p, 19, 4, ~v);
    p = deposit32(p, 23, 1, (opc & P_VEXW) != 0);
    p = deposit32(p, 29, 2, (opc & P_VEXL) != 0);

    tcg_out32(s, p);
    tcg_out8(s, opc);
}

static void tcg_out_vex_modrm(TCGContext *s, int opc, int r, int v, int rm)
{
    if (opc & P_EVEX) {
        tcg_out_evex_opc(s, opc, r, v, rm, 0);
    } else {
        tcg_out_vex_opc(s, opc, r, v, rm, 0);
    }
    tcg_out8(s, 0xc0 | (LOWREGMASK(r) << 3) | LOWREGMASK(rm));
}
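
/*
 * Worked example (illustrative): tcg_out_vex_modrm(s, OPC_PXOR, 0, 0, 0)
 * qualifies for the two-byte form and emits c5 f9 ef c0, i.e.
 * "vpxor %xmm0, %xmm0, %xmm0" with pp = 1 (0x66) and vvvv = ~0.
 */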

/* Output an opcode with a full "rm + (index<<shift) + offset" address mode.
   We handle RM or INDEX missing with a negative value.  In 64-bit mode for
   absolute addresses, ~RM is the size of the immediate operand that will
   follow the instruction.  */

static void tcg_out_sib_offset(TCGContext *s, int r, int rm, int index,
                               int shift, intptr_t offset)
{
    int mod, len;

    if (index < 0 && rm < 0) {
        if (TCG_TARGET_REG_BITS == 64) {
            /* Try for a rip-relative addressing mode.  This has replaced
               the 32-bit-mode absolute addressing encoding.  */
            intptr_t pc = (intptr_t)s->code_ptr + 5 + ~rm;
            intptr_t disp = offset - pc;
            if (disp == (int32_t)disp) {
                tcg_out8(s, (LOWREGMASK(r) << 3) | 5);
                tcg_out32(s, disp);
                return;
            }

            /* Try for an absolute address encoding.  This requires the
               use of the MODRM+SIB encoding and is therefore larger than
               rip-relative addressing.  */
            if (offset == (int32_t)offset) {
                tcg_out8(s, (LOWREGMASK(r) << 3) | 4);
                tcg_out8(s, (4 << 3) | 5);
                tcg_out32(s, offset);
                return;
            }

            /* ??? The memory isn't directly addressable.  */
            g_assert_not_reached();
        } else {
            /* Absolute address.  */
            tcg_out8(s, (r << 3) | 5);
            tcg_out32(s, offset);
            return;
        }
    }

    /* Find the length of the immediate addend.  Note that the encoding
       that would be used for (%ebp) indicates absolute addressing.  */
    if (rm < 0) {
        mod = 0, len = 4, rm = 5;
    } else if (offset == 0 && LOWREGMASK(rm) != TCG_REG_EBP) {
        mod = 0, len = 0;
    } else if (offset == (int8_t)offset) {
        mod = 0x40, len = 1;
    } else {
        mod = 0x80, len = 4;
    }

    /* Use a single byte MODRM format if possible.  Note that the encoding
       that would be used for %esp is the escape to the two byte form.  */
    if (index < 0 && LOWREGMASK(rm) != TCG_REG_ESP) {
        /* Single byte MODRM format.  */
        tcg_out8(s, mod | (LOWREGMASK(r) << 3) | LOWREGMASK(rm));
    } else {
        /* Two byte MODRM+SIB format.  */

        /* Note that the encoding that would place %esp into the index
           field indicates no index register.  In 64-bit mode, the REX.X
           bit counts, so %r12 can be used as the index.  */
        if (index < 0) {
            index = 4;
        } else {
            tcg_debug_assert(index != TCG_REG_ESP);
        }

        tcg_out8(s, mod | (LOWREGMASK(r) << 3) | 4);
        tcg_out8(s, (shift << 6) | (LOWREGMASK(index) << 3) | LOWREGMASK(rm));
    }

    if (len == 1) {
        tcg_out8(s, offset);
    } else if (len == 4) {
        tcg_out32(s, offset);
    }
}
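
/*
 * Worked example (illustrative): r = %eax, rm = %ebx, index = %esi,
 * shift = 2, offset = 0x40 produces ModRM 0x44 (mod = 01, r/m = 4, the
 * SIB escape), SIB 0xb3 (scale 4, index %esi, base %ebx) and disp8 0x40;
 * with OPC_MOVL_GvEv that is "movl 0x40(%ebx,%esi,4), %eax".
 */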

static void tcg_out_modrm_sib_offset(TCGContext *s, int opc, int r, int rm,
                                     int index, int shift, intptr_t offset)
{
    tcg_out_opc(s, opc, r, rm < 0 ? 0 : rm, index < 0 ? 0 : index);
    tcg_out_sib_offset(s, r, rm, index, shift, offset);
}

static void tcg_out_vex_modrm_sib_offset(TCGContext *s, int opc, int r, int v,
                                         int rm, int index, int shift,
                                         intptr_t offset)
{
    tcg_out_vex_opc(s, opc, r, v, rm < 0 ? 0 : rm, index < 0 ? 0 : index);
    tcg_out_sib_offset(s, r, rm, index, shift, offset);
}

/* A simplification of the above with no index or shift.  */
static inline void tcg_out_modrm_offset(TCGContext *s, int opc, int r,
                                        int rm, intptr_t offset)
{
    tcg_out_modrm_sib_offset(s, opc, r, rm, -1, 0, offset);
}

static inline void tcg_out_vex_modrm_offset(TCGContext *s, int opc, int r,
                                            int v, int rm, intptr_t offset)
{
    tcg_out_vex_modrm_sib_offset(s, opc, r, v, rm, -1, 0, offset);
}

/* Output an opcode with an expected reference to the constant pool.  */
static inline void tcg_out_modrm_pool(TCGContext *s, int opc, int r)
{
    tcg_out_opc(s, opc, r, 0, 0);
    /* Absolute for 32-bit, pc-relative for 64-bit.  */
    tcg_out8(s, LOWREGMASK(r) << 3 | 5);
    tcg_out32(s, 0);
}

/* Output an opcode with an expected reference to the constant pool.  */
static inline void tcg_out_vex_modrm_pool(TCGContext *s, int opc, int r)
{
    tcg_out_vex_opc(s, opc, r, 0, 0, 0);
    /* Absolute for 32-bit, pc-relative for 64-bit.  */
    tcg_out8(s, LOWREGMASK(r) << 3 | 5);
    tcg_out32(s, 0);
}

/* Generate dest op= src.  Uses the same ARITH_* codes as tgen_arithi.  */
static inline void tgen_arithr(TCGContext *s, int subop, int dest, int src)
{
    /* Propagate an opcode prefix, such as P_REXW.  */
    int ext = subop & ~0x7;
    subop &= 0x7;

    tcg_out_modrm(s, OPC_ARITH_GvEv + (subop << 3) + ext, dest, src);
}
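
/*
 * Illustrative example: tgen_arithr(s, ARITH_SUB + P_REXW, TCG_REG_RAX,
 * TCG_REG_RDX) forms opcode 0x2b (0x03 | 5 << 3) and emits 48 2b c2,
 * i.e. "subq %rdx, %rax".
 */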

static bool tcg_out_mov(TCGContext *s, TCGType type, TCGReg ret, TCGReg arg)
{
    int rexw = 0;

    if (arg == ret) {
        return true;
    }
    switch (type) {
    case TCG_TYPE_I64:
        rexw = P_REXW;
        /* fallthru */
    case TCG_TYPE_I32:
        if (ret < 16) {
            if (arg < 16) {
                tcg_out_modrm(s, OPC_MOVL_GvEv + rexw, ret, arg);
            } else {
                tcg_out_vex_modrm(s, OPC_MOVD_EyVy + rexw, arg, 0, ret);
            }
        } else {
            if (arg < 16) {
                tcg_out_vex_modrm(s, OPC_MOVD_VyEy + rexw, ret, 0, arg);
            } else {
                tcg_out_vex_modrm(s, OPC_MOVQ_VqWq, ret, 0, arg);
            }
        }
        break;

    case TCG_TYPE_V64:
        tcg_debug_assert(ret >= 16 && arg >= 16);
        tcg_out_vex_modrm(s, OPC_MOVQ_VqWq, ret, 0, arg);
        break;
    case TCG_TYPE_V128:
        tcg_debug_assert(ret >= 16 && arg >= 16);
        tcg_out_vex_modrm(s, OPC_MOVDQA_VxWx, ret, 0, arg);
        break;
    case TCG_TYPE_V256:
        tcg_debug_assert(ret >= 16 && arg >= 16);
        tcg_out_vex_modrm(s, OPC_MOVDQA_VxWx | P_VEXL, ret, 0, arg);
        break;

    default:
        g_assert_not_reached();
    }
    return true;
}

static const int avx2_dup_insn[4] = {
    OPC_VPBROADCASTB, OPC_VPBROADCASTW,
    OPC_VPBROADCASTD, OPC_VPBROADCASTQ,
};

static bool tcg_out_dup_vec(TCGContext *s, TCGType type, unsigned vece,
                            TCGReg r, TCGReg a)
{
    if (have_avx2) {
        int vex_l = (type == TCG_TYPE_V256 ? P_VEXL : 0);
        tcg_out_vex_modrm(s, avx2_dup_insn[vece] + vex_l, r, 0, a);
    } else {
        switch (vece) {
        case MO_8:
            /* ??? With zero in a register, use PSHUFB.  */
            tcg_out_vex_modrm(s, OPC_PUNPCKLBW, r, a, a);
            a = r;
            /* FALLTHRU */
        case MO_16:
            tcg_out_vex_modrm(s, OPC_PUNPCKLWD, r, a, a);
            a = r;
            /* FALLTHRU */
        case MO_32:
            tcg_out_vex_modrm(s, OPC_PSHUFD, r, 0, a);
            /* imm8 operand: all output lanes selected from input lane 0.  */
            tcg_out8(s, 0);
            break;
        case MO_64:
            tcg_out_vex_modrm(s, OPC_PUNPCKLQDQ, r, a, a);
            break;
        default:
            g_assert_not_reached();
        }
    }
    return true;
}

static bool tcg_out_dupm_vec(TCGContext *s, TCGType type, unsigned vece,
                             TCGReg r, TCGReg base, intptr_t offset)
{
    if (have_avx2) {
        int vex_l = (type == TCG_TYPE_V256 ? P_VEXL : 0);
        tcg_out_vex_modrm_offset(s, avx2_dup_insn[vece] + vex_l,
                                 r, 0, base, offset);
    } else {
        switch (vece) {
        case MO_64:
            tcg_out_vex_modrm_offset(s, OPC_MOVDDUP, r, 0, base, offset);
            break;
        case MO_32:
            tcg_out_vex_modrm_offset(s, OPC_VBROADCASTSS, r, 0, base, offset);
            break;
        case MO_16:
            tcg_out_vex_modrm_offset(s, OPC_VPINSRW, r, r, base, offset);
            tcg_out8(s, 0); /* imm8 */
            tcg_out_dup_vec(s, type, vece, r, r);
            break;
        case MO_8:
            tcg_out_vex_modrm_offset(s, OPC_VPINSRB, r, r, base, offset);
            tcg_out8(s, 0); /* imm8 */
            tcg_out_dup_vec(s, type, vece, r, r);
            break;
        default:
            g_assert_not_reached();
        }
    }
    return true;
}

static void tcg_out_dupi_vec(TCGContext *s, TCGType type, unsigned vece,
                             TCGReg ret, int64_t arg)
{
    int vex_l = (type == TCG_TYPE_V256 ? P_VEXL : 0);

    if (arg == 0) {
        tcg_out_vex_modrm(s, OPC_PXOR, ret, ret, ret);
        return;
    }
    if (arg == -1) {
        tcg_out_vex_modrm(s, OPC_PCMPEQB + vex_l, ret, ret, ret);
        return;
    }

    if (TCG_TARGET_REG_BITS == 32 && vece < MO_64) {
        if (have_avx2) {
            tcg_out_vex_modrm_pool(s, OPC_VPBROADCASTD + vex_l, ret);
        } else {
            tcg_out_vex_modrm_pool(s, OPC_VBROADCASTSS, ret);
        }
        new_pool_label(s, arg, R_386_32, s->code_ptr - 4, 0);
    } else {
        if (type == TCG_TYPE_V64) {
            tcg_out_vex_modrm_pool(s, OPC_MOVQ_VqWq, ret);
        } else if (have_avx2) {
            tcg_out_vex_modrm_pool(s, OPC_VPBROADCASTQ + vex_l, ret);
        } else {
            tcg_out_vex_modrm_pool(s, OPC_MOVDDUP, ret);
        }
        if (TCG_TARGET_REG_BITS == 64) {
            new_pool_label(s, arg, R_386_PC32, s->code_ptr - 4, -4);
        } else {
            new_pool_l2(s, R_386_32, s->code_ptr - 4, 0, arg, arg >> 32);
        }
    }
}

static void tcg_out_movi_vec(TCGContext *s, TCGType type,
                             TCGReg ret, tcg_target_long arg)
{
    if (arg == 0) {
        tcg_out_vex_modrm(s, OPC_PXOR, ret, ret, ret);
        return;
    }
    if (arg == -1) {
        tcg_out_vex_modrm(s, OPC_PCMPEQB, ret, ret, ret);
        return;
    }

    int rexw = (type == TCG_TYPE_I32 ? 0 : P_REXW);
    tcg_out_vex_modrm_pool(s, OPC_MOVD_VyEy + rexw, ret);
    if (TCG_TARGET_REG_BITS == 64) {
        new_pool_label(s, arg, R_386_PC32, s->code_ptr - 4, -4);
    } else {
        new_pool_label(s, arg, R_386_32, s->code_ptr - 4, 0);
    }
}

static void tcg_out_movi_int(TCGContext *s, TCGType type,
                             TCGReg ret, tcg_target_long arg)
{
    tcg_target_long diff;

    if (arg == 0) {
        tgen_arithr(s, ARITH_XOR, ret, ret);
        return;
    }
    if (arg == (uint32_t)arg || type == TCG_TYPE_I32) {
        tcg_out_opc(s, OPC_MOVL_Iv + LOWREGMASK(ret), 0, ret, 0);
        tcg_out32(s, arg);
        return;
    }
    if (arg == (int32_t)arg) {
        tcg_out_modrm(s, OPC_MOVL_EvIz + P_REXW, 0, ret);
        tcg_out32(s, arg);
        return;
    }

    /* Try a 7-byte pc-relative lea before the 10-byte movq.  */
    diff = tcg_pcrel_diff(s, (const void *)arg) - 7;
    if (diff == (int32_t)diff) {
        tcg_out_opc(s, OPC_LEA | P_REXW, ret, 0, 0);
        tcg_out8(s, (LOWREGMASK(ret) << 3) | 5);
        tcg_out32(s, diff);
        return;
    }

    tcg_out_opc(s, OPC_MOVL_Iv + P_REXW + LOWREGMASK(ret), 0, ret, 0);
    tcg_out64(s, arg);
}
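
/*
 * Summary of the encodings chosen above (illustrative): the xor is 2-3
 * bytes, movl $imm32 is 5-6 bytes, the sign-extended movq $imm32 (c7 /0)
 * is 7 bytes, the pc-relative lea is 7 bytes, and the full movabs is 10.
 */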

static void tcg_out_movi(TCGContext *s, TCGType type,
                         TCGReg ret, tcg_target_long arg)
{
    switch (type) {
    case TCG_TYPE_I32:
#if TCG_TARGET_REG_BITS == 64
    case TCG_TYPE_I64:
#endif
        if (ret < 16) {
            tcg_out_movi_int(s, type, ret, arg);
        } else {
            tcg_out_movi_vec(s, type, ret, arg);
        }
        break;
    default:
        g_assert_not_reached();
    }
}

static bool tcg_out_xchg(TCGContext *s, TCGType type, TCGReg r1, TCGReg r2)
{
    int rexw = type == TCG_TYPE_I32 ? 0 : P_REXW;
    tcg_out_modrm(s, OPC_XCHG_EvGv + rexw, r1, r2);
    return true;
}

static void tcg_out_addi_ptr(TCGContext *s, TCGReg rd, TCGReg rs,
                             tcg_target_long imm)
{
    /* This function is only used for passing structs by reference. */
    tcg_debug_assert(imm == (int32_t)imm);
    tcg_out_modrm_offset(s, OPC_LEA, rd, rs, imm);
}

static inline void tcg_out_pushi(TCGContext *s, tcg_target_long val)
{
    if (val == (int8_t)val) {
        tcg_out_opc(s, OPC_PUSH_Ib, 0, 0, 0);
        tcg_out8(s, val);
    } else if (val == (int32_t)val) {
        tcg_out_opc(s, OPC_PUSH_Iv, 0, 0, 0);
        tcg_out32(s, val);
    } else {
        g_assert_not_reached();
    }
}

static inline void tcg_out_mb(TCGContext *s, TCGArg a0)
{
    /* Given the strength of x86 memory ordering, we only need to care
       about store-load ordering.  Experimentally, "lock orl $0,0(%esp)"
       is faster than "mfence", so don't bother with the SSE insn.  */
    if (a0 & TCG_MO_ST_LD) {
        tcg_out8(s, 0xf0);
        tcg_out_modrm_offset(s, OPC_ARITH_EvIb, ARITH_OR, TCG_REG_ESP, 0);
        tcg_out8(s, 0);
    }
}
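
/*
 * The sequence above assembles to f0 83 0c 24 00 (illustrative): the
 * 0xf0 lock prefix, 0x83 /1 with the ModRM/SIB escape for (%esp), and
 * the zero immediate byte.
 */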

static inline void tcg_out_push(TCGContext *s, int reg)
{
    tcg_out_opc(s, OPC_PUSH_r32 + LOWREGMASK(reg), 0, reg, 0);
}

static inline void tcg_out_pop(TCGContext *s, int reg)
{
    tcg_out_opc(s, OPC_POP_r32 + LOWREGMASK(reg), 0, reg, 0);
}

static void tcg_out_ld(TCGContext *s, TCGType type, TCGReg ret,
                       TCGReg arg1, intptr_t arg2)
{
    switch (type) {
    case TCG_TYPE_I32:
        if (ret < 16) {
            tcg_out_modrm_offset(s, OPC_MOVL_GvEv, ret, arg1, arg2);
        } else {
            tcg_out_vex_modrm_offset(s, OPC_MOVD_VyEy, ret, 0, arg1, arg2);
        }
        break;
    case TCG_TYPE_I64:
        if (ret < 16) {
            tcg_out_modrm_offset(s, OPC_MOVL_GvEv | P_REXW, ret, arg1, arg2);
            break;
        }
        /* FALLTHRU */
    case TCG_TYPE_V64:
        /* There is no instruction that can validate 8-byte alignment.  */
        tcg_debug_assert(ret >= 16);
        tcg_out_vex_modrm_offset(s, OPC_MOVQ_VqWq, ret, 0, arg1, arg2);
        break;
    case TCG_TYPE_V128:
        /*
         * The gvec infrastructure asserts that v128 vector loads
         * and stores use a 16-byte aligned offset.  Validate that the
         * final pointer is aligned by using an insn that will SIGSEGV.
         */
        tcg_debug_assert(ret >= 16);
        tcg_out_vex_modrm_offset(s, OPC_MOVDQA_VxWx, ret, 0, arg1, arg2);
        break;
    case TCG_TYPE_V256:
        /*
         * The gvec infrastructure only requires 16-byte alignment,
         * so here we must use an unaligned load.
         */
        tcg_debug_assert(ret >= 16);
        tcg_out_vex_modrm_offset(s, OPC_MOVDQU_VxWx | P_VEXL,
                                 ret, 0, arg1, arg2);
        break;
    default:
        g_assert_not_reached();
    }
}

static void tcg_out_st(TCGContext *s, TCGType type, TCGReg arg,
                       TCGReg arg1, intptr_t arg2)
{
    switch (type) {
    case TCG_TYPE_I32:
        if (arg < 16) {
            tcg_out_modrm_offset(s, OPC_MOVL_EvGv, arg, arg1, arg2);
        } else {
            tcg_out_vex_modrm_offset(s, OPC_MOVD_EyVy, arg, 0, arg1, arg2);
        }
        break;
    case TCG_TYPE_I64:
        if (arg < 16) {
            tcg_out_modrm_offset(s, OPC_MOVL_EvGv | P_REXW, arg, arg1, arg2);
            break;
        }
        /* FALLTHRU */
    case TCG_TYPE_V64:
        /* There is no instruction that can validate 8-byte alignment.  */
        tcg_debug_assert(arg >= 16);
        tcg_out_vex_modrm_offset(s, OPC_MOVQ_WqVq, arg, 0, arg1, arg2);
        break;
    case TCG_TYPE_V128:
        /*
         * The gvec infrastructure asserts that v128 vector loads
         * and stores use a 16-byte aligned offset.  Validate that the
         * final pointer is aligned by using an insn that will SIGSEGV.
         *
         * This specific instance is also used by TCG_CALL_RET_BY_VEC,
         * for _WIN64, which must have SSE2 but may not have AVX.
         */
        tcg_debug_assert(arg >= 16);
        if (have_avx1) {
            tcg_out_vex_modrm_offset(s, OPC_MOVDQA_WxVx, arg, 0, arg1, arg2);
        } else {
            tcg_out_modrm_offset(s, OPC_MOVDQA_WxVx, arg, arg1, arg2);
        }
        break;
    case TCG_TYPE_V256:
        /*
         * The gvec infrastructure only requires 16-byte alignment,
         * so here we must use an unaligned store.
         */
        tcg_debug_assert(arg >= 16);
        tcg_out_vex_modrm_offset(s, OPC_MOVDQU_WxVx | P_VEXL,
                                 arg, 0, arg1, arg2);
        break;
    default:
        g_assert_not_reached();
    }
}

static bool tcg_out_sti(TCGContext *s, TCGType type, TCGArg val,
                        TCGReg base, intptr_t ofs)
{
    int rexw = 0;
    if (TCG_TARGET_REG_BITS == 64 && type == TCG_TYPE_I64) {
        if (val != (int32_t)val) {
            return false;
        }
        rexw = P_REXW;
    } else if (type != TCG_TYPE_I32) {
        return false;
    }
    tcg_out_modrm_offset(s, OPC_MOVL_EvIz | rexw, 0, base, ofs);
    tcg_out32(s, val);
    return true;
}

static void tcg_out_shifti(TCGContext *s, int subopc, int reg, int count)
{
    /* Propagate an opcode prefix, such as P_DATA16.  */
    int ext = subopc & ~0x7;
    subopc &= 0x7;

    if (count == 1) {
        tcg_out_modrm(s, OPC_SHIFT_1 + ext, subopc, reg);
    } else {
        tcg_out_modrm(s, OPC_SHIFT_Ib + ext, subopc, reg);
        tcg_out8(s, count);
    }
}

static inline void tcg_out_bswap32(TCGContext *s, int reg)
{
    tcg_out_opc(s, OPC_BSWAP + LOWREGMASK(reg), 0, reg, 0);
}

static inline void tcg_out_rolw_8(TCGContext *s, int reg)
{
    tcg_out_shifti(s, SHIFT_ROL + P_DATA16, reg, 8);
}

static void tcg_out_ext8u(TCGContext *s, TCGReg dest, TCGReg src)
{
    /* movzbl */
    tcg_debug_assert(src < 4 || TCG_TARGET_REG_BITS == 64);
    tcg_out_modrm(s, OPC_MOVZBL + P_REXB_RM, dest, src);
}

static void tcg_out_ext8s(TCGContext *s, TCGType type, TCGReg dest, TCGReg src)
{
    int rexw = type == TCG_TYPE_I32 ? 0 : P_REXW;
    /* movsbl */
    tcg_debug_assert(src < 4 || TCG_TARGET_REG_BITS == 64);
    tcg_out_modrm(s, OPC_MOVSBL + P_REXB_RM + rexw, dest, src);
}

static void tcg_out_ext16u(TCGContext *s, TCGReg dest, TCGReg src)
{
    /* movzwl */
    tcg_out_modrm(s, OPC_MOVZWL, dest, src);
}

static void tcg_out_ext16s(TCGContext *s, TCGType type, TCGReg dest, TCGReg src)
{
    int rexw = type == TCG_TYPE_I32 ? 0 : P_REXW;
    /* movsw[lq] */
    tcg_out_modrm(s, OPC_MOVSWL + rexw, dest, src);
}

static void tcg_out_ext32u(TCGContext *s, TCGReg dest, TCGReg src)
{
    /* 32-bit mov zero extends.  */
    tcg_out_modrm(s, OPC_MOVL_GvEv, dest, src);
}

static void tcg_out_ext32s(TCGContext *s, TCGReg dest, TCGReg src)
{
    tcg_debug_assert(TCG_TARGET_REG_BITS == 64);
    tcg_out_modrm(s, OPC_MOVSLQ, dest, src);
}

static void tcg_out_exts_i32_i64(TCGContext *s, TCGReg dest, TCGReg src)
{
    tcg_out_ext32s(s, dest, src);
}

static void tcg_out_extu_i32_i64(TCGContext *s, TCGReg dest, TCGReg src)
{
    tcg_out_ext32u(s, dest, src);
}

static void tcg_out_extrl_i64_i32(TCGContext *s, TCGReg dest, TCGReg src)
{
    tcg_out_ext32u(s, dest, src);
}

static inline void tcg_out_bswap64(TCGContext *s, int reg)
{
    tcg_out_opc(s, OPC_BSWAP + P_REXW + LOWREGMASK(reg), 0, reg, 0);
}

static void tgen_arithi(TCGContext *s, int c, int r0,
                        tcg_target_long val, int cf)
{
    int rexw = 0;

    if (TCG_TARGET_REG_BITS == 64) {
        rexw = c & -8;
        c &= 7;
    }

    /* ??? While INC is 2 bytes shorter than ADDL $1, it also induces
       partial-flags-update stalls on the Pentium 4 and is not recommended
       by current Intel optimization manuals.  */
    if (!cf && (c == ARITH_ADD || c == ARITH_SUB) && (val == 1 || val == -1)) {
        int is_inc = (c == ARITH_ADD) ^ (val < 0);
        if (TCG_TARGET_REG_BITS == 64) {
            /* The single-byte increment encodings are re-tasked as the
               REX prefixes.  Use the MODRM encoding.  */
            tcg_out_modrm(s, OPC_GRP5 + rexw,
                          (is_inc ? EXT5_INC_Ev : EXT5_DEC_Ev), r0);
        } else {
            tcg_out8(s, (is_inc ? OPC_INC_r32 : OPC_DEC_r32) + r0);
        }
        return;
    }

    if (c == ARITH_AND) {
        if (TCG_TARGET_REG_BITS == 64) {
            if (val == 0xffffffffu) {
                tcg_out_ext32u(s, r0, r0);
                return;
            }
            if (val == (uint32_t)val) {
                /* AND with no high bits set can use a 32-bit operation.  */
                rexw = 0;
            }
        }
        if (val == 0xffu && (r0 < 4 || TCG_TARGET_REG_BITS == 64)) {
            tcg_out_ext8u(s, r0, r0);
            return;
        }
        if (val == 0xffffu) {
            tcg_out_ext16u(s, r0, r0);
            return;
        }
    }

    if (val == (int8_t)val) {
        tcg_out_modrm(s, OPC_ARITH_EvIb + rexw, c, r0);
        tcg_out8(s, val);
        return;
    }
    if (rexw == 0 || val == (int32_t)val) {
        tcg_out_modrm(s, OPC_ARITH_EvIz + rexw, c, r0);
        tcg_out32(s, val);
        return;
    }

    g_assert_not_reached();
}

static void tcg_out_addi(TCGContext *s, int reg, tcg_target_long val)
{
    if (val != 0) {
        tgen_arithi(s, ARITH_ADD + P_REXW, reg, val, 0);
    }
}

/* Set SMALL to force a short forward branch.  */
static void tcg_out_jxx(TCGContext *s, int opc, TCGLabel *l, bool small)
{
    int32_t val, val1;

    if (l->has_value) {
        val = tcg_pcrel_diff(s, l->u.value_ptr);
        val1 = val - 2;
        if ((int8_t)val1 == val1) {
            if (opc == -1) {
                tcg_out8(s, OPC_JMP_short);
            } else {
                tcg_out8(s, OPC_JCC_short + opc);
            }
            tcg_out8(s, val1);
        } else {
            tcg_debug_assert(!small);
            if (opc == -1) {
                tcg_out8(s, OPC_JMP_long);
                tcg_out32(s, val - 5);
            } else {
                tcg_out_opc(s, OPC_JCC_long + opc, 0, 0, 0);
                tcg_out32(s, val - 6);
            }
        }
    } else if (small) {
        if (opc == -1) {
            tcg_out8(s, OPC_JMP_short);
        } else {
            tcg_out8(s, OPC_JCC_short + opc);
        }
        tcg_out_reloc(s, s->code_ptr, R_386_PC8, l, -1);
        s->code_ptr += 1;
    } else {
        if (opc == -1) {
            tcg_out8(s, OPC_JMP_long);
        } else {
            tcg_out_opc(s, OPC_JCC_long + opc, 0, 0, 0);
        }
        tcg_out_reloc(s, s->code_ptr, R_386_PC32, l, -4);
        s->code_ptr += 4;
    }
}
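
/*
 * The adjustments above reflect instruction lengths (illustrative):
 * short jumps are 2 bytes (opcode + rel8, hence val - 2), a long jmp is
 * 5 bytes (e9 + rel32, hence val - 5), and a long jcc is 6 bytes
 * (0f 8x + rel32, hence val - 6).
 */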

static void tcg_out_cmp(TCGContext *s, TCGArg arg1, TCGArg arg2,
                        int const_arg2, int rexw)
{
    if (const_arg2) {
        if (arg2 == 0) {
            /* test r, r */
            tcg_out_modrm(s, OPC_TESTL + rexw, arg1, arg1);
        } else {
            tgen_arithi(s, ARITH_CMP + rexw, arg1, arg2, 0);
        }
    } else {
        tgen_arithr(s, ARITH_CMP + rexw, arg1, arg2);
    }
}

static void tcg_out_brcond32(TCGContext *s, TCGCond cond,
                             TCGArg arg1, TCGArg arg2, int const_arg2,
                             TCGLabel *label, int small)
{
    tcg_out_cmp(s, arg1, arg2, const_arg2, 0);
    tcg_out_jxx(s, tcg_cond_to_jcc[cond], label, small);
}

#if TCG_TARGET_REG_BITS == 64
static void tcg_out_brcond64(TCGContext *s, TCGCond cond,
                             TCGArg arg1, TCGArg arg2, int const_arg2,
                             TCGLabel *label, int small)
{
    tcg_out_cmp(s, arg1, arg2, const_arg2, P_REXW);
    tcg_out_jxx(s, tcg_cond_to_jcc[cond], label, small);
}
#else
/* XXX: we implement it at the target level to avoid having to
   handle cross-basic-block temporaries */
static void tcg_out_brcond2(TCGContext *s, const TCGArg *args,
                            const int *const_args, int small)
{
    TCGLabel *label_next = gen_new_label();
    TCGLabel *label_this = arg_label(args[5]);

    switch(args[4]) {
    case TCG_COND_EQ:
        tcg_out_brcond32(s, TCG_COND_NE, args[0], args[2], const_args[2],
                         label_next, 1);
        tcg_out_brcond32(s, TCG_COND_EQ, args[1], args[3], const_args[3],
                         label_this, small);
        break;
    case TCG_COND_NE:
        tcg_out_brcond32(s, TCG_COND_NE, args[0], args[2], const_args[2],
                         label_this, small);
        tcg_out_brcond32(s, TCG_COND_NE, args[1], args[3], const_args[3],
                         label_this, small);
        break;
    case TCG_COND_LT:
        tcg_out_brcond32(s, TCG_COND_LT, args[1], args[3], const_args[3],
                         label_this, small);
        tcg_out_jxx(s, JCC_JNE, label_next, 1);
        tcg_out_brcond32(s, TCG_COND_LTU, args[0], args[2], const_args[2],
                         label_this, small);
        break;
    case TCG_COND_LE:
        tcg_out_brcond32(s, TCG_COND_LT, args[1], args[3], const_args[3],
                         label_this, small);
        tcg_out_jxx(s, JCC_JNE, label_next, 1);
        tcg_out_brcond32(s, TCG_COND_LEU, args[0], args[2], const_args[2],
                         label_this, small);
        break;
    case TCG_COND_GT:
        tcg_out_brcond32(s, TCG_COND_GT, args[1], args[3], const_args[3],
                         label_this, small);
        tcg_out_jxx(s, JCC_JNE, label_next, 1);
        tcg_out_brcond32(s, TCG_COND_GTU, args[0], args[2], const_args[2],
                         label_this, small);
        break;
    case TCG_COND_GE:
        tcg_out_brcond32(s, TCG_COND_GT, args[1], args[3], const_args[3],
                         label_this, small);
        tcg_out_jxx(s, JCC_JNE, label_next, 1);
        tcg_out_brcond32(s, TCG_COND_GEU, args[0], args[2], const_args[2],
                         label_this, small);
        break;
    case TCG_COND_LTU:
        tcg_out_brcond32(s, TCG_COND_LTU, args[1], args[3], const_args[3],
                         label_this, small);
        tcg_out_jxx(s, JCC_JNE, label_next, 1);
        tcg_out_brcond32(s, TCG_COND_LTU, args[0], args[2], const_args[2],
                         label_this, small);
        break;
    case TCG_COND_LEU:
        tcg_out_brcond32(s, TCG_COND_LTU, args[1], args[3], const_args[3],
                         label_this, small);
        tcg_out_jxx(s, JCC_JNE, label_next, 1);
        tcg_out_brcond32(s, TCG_COND_LEU, args[0], args[2], const_args[2],
                         label_this, small);
        break;
    case TCG_COND_GTU:
        tcg_out_brcond32(s, TCG_COND_GTU, args[1], args[3], const_args[3],
                         label_this, small);
        tcg_out_jxx(s, JCC_JNE, label_next, 1);
        tcg_out_brcond32(s, TCG_COND_GTU, args[0], args[2], const_args[2],
                         label_this, small);
        break;
    case TCG_COND_GEU:
        tcg_out_brcond32(s, TCG_COND_GTU, args[1], args[3], const_args[3],
                         label_this, small);
        tcg_out_jxx(s, JCC_JNE, label_next, 1);
        tcg_out_brcond32(s, TCG_COND_GEU, args[0], args[2], const_args[2],
                         label_this, small);
        break;
    default:
        g_assert_not_reached();
    }
    tcg_out_label(s, label_next);
}
#endif

static void tcg_out_setcond32(TCGContext *s, TCGCond cond, TCGArg dest,
                              TCGArg arg1, TCGArg arg2, int const_arg2)
{
    tcg_out_cmp(s, arg1, arg2, const_arg2, 0);
    tcg_out_modrm(s, OPC_SETCC | tcg_cond_to_jcc[cond], 0, dest);
    tcg_out_ext8u(s, dest, dest);
}

#if TCG_TARGET_REG_BITS == 64
static void tcg_out_setcond64(TCGContext *s, TCGCond cond, TCGArg dest,
                              TCGArg arg1, TCGArg arg2, int const_arg2)
{
    tcg_out_cmp(s, arg1, arg2, const_arg2, P_REXW);
    tcg_out_modrm(s, OPC_SETCC | tcg_cond_to_jcc[cond], 0, dest);
    tcg_out_ext8u(s, dest, dest);
}
#else
static void tcg_out_setcond2(TCGContext *s, const TCGArg *args,
                             const int *const_args)
{
    TCGArg new_args[6];
    TCGLabel *label_true, *label_over;

    memcpy(new_args, args+1, 5*sizeof(TCGArg));

    if (args[0] == args[1] || args[0] == args[2]
        || (!const_args[3] && args[0] == args[3])
        || (!const_args[4] && args[0] == args[4])) {
        /* When the destination overlaps with one of the argument
           registers, don't do anything tricky.  */
        label_true = gen_new_label();
        label_over = gen_new_label();

        new_args[5] = label_arg(label_true);
        tcg_out_brcond2(s, new_args, const_args+1, 1);

        tcg_out_movi(s, TCG_TYPE_I32, args[0], 0);
        tcg_out_jxx(s, JCC_JMP, label_over, 1);
        tcg_out_label(s, label_true);

        tcg_out_movi(s, TCG_TYPE_I32, args[0], 1);
        tcg_out_label(s, label_over);
    } else {
        /* When the destination does not overlap one of the arguments,
           clear the destination first, jump if cond false, and emit an
           increment in the true case.  This results in smaller code.  */

        tcg_out_movi(s, TCG_TYPE_I32, args[0], 0);

        label_over = gen_new_label();
        new_args[4] = tcg_invert_cond(new_args[4]);
        new_args[5] = label_arg(label_over);
        tcg_out_brcond2(s, new_args, const_args+1, 1);

        tgen_arithi(s, ARITH_ADD, args[0], 1, 0);
        tcg_out_label(s, label_over);
    }
}
#endif

static void tcg_out_cmov(TCGContext *s, TCGCond cond, int rexw,
                         TCGReg dest, TCGReg v1)
{
    if (have_cmov) {
        tcg_out_modrm(s, OPC_CMOVCC | tcg_cond_to_jcc[cond] | rexw, dest, v1);
    } else {
        TCGLabel *over = gen_new_label();
        tcg_out_jxx(s, tcg_cond_to_jcc[tcg_invert_cond(cond)], over, 1);
        tcg_out_mov(s, TCG_TYPE_I32, dest, v1);
        tcg_out_label(s, over);
    }
}

static void tcg_out_movcond32(TCGContext *s, TCGCond cond, TCGReg dest,
                              TCGReg c1, TCGArg c2, int const_c2,
                              TCGReg v1)
{
    tcg_out_cmp(s, c1, c2, const_c2, 0);
    tcg_out_cmov(s, cond, 0, dest, v1);
}

#if TCG_TARGET_REG_BITS == 64
static void tcg_out_movcond64(TCGContext *s, TCGCond cond, TCGReg dest,
                              TCGReg c1, TCGArg c2, int const_c2,
                              TCGReg v1)
{
    tcg_out_cmp(s, c1, c2, const_c2, P_REXW);
    tcg_out_cmov(s, cond, P_REXW, dest, v1);
}
#endif

static void tcg_out_ctz(TCGContext *s, int rexw, TCGReg dest, TCGReg arg1,
                        TCGArg arg2, bool const_a2)
{
    if (have_bmi1) {
        tcg_out_modrm(s, OPC_TZCNT + rexw, dest, arg1);
        if (const_a2) {
            tcg_debug_assert(arg2 == (rexw ? 64 : 32));
        } else {
            tcg_debug_assert(dest != arg2);
1658            tcg_out_cmov(s, TCG_COND_LTU, rexw, dest, arg2);
1659        }
1660    } else {
1661        tcg_debug_assert(dest != arg2);
1662        tcg_out_modrm(s, OPC_BSF + rexw, dest, arg1);
1663        tcg_out_cmov(s, TCG_COND_EQ, rexw, dest, arg2);
1664    }
1665}
1666
1667static void tcg_out_clz(TCGContext *s, int rexw, TCGReg dest, TCGReg arg1,
1668                        TCGArg arg2, bool const_a2)
1669{
1670    if (have_lzcnt) {
1671        tcg_out_modrm(s, OPC_LZCNT + rexw, dest, arg1);
1672        if (const_a2) {
1673            tcg_debug_assert(arg2 == (rexw ? 64 : 32));
1674        } else {
1675            tcg_debug_assert(dest != arg2);
1676            tcg_out_cmov(s, TCG_COND_LTU, rexw, dest, arg2);
1677        }
1678    } else {
1679        tcg_debug_assert(!const_a2);
1680        tcg_debug_assert(dest != arg1);
1681        tcg_debug_assert(dest != arg2);
1682
1683        /* Recall that the output of BSR is the index, not the count.  */
1684        tcg_out_modrm(s, OPC_BSR + rexw, dest, arg1);
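        /* For nonzero input, BSR yields the bit index i, and
           clz = (bits - 1) - i == i ^ (bits - 1): hence the XOR below. */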
1685        tgen_arithi(s, ARITH_XOR + rexw, dest, rexw ? 63 : 31, 0);
1686
1687        /* Since we have destroyed the flags from BSR, we have to re-test.  */
1688        tcg_out_cmp(s, arg1, 0, 1, rexw);
1689        tcg_out_cmov(s, TCG_COND_EQ, rexw, dest, arg2);
1690    }
1691}
1692
1693static void tcg_out_branch(TCGContext *s, int call, const tcg_insn_unit *dest)
1694{
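    /* The rel32 displacement is relative to the end of the 5-byte
       E8 (call) / E9 (jmp) instruction, hence the -5 below. */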
1695    intptr_t disp = tcg_pcrel_diff(s, dest) - 5;
1696
1697    if (disp == (int32_t)disp) {
1698        tcg_out_opc(s, call ? OPC_CALL_Jz : OPC_JMP_long, 0, 0, 0);
1699        tcg_out32(s, disp);
1700    } else {
1701        /* rip-relative addressing into the constant pool.
1702           This is 6 + 8 = 14 bytes, as compared to using an
1703           immediate load 10 + 6 = 16 bytes, plus we may
1704           be able to re-use the pool constant for more calls.  */
1705        tcg_out_opc(s, OPC_GRP5, 0, 0, 0);
1706        tcg_out8(s, (call ? EXT5_CALLN_Ev : EXT5_JMPN_Ev) << 3 | 5);
1707        new_pool_label(s, (uintptr_t)dest, R_386_PC32, s->code_ptr, -4);
1708        tcg_out32(s, 0);
1709    }
1710}
1711
1712static void tcg_out_call(TCGContext *s, const tcg_insn_unit *dest,
1713                         const TCGHelperInfo *info)
1714{
1715    tcg_out_branch(s, 1, dest);
1716
1717#ifndef _WIN32
1718    if (TCG_TARGET_REG_BITS == 32 && info->out_kind == TCG_CALL_RET_BY_REF) {
1719        /*
1720         * The sysv i386 abi for struct return places a reference as the
1721         * first argument on the stack, and the callee pops that argument
1722         * when it returns.  Since we want to retain the aligned stack
1723         * pointer for the callee, we do not want to actually push that
1724         * argument before the call but rely on the normal store to the
1725         * stack slot.  But we do need to compensate for the pop in order
1726         * to reset our correct stack pointer value.
1727         * Pushing a garbage value back onto the stack is quickest.
1728         */
1729        tcg_out_push(s, TCG_REG_EAX);
1730    }
1731#endif
1732}
1733
1734static void tcg_out_jmp(TCGContext *s, const tcg_insn_unit *dest)
1735{
1736    tcg_out_branch(s, 0, dest);
1737}
1738
1739static void tcg_out_nopn(TCGContext *s, int n)
1740{
1741    int i;
1742    /* Emit n - 1 operand size prefixes for the standard one byte nop,
1743     * "xchg %eax,%eax", forming e.g. "xchg %ax,%ax".  All cores accept
1744     * the duplicate prefixes, and all of the interesting recent cores
1745     * can decode and discard the duplicates in a single cycle.
1746     */
1747    tcg_debug_assert(n >= 1);
1748    for (i = 1; i < n; ++i) {
1749        tcg_out8(s, 0x66);
1750    }
1751    tcg_out8(s, 0x90);
1752}
1753
1754/* Test register R vs immediate bits I, setting Z flag for EQ/NE. */
1755static void __attribute__((unused))
1756tcg_out_testi(TCGContext *s, TCGReg r, uint32_t i)
1757{
1758    /*
1759     * This is used for testing alignment, so we can usually use testb.
1760     * For i686, we have to use testl for %esi/%edi.
1761     */
1762    if (i <= 0xff && (TCG_TARGET_REG_BITS == 64 || r < 4)) {
1763        tcg_out_modrm(s, OPC_GRP3_Eb | P_REXB_RM, EXT3_TESTi, r);
1764        tcg_out8(s, i);
1765    } else {
1766        tcg_out_modrm(s, OPC_GRP3_Ev, EXT3_TESTi, r);
1767        tcg_out32(s, i);
1768    }
1769}
1770
1771typedef struct {
1772    TCGReg base;
1773    int index;
1774    int ofs;
1775    int seg;
1776} HostAddress;
1777
1778#if defined(CONFIG_SOFTMMU)
1779/* helper signature: helper_ret_ld_mmu(CPUState *env, target_ulong addr,
1780 *                                     int mmu_idx, uintptr_t ra)
1781 */
1782static void * const qemu_ld_helpers[(MO_SIZE | MO_BSWAP) + 1] = {
1783    [MO_UB]   = helper_ret_ldub_mmu,
1784    [MO_LEUW] = helper_le_lduw_mmu,
1785    [MO_LEUL] = helper_le_ldul_mmu,
1786    [MO_LEUQ] = helper_le_ldq_mmu,
1787    [MO_BEUW] = helper_be_lduw_mmu,
1788    [MO_BEUL] = helper_be_ldul_mmu,
1789    [MO_BEUQ] = helper_be_ldq_mmu,
1790};
1791
1792/* helper signature: helper_ret_st_mmu(CPUState *env, target_ulong addr,
1793 *                                     uintxx_t val, int mmu_idx, uintptr_t ra)
1794 */
1795static void * const qemu_st_helpers[(MO_SIZE | MO_BSWAP) + 1] = {
1796    [MO_UB]   = helper_ret_stb_mmu,
1797    [MO_LEUW] = helper_le_stw_mmu,
1798    [MO_LEUL] = helper_le_stl_mmu,
1799    [MO_LEUQ] = helper_le_stq_mmu,
1800    [MO_BEUW] = helper_be_stw_mmu,
1801    [MO_BEUL] = helper_be_stl_mmu,
1802    [MO_BEUQ] = helper_be_stq_mmu,
1803};
1804
1805/*
1806 * Because i686 has no register parameters and because x86_64 has xchg
1807 * to handle addr/data register overlap, we have placed all input arguments
1808 * before we might need a scratch reg.
1809 *
1810 * Even then, a scratch is only needed for l->raddr.  Rather than expose
1811 * a general-purpose scratch when we don't actually know it's available,
1812 * use the ra_gen hook to load into RAX if needed.
1813 */
1814#if TCG_TARGET_REG_BITS == 64
1815static TCGReg ldst_ra_gen(TCGContext *s, const TCGLabelQemuLdst *l, int arg)
1816{
1817    if (arg < 0) {
1818        arg = TCG_REG_RAX;
1819    }
1820    tcg_out_movi(s, TCG_TYPE_PTR, arg, (uintptr_t)l->raddr);
1821    return arg;
1822}
1823static const TCGLdstHelperParam ldst_helper_param = {
1824    .ra_gen = ldst_ra_gen
1825};
1826#else
1827static const TCGLdstHelperParam ldst_helper_param = { };
1828#endif
1829
1830/*
1831 * Generate code for the slow path for a load at the end of the block.
1832 */
1833static bool tcg_out_qemu_ld_slow_path(TCGContext *s, TCGLabelQemuLdst *l)
1834{
1835    MemOp opc = get_memop(l->oi);
1836    tcg_insn_unit **label_ptr = &l->label_ptr[0];
1837
1838    /* resolve label address */
1839    tcg_patch32(label_ptr[0], s->code_ptr - label_ptr[0] - 4);
1840    if (TARGET_LONG_BITS > TCG_TARGET_REG_BITS) {
1841        tcg_patch32(label_ptr[1], s->code_ptr - label_ptr[1] - 4);
1842    }
1843
1844    tcg_out_ld_helper_args(s, l, &ldst_helper_param);
1845    tcg_out_branch(s, 1, qemu_ld_helpers[opc & (MO_BSWAP | MO_SIZE)]);
1846    tcg_out_ld_helper_ret(s, l, false, &ldst_helper_param);
1847
1848    tcg_out_jmp(s, l->raddr);
1849    return true;
1850}
1851
1852/*
1853 * Generate code for the slow path for a store at the end of the block.
1854 */
1855static bool tcg_out_qemu_st_slow_path(TCGContext *s, TCGLabelQemuLdst *l)
1856{
1857    MemOp opc = get_memop(l->oi);
1858    tcg_insn_unit **label_ptr = &l->label_ptr[0];
1859
1860    /* resolve label address */
1861    tcg_patch32(label_ptr[0], s->code_ptr - label_ptr[0] - 4);
1862    if (TARGET_LONG_BITS > TCG_TARGET_REG_BITS) {
1863        tcg_patch32(label_ptr[1], s->code_ptr - label_ptr[1] - 4);
1864    }
1865
1866    tcg_out_st_helper_args(s, l, &ldst_helper_param);
1867    tcg_out_branch(s, 1, qemu_st_helpers[opc & (MO_BSWAP | MO_SIZE)]);
1868
1869    tcg_out_jmp(s, l->raddr);
1870    return true;
1871}
1872#else
1873static bool tcg_out_fail_alignment(TCGContext *s, TCGLabelQemuLdst *l)
1874{
1875    /* resolve label address */
1876    tcg_patch32(l->label_ptr[0], s->code_ptr - l->label_ptr[0] - 4);
1877
1878    if (TCG_TARGET_REG_BITS == 32) {
1879        int ofs = 0;
1880
1881        tcg_out_st(s, TCG_TYPE_PTR, TCG_AREG0, TCG_REG_ESP, ofs);
1882        ofs += 4;
1883
1884        tcg_out_st(s, TCG_TYPE_I32, l->addrlo_reg, TCG_REG_ESP, ofs);
1885        ofs += 4;
1886        if (TARGET_LONG_BITS == 64) {
1887            tcg_out_st(s, TCG_TYPE_I32, l->addrhi_reg, TCG_REG_ESP, ofs);
1888            ofs += 4;
1889        }
1890
1891        tcg_out_pushi(s, (uintptr_t)l->raddr);
1892    } else {
1893        tcg_out_mov(s, TCG_TYPE_TL, tcg_target_call_iarg_regs[1],
1894                    l->addrlo_reg);
1895        tcg_out_mov(s, TCG_TYPE_PTR, tcg_target_call_iarg_regs[0], TCG_AREG0);
1896
1897        tcg_out_movi(s, TCG_TYPE_PTR, TCG_REG_RAX, (uintptr_t)l->raddr);
1898        tcg_out_push(s, TCG_REG_RAX);
1899    }
1900
1901    /* "Tail call" to the helper, with the return address back inline. */
1902    tcg_out_jmp(s, (const void *)(l->is_ld ? helper_unaligned_ld
1903                                  : helper_unaligned_st));
1904    return true;
1905}
1906
1907static bool tcg_out_qemu_ld_slow_path(TCGContext *s, TCGLabelQemuLdst *l)
1908{
1909    return tcg_out_fail_alignment(s, l);
1910}
1911
1912static bool tcg_out_qemu_st_slow_path(TCGContext *s, TCGLabelQemuLdst *l)
1913{
1914    return tcg_out_fail_alignment(s, l);
1915}
1916
1917static HostAddress x86_guest_base = {
1918    .index = -1
1919};
1920
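/*
 * Where the OS allows it, install guest_base into a segment register, so
 * that guest memory accesses need only a segment override prefix rather
 * than an explicit addition of the base.
 */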
1921#if defined(__x86_64__) && defined(__linux__)
1922# include <asm/prctl.h>
1923# include <sys/prctl.h>
1924int arch_prctl(int code, unsigned long addr);
1925static inline int setup_guest_base_seg(void)
1926{
1927    if (arch_prctl(ARCH_SET_GS, guest_base) == 0) {
1928        return P_GS;
1929    }
1930    return 0;
1931}
1932#elif defined(__x86_64__) && \
1933      (defined (__FreeBSD__) || defined (__FreeBSD_kernel__))
1934# include <machine/sysarch.h>
1935static inline int setup_guest_base_seg(void)
1936{
1937    if (sysarch(AMD64_SET_GSBASE, &guest_base) == 0) {
1938        return P_GS;
1939    }
1940    return 0;
1941}
1942#else
1943static inline int setup_guest_base_seg(void)
1944{
1945    return 0;
1946}
1947#endif /* setup_guest_base_seg */
1948#endif /* SOFTMMU */
1949
1950/*
1951 * For softmmu, perform the TLB load and compare.
1952 * For user-only, perform any required alignment tests.
1953 * In both cases, return a TCGLabelQemuLdst structure if the slow path
1954 * is required and fill in @h with the host address for the fast path.
1955 */
1956static TCGLabelQemuLdst *prepare_host_addr(TCGContext *s, HostAddress *h,
1957                                           TCGReg addrlo, TCGReg addrhi,
1958                                           MemOpIdx oi, bool is_ld)
1959{
1960    TCGLabelQemuLdst *ldst = NULL;
1961    MemOp opc = get_memop(oi);
1962    unsigned a_bits = get_alignment_bits(opc);
1963    unsigned a_mask = (1 << a_bits) - 1;
1964
1965#ifdef CONFIG_SOFTMMU
1966    int cmp_ofs = is_ld ? offsetof(CPUTLBEntry, addr_read)
1967                        : offsetof(CPUTLBEntry, addr_write);
1968    TCGType ttype = TCG_TYPE_I32;
1969    TCGType tlbtype = TCG_TYPE_I32;
1970    int trexw = 0, hrexw = 0, tlbrexw = 0;
1971    unsigned mem_index = get_mmuidx(oi);
1972    unsigned s_bits = opc & MO_SIZE;
1973    unsigned s_mask = (1 << s_bits) - 1;
1974    target_ulong tlb_mask;
1975
1976    ldst = new_ldst_label(s);
1977    ldst->is_ld = is_ld;
1978    ldst->oi = oi;
1979    ldst->addrlo_reg = addrlo;
1980    ldst->addrhi_reg = addrhi;
1981
1982    if (TCG_TARGET_REG_BITS == 64) {
1983        if (TARGET_LONG_BITS == 64) {
1984            ttype = TCG_TYPE_I64;
1985            trexw = P_REXW;
1986        }
1987        if (TCG_TYPE_PTR == TCG_TYPE_I64) {
1988            hrexw = P_REXW;
1989            if (TARGET_PAGE_BITS + CPU_TLB_DYN_MAX_BITS > 32) {
1990                tlbtype = TCG_TYPE_I64;
1991                tlbrexw = P_REXW;
1992            }
1993        }
1994    }
1995
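    /*
     * TLB fast-path lookup.  CPUTLBDescFast.mask is the entry index mask
     * pre-scaled by CPU_TLB_ENTRY_BITS, so shifting the address right by
     * TARGET_PAGE_BITS - CPU_TLB_ENTRY_BITS and masking yields a byte
     * offset that can be added directly to CPUTLBDescFast.table.
     */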
1996    tcg_out_mov(s, tlbtype, TCG_REG_L0, addrlo);
1997    tcg_out_shifti(s, SHIFT_SHR + tlbrexw, TCG_REG_L0,
1998                   TARGET_PAGE_BITS - CPU_TLB_ENTRY_BITS);
1999
2000    tcg_out_modrm_offset(s, OPC_AND_GvEv + trexw, TCG_REG_L0, TCG_AREG0,
2001                         TLB_MASK_TABLE_OFS(mem_index) +
2002                         offsetof(CPUTLBDescFast, mask));
2003
2004    tcg_out_modrm_offset(s, OPC_ADD_GvEv + hrexw, TCG_REG_L0, TCG_AREG0,
2005                         TLB_MASK_TABLE_OFS(mem_index) +
2006                         offsetof(CPUTLBDescFast, table));
2007
2008    /*
2009     * If the required alignment is at least as large as the access, simply
2010     * copy the address and mask.  For lesser alignments, check that we don't
2011     * cross pages for the complete access.
2012     */
2013    if (a_bits >= s_bits) {
2014        tcg_out_mov(s, ttype, TCG_REG_L1, addrlo);
2015    } else {
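        /* Adding s_mask - a_mask bumps the address into the next page
           whenever an access of size s_bits with alignment a_bits would
           cross a page boundary, so the TLB compare below fails. */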
2016        tcg_out_modrm_offset(s, OPC_LEA + trexw, TCG_REG_L1,
2017                             addrlo, s_mask - a_mask);
2018    }
2019    tlb_mask = (target_ulong)TARGET_PAGE_MASK | a_mask;
2020    tgen_arithi(s, ARITH_AND + trexw, TCG_REG_L1, tlb_mask, 0);
2021
2022    /* cmp 0(TCG_REG_L0), TCG_REG_L1 */
2023    tcg_out_modrm_offset(s, OPC_CMP_GvEv + trexw,
2024                         TCG_REG_L1, TCG_REG_L0, cmp_ofs);
2025
2026    /* jne slow_path */
2027    tcg_out_opc(s, OPC_JCC_long + JCC_JNE, 0, 0, 0);
2028    ldst->label_ptr[0] = s->code_ptr;
2029    s->code_ptr += 4;
2030
2031    if (TARGET_LONG_BITS > TCG_TARGET_REG_BITS) {
2032        /* cmp 4(TCG_REG_L0), addrhi */
2033        tcg_out_modrm_offset(s, OPC_CMP_GvEv, addrhi, TCG_REG_L0, cmp_ofs + 4);
2034
2035        /* jne slow_path */
2036        tcg_out_opc(s, OPC_JCC_long + JCC_JNE, 0, 0, 0);
2037        ldst->label_ptr[1] = s->code_ptr;
2038        s->code_ptr += 4;
2039    }
2040
2041    /* TLB Hit.  */
2042    tcg_out_ld(s, TCG_TYPE_PTR, TCG_REG_L0, TCG_REG_L0,
2043               offsetof(CPUTLBEntry, addend));
2044
2045    *h = (HostAddress) {
2046        .base = addrlo,
2047        .index = TCG_REG_L0,
2048    };
2049#else
2050    if (a_bits) {
2051        ldst = new_ldst_label(s);
2052
2053        ldst->is_ld = is_ld;
2054        ldst->oi = oi;
2055        ldst->addrlo_reg = addrlo;
2056        ldst->addrhi_reg = addrhi;
2057
2058        tcg_out_testi(s, addrlo, a_mask);
2059        /* jne slow_path */
2060        tcg_out_opc(s, OPC_JCC_long + JCC_JNE, 0, 0, 0);
2061        ldst->label_ptr[0] = s->code_ptr;
2062        s->code_ptr += 4;
2063    }
2064
2065    *h = x86_guest_base;
2066    h->base = addrlo;
2067#endif
2068
2069    return ldst;
2070}
2071
2072static void tcg_out_qemu_ld_direct(TCGContext *s, TCGReg datalo, TCGReg datahi,
2073                                   HostAddress h, TCGType type, MemOp memop)
2074{
2075    bool use_movbe = false;
2076    int rexw = (type == TCG_TYPE_I32 ? 0 : P_REXW);
2077    int movop = OPC_MOVL_GvEv;
2078
2079    /* Do big-endian loads with movbe.  */
2080    if (memop & MO_BSWAP) {
2081        tcg_debug_assert(have_movbe);
2082        use_movbe = true;
2083        movop = OPC_MOVBE_GyMy;
2084    }
2085
2086    switch (memop & MO_SSIZE) {
2087    case MO_UB:
2088        tcg_out_modrm_sib_offset(s, OPC_MOVZBL + h.seg, datalo,
2089                                 h.base, h.index, 0, h.ofs);
2090        break;
2091    case MO_SB:
2092        tcg_out_modrm_sib_offset(s, OPC_MOVSBL + rexw + h.seg, datalo,
2093                                 h.base, h.index, 0, h.ofs);
2094        break;
2095    case MO_UW:
2096        if (use_movbe) {
2097            /* There is no extending movbe; only the low 16 bits are modified.  */
2098            if (datalo != h.base && datalo != h.index) {
2099                /* XOR breaks dependency chains.  */
2100                tgen_arithr(s, ARITH_XOR, datalo, datalo);
2101                tcg_out_modrm_sib_offset(s, OPC_MOVBE_GyMy + P_DATA16 + h.seg,
2102                                         datalo, h.base, h.index, 0, h.ofs);
2103            } else {
2104                tcg_out_modrm_sib_offset(s, OPC_MOVBE_GyMy + P_DATA16 + h.seg,
2105                                         datalo, h.base, h.index, 0, h.ofs);
2106                tcg_out_ext16u(s, datalo, datalo);
2107            }
2108        } else {
2109            tcg_out_modrm_sib_offset(s, OPC_MOVZWL + h.seg, datalo,
2110                                     h.base, h.index, 0, h.ofs);
2111        }
2112        break;
2113    case MO_SW:
2114        if (use_movbe) {
2115            tcg_out_modrm_sib_offset(s, OPC_MOVBE_GyMy + P_DATA16 + h.seg,
2116                                     datalo, h.base, h.index, 0, h.ofs);
2117            tcg_out_ext16s(s, type, datalo, datalo);
2118        } else {
2119            tcg_out_modrm_sib_offset(s, OPC_MOVSWL + rexw + h.seg,
2120                                     datalo, h.base, h.index, 0, h.ofs);
2121        }
2122        break;
2123    case MO_UL:
2124        tcg_out_modrm_sib_offset(s, movop + h.seg, datalo,
2125                                 h.base, h.index, 0, h.ofs);
2126        break;
2127#if TCG_TARGET_REG_BITS == 64
2128    case MO_SL:
2129        if (use_movbe) {
2130            tcg_out_modrm_sib_offset(s, OPC_MOVBE_GyMy + h.seg, datalo,
2131                                     h.base, h.index, 0, h.ofs);
2132            tcg_out_ext32s(s, datalo, datalo);
2133        } else {
2134            tcg_out_modrm_sib_offset(s, OPC_MOVSLQ + h.seg, datalo,
2135                                     h.base, h.index, 0, h.ofs);
2136        }
2137        break;
2138#endif
2139    case MO_UQ:
2140        if (TCG_TARGET_REG_BITS == 64) {
2141            tcg_out_modrm_sib_offset(s, movop + P_REXW + h.seg, datalo,
2142                                     h.base, h.index, 0, h.ofs);
2143            break;
2144        }
2145        if (use_movbe) {
2146            TCGReg t = datalo;
2147            datalo = datahi;
2148            datahi = t;
2149        }
2150        if (h.base == datalo || h.index == datalo) {
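            /* The first load would clobber the address register, so
               materialize the full host address into datahi first and
               do both loads relative to it. */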
2151            tcg_out_modrm_sib_offset(s, OPC_LEA, datahi,
2152                                     h.base, h.index, 0, h.ofs);
2153            tcg_out_modrm_offset(s, movop + h.seg, datalo, datahi, 0);
2154            tcg_out_modrm_offset(s, movop + h.seg, datahi, datahi, 4);
2155        } else {
2156            tcg_out_modrm_sib_offset(s, movop + h.seg, datalo,
2157                                     h.base, h.index, 0, h.ofs);
2158            tcg_out_modrm_sib_offset(s, movop + h.seg, datahi,
2159                                     h.base, h.index, 0, h.ofs + 4);
2160        }
2161        break;
2162    default:
2163        g_assert_not_reached();
2164    }
2165}
2166
2167static void tcg_out_qemu_ld(TCGContext *s, TCGReg datalo, TCGReg datahi,
2168                            TCGReg addrlo, TCGReg addrhi,
2169                            MemOpIdx oi, TCGType data_type)
2170{
2171    TCGLabelQemuLdst *ldst;
2172    HostAddress h;
2173
2174    ldst = prepare_host_addr(s, &h, addrlo, addrhi, oi, true);
2175    tcg_out_qemu_ld_direct(s, datalo, datahi, h, data_type, get_memop(oi));
2176
2177    if (ldst) {
2178        ldst->type = data_type;
2179        ldst->datalo_reg = datalo;
2180        ldst->datahi_reg = datahi;
2181        ldst->raddr = tcg_splitwx_to_rx(s->code_ptr);
2182    }
2183}
2184
2185static void tcg_out_qemu_st_direct(TCGContext *s, TCGReg datalo, TCGReg datahi,
2186                                   HostAddress h, MemOp memop)
2187{
2188    bool use_movbe = false;
2189    int movop = OPC_MOVL_EvGv;
2190
2191    /*
2192     * Do big-endian stores with movbe or softmmu.
2193     * User-only without movbe will have its swapping done generically.
2194     */
2195    if (memop & MO_BSWAP) {
2196        tcg_debug_assert(have_movbe);
2197        use_movbe = true;
2198        movop = OPC_MOVBE_MyGy;
2199    }
2200
2201    switch (memop & MO_SIZE) {
2202    case MO_8:
2203        /* This is handled with constraints on INDEX_op_qemu_st8_i32. */
2204        tcg_debug_assert(TCG_TARGET_REG_BITS == 64 || datalo < 4);
2205        tcg_out_modrm_sib_offset(s, OPC_MOVB_EvGv + P_REXB_R + h.seg,
2206                                 datalo, h.base, h.index, 0, h.ofs);
2207        break;
2208    case MO_16:
2209        tcg_out_modrm_sib_offset(s, movop + P_DATA16 + h.seg, datalo,
2210                                 h.base, h.index, 0, h.ofs);
2211        break;
2212    case MO_32:
2213        tcg_out_modrm_sib_offset(s, movop + h.seg, datalo,
2214                                 h.base, h.index, 0, h.ofs);
2215        break;
2216    case MO_64:
2217        if (TCG_TARGET_REG_BITS == 64) {
2218            tcg_out_modrm_sib_offset(s, movop + P_REXW + h.seg, datalo,
2219                                     h.base, h.index, 0, h.ofs);
2220        } else {
2221            if (use_movbe) {
2222                TCGReg t = datalo;
2223                datalo = datahi;
2224                datahi = t;
2225            }
2226            tcg_out_modrm_sib_offset(s, movop + h.seg, datalo,
2227                                     h.base, h.index, 0, h.ofs);
2228            tcg_out_modrm_sib_offset(s, movop + h.seg, datahi,
2229                                     h.base, h.index, 0, h.ofs + 4);
2230        }
2231        break;
2232    default:
2233        g_assert_not_reached();
2234    }
2235}
2236
2237static void tcg_out_qemu_st(TCGContext *s, TCGReg datalo, TCGReg datahi,
2238                            TCGReg addrlo, TCGReg addrhi,
2239                            MemOpIdx oi, TCGType data_type)
2240{
2241    TCGLabelQemuLdst *ldst;
2242    HostAddress h;
2243
2244    ldst = prepare_host_addr(s, &h, addrlo, addrhi, oi, false);
2245    tcg_out_qemu_st_direct(s, datalo, datahi, h, get_memop(oi));
2246
2247    if (ldst) {
2248        ldst->type = data_type;
2249        ldst->datalo_reg = datalo;
2250        ldst->datahi_reg = datahi;
2251        ldst->raddr = tcg_splitwx_to_rx(s->code_ptr);
2252    }
2253}
2254
2255static void tcg_out_exit_tb(TCGContext *s, uintptr_t a0)
2256{
2257    /* Reuse the zeroing that exists for goto_ptr.  */
2258    if (a0 == 0) {
2259        tcg_out_jmp(s, tcg_code_gen_epilogue);
2260    } else {
2261        tcg_out_movi(s, TCG_TYPE_PTR, TCG_REG_EAX, a0);
2262        tcg_out_jmp(s, tb_ret_addr);
2263    }
2264}
2265
2266static void tcg_out_goto_tb(TCGContext *s, int which)
2267{
2268    /*
2269     * Jump displacement must be aligned for atomic patching;
2270     * see if we need to add extra nops before the jump.
2271     */
2272    int gap = QEMU_ALIGN_PTR_UP(s->code_ptr + 1, 4) - s->code_ptr;
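    /* The 4-byte displacement follows the 1-byte opcode, so gap is in
       [1, 4]; gap == 1 means the displacement is already aligned. */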
2273    if (gap != 1) {
2274        tcg_out_nopn(s, gap - 1);
2275    }
2276    tcg_out8(s, OPC_JMP_long); /* jmp im */
2277    set_jmp_insn_offset(s, which);
2278    tcg_out32(s, 0);
2279    set_jmp_reset_offset(s, which);
2280}
2281
2282void tb_target_set_jmp_target(const TranslationBlock *tb, int n,
2283                              uintptr_t jmp_rx, uintptr_t jmp_rw)
2284{
2285    /* patch the branch destination */
2286    uintptr_t addr = tb->jmp_target_addr[n];
2287    qatomic_set((int32_t *)jmp_rw, addr - (jmp_rx + 4));
2288    /* no need to flush icache explicitly */
2289}
2290
2291static inline void tcg_out_op(TCGContext *s, TCGOpcode opc,
2292                              const TCGArg args[TCG_MAX_OP_ARGS],
2293                              const int const_args[TCG_MAX_OP_ARGS])
2294{
2295    TCGArg a0, a1, a2;
2296    int c, const_a2, vexop, rexw = 0;
2297
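/* OP_32_64(x) lets one case body handle both widths: the _i64 case sets
   rexw = P_REXW and falls through into the _i32 case. */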
2298#if TCG_TARGET_REG_BITS == 64
2299# define OP_32_64(x) \
2300        case glue(glue(INDEX_op_, x), _i64): \
2301            rexw = P_REXW; /* FALLTHRU */    \
2302        case glue(glue(INDEX_op_, x), _i32)
2303#else
2304# define OP_32_64(x) \
2305        case glue(glue(INDEX_op_, x), _i32)
2306#endif
2307
2308    /* Hoist the loads of the most common arguments.  */
2309    a0 = args[0];
2310    a1 = args[1];
2311    a2 = args[2];
2312    const_a2 = const_args[2];
2313
2314    switch (opc) {
2315    case INDEX_op_goto_ptr:
2316        /* jmp to the given host address (could be epilogue) */
2317        tcg_out_modrm(s, OPC_GRP5, EXT5_JMPN_Ev, a0);
2318        break;
2319    case INDEX_op_br:
2320        tcg_out_jxx(s, JCC_JMP, arg_label(a0), 0);
2321        break;
2322    OP_32_64(ld8u):
2323        /* Note that we can ignore REXW for the zero-extend to 64-bit.  */
2324        tcg_out_modrm_offset(s, OPC_MOVZBL, a0, a1, a2);
2325        break;
2326    OP_32_64(ld8s):
2327        tcg_out_modrm_offset(s, OPC_MOVSBL + rexw, a0, a1, a2);
2328        break;
2329    OP_32_64(ld16u):
2330        /* Note that we can ignore REXW for the zero-extend to 64-bit.  */
2331        tcg_out_modrm_offset(s, OPC_MOVZWL, a0, a1, a2);
2332        break;
2333    OP_32_64(ld16s):
2334        tcg_out_modrm_offset(s, OPC_MOVSWL + rexw, a0, a1, a2);
2335        break;
2336#if TCG_TARGET_REG_BITS == 64
2337    case INDEX_op_ld32u_i64:
2338#endif
2339    case INDEX_op_ld_i32:
2340        tcg_out_ld(s, TCG_TYPE_I32, a0, a1, a2);
2341        break;
2342
2343    OP_32_64(st8):
2344        if (const_args[0]) {
2345            tcg_out_modrm_offset(s, OPC_MOVB_EvIz, 0, a1, a2);
2346            tcg_out8(s, a0);
2347        } else {
2348            tcg_out_modrm_offset(s, OPC_MOVB_EvGv | P_REXB_R, a0, a1, a2);
2349        }
2350        break;
2351    OP_32_64(st16):
2352        if (const_args[0]) {
2353            tcg_out_modrm_offset(s, OPC_MOVL_EvIz | P_DATA16, 0, a1, a2);
2354            tcg_out16(s, a0);
2355        } else {
2356            tcg_out_modrm_offset(s, OPC_MOVL_EvGv | P_DATA16, a0, a1, a2);
2357        }
2358        break;
2359#if TCG_TARGET_REG_BITS == 64
2360    case INDEX_op_st32_i64:
2361#endif
2362    case INDEX_op_st_i32:
2363        if (const_args[0]) {
2364            tcg_out_modrm_offset(s, OPC_MOVL_EvIz, 0, a1, a2);
2365            tcg_out32(s, a0);
2366        } else {
2367            tcg_out_st(s, TCG_TYPE_I32, a0, a1, a2);
2368        }
2369        break;
2370
2371    OP_32_64(add):
2372        /* For 3-operand addition, use LEA.  */
2373        if (a0 != a1) {
2374            TCGArg c3 = 0;
2375            if (const_a2) {
2376                c3 = a2, a2 = -1;
2377            } else if (a0 == a2) {
2378                /* Watch out for dest = src + dest, since we've removed
2379                   the matching constraint on the add.  */
2380                tgen_arithr(s, ARITH_ADD + rexw, a0, a1);
2381                break;
2382            }
2383
2384            tcg_out_modrm_sib_offset(s, OPC_LEA + rexw, a0, a1, a2, 0, c3);
2385            break;
2386        }
2387        c = ARITH_ADD;
2388        goto gen_arith;
2389    OP_32_64(sub):
2390        c = ARITH_SUB;
2391        goto gen_arith;
2392    OP_32_64(and):
2393        c = ARITH_AND;
2394        goto gen_arith;
2395    OP_32_64(or):
2396        c = ARITH_OR;
2397        goto gen_arith;
2398    OP_32_64(xor):
2399        c = ARITH_XOR;
2400        goto gen_arith;
2401    gen_arith:
2402        if (const_a2) {
2403            tgen_arithi(s, c + rexw, a0, a2, 0);
2404        } else {
2405            tgen_arithr(s, c + rexw, a0, a2);
2406        }
2407        break;
2408
2409    OP_32_64(andc):
2410        if (const_a2) {
2411            tcg_out_mov(s, rexw ? TCG_TYPE_I64 : TCG_TYPE_I32, a0, a1);
2412            tgen_arithi(s, ARITH_AND + rexw, a0, ~a2, 0);
2413        } else {
2414            tcg_out_vex_modrm(s, OPC_ANDN + rexw, a0, a2, a1);
2415        }
2416        break;
2417
2418    OP_32_64(mul):
2419        if (const_a2) {
2420            int32_t val;
2421            val = a2;
2422            if (val == (int8_t)val) {
2423                tcg_out_modrm(s, OPC_IMUL_GvEvIb + rexw, a0, a0);
2424                tcg_out8(s, val);
2425            } else {
2426                tcg_out_modrm(s, OPC_IMUL_GvEvIz + rexw, a0, a0);
2427                tcg_out32(s, val);
2428            }
2429        } else {
2430            tcg_out_modrm(s, OPC_IMUL_GvEv + rexw, a0, a2);
2431        }
2432        break;
2433
2434    OP_32_64(div2):
2435        tcg_out_modrm(s, OPC_GRP3_Ev + rexw, EXT3_IDIV, args[4]);
2436        break;
2437    OP_32_64(divu2):
2438        tcg_out_modrm(s, OPC_GRP3_Ev + rexw, EXT3_DIV, args[4]);
2439        break;
2440
2441    OP_32_64(shl):
2442        /* For small constant 3-operand shift, use LEA.  */
2443        if (const_a2 && a0 != a1 && (a2 - 1) < 3) {
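            /* The unsigned test (a2 - 1) < 3 accepts shift counts 1..3,
               i.e. multiplications by 2, 4 or 8, all encodable via SIB. */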
2444            if (a2 - 1 == 0) {
2445                /* shl $1,a1,a0 -> lea (a1,a1),a0 */
2446                tcg_out_modrm_sib_offset(s, OPC_LEA + rexw, a0, a1, a1, 0, 0);
2447            } else {
2448                /* shl $n,a1,a0 -> lea 0(,a1,n),a0 */
2449                tcg_out_modrm_sib_offset(s, OPC_LEA + rexw, a0, -1, a1, a2, 0);
2450            }
2451            break;
2452        }
2453        c = SHIFT_SHL;
2454        vexop = OPC_SHLX;
2455        goto gen_shift_maybe_vex;
2456    OP_32_64(shr):
2457        c = SHIFT_SHR;
2458        vexop = OPC_SHRX;
2459        goto gen_shift_maybe_vex;
2460    OP_32_64(sar):
2461        c = SHIFT_SAR;
2462        vexop = OPC_SARX;
2463        goto gen_shift_maybe_vex;
2464    OP_32_64(rotl):
2465        c = SHIFT_ROL;
2466        goto gen_shift;
2467    OP_32_64(rotr):
2468        c = SHIFT_ROR;
2469        goto gen_shift;
2470    gen_shift_maybe_vex:
2471        if (have_bmi2) {
2472            if (!const_a2) {
2473                tcg_out_vex_modrm(s, vexop + rexw, a0, a2, a1);
2474                break;
2475            }
2476            tcg_out_mov(s, rexw ? TCG_TYPE_I64 : TCG_TYPE_I32, a0, a1);
2477        }
2478        /* FALLTHRU */
2479    gen_shift:
2480        if (const_a2) {
2481            tcg_out_shifti(s, c + rexw, a0, a2);
2482        } else {
2483            tcg_out_modrm(s, OPC_SHIFT_cl + rexw, c, a0);
2484        }
2485        break;
2486
2487    OP_32_64(ctz):
2488        tcg_out_ctz(s, rexw, args[0], args[1], args[2], const_args[2]);
2489        break;
2490    OP_32_64(clz):
2491        tcg_out_clz(s, rexw, args[0], args[1], args[2], const_args[2]);
2492        break;
2493    OP_32_64(ctpop):
2494        tcg_out_modrm(s, OPC_POPCNT + rexw, a0, a1);
2495        break;
2496
2497    case INDEX_op_brcond_i32:
2498        tcg_out_brcond32(s, a2, a0, a1, const_args[1], arg_label(args[3]), 0);
2499        break;
2500    case INDEX_op_setcond_i32:
2501        tcg_out_setcond32(s, args[3], a0, a1, a2, const_a2);
2502        break;
2503    case INDEX_op_movcond_i32:
2504        tcg_out_movcond32(s, args[5], a0, a1, a2, const_a2, args[3]);
2505        break;
2506
2507    OP_32_64(bswap16):
2508        if (a2 & TCG_BSWAP_OS) {
2509            /* Output must be sign-extended. */
2510            if (rexw) {
2511                tcg_out_bswap64(s, a0);
2512                tcg_out_shifti(s, SHIFT_SAR + rexw, a0, 48);
2513            } else {
2514                tcg_out_bswap32(s, a0);
2515                tcg_out_shifti(s, SHIFT_SAR, a0, 16);
2516            }
2517        } else if ((a2 & (TCG_BSWAP_IZ | TCG_BSWAP_OZ)) == TCG_BSWAP_OZ) {
2518            /* Output must be zero-extended, but input isn't. */
2519            tcg_out_bswap32(s, a0);
2520            tcg_out_shifti(s, SHIFT_SHR, a0, 16);
2521        } else {
2522            tcg_out_rolw_8(s, a0);
2523        }
2524        break;
2525    OP_32_64(bswap32):
2526        tcg_out_bswap32(s, a0);
2527        if (rexw && (a2 & TCG_BSWAP_OS)) {
2528            tcg_out_ext32s(s, a0, a0);
2529        }
2530        break;
2531
2532    OP_32_64(neg):
2533        tcg_out_modrm(s, OPC_GRP3_Ev + rexw, EXT3_NEG, a0);
2534        break;
2535    OP_32_64(not):
2536        tcg_out_modrm(s, OPC_GRP3_Ev + rexw, EXT3_NOT, a0);
2537        break;
2538
2539    case INDEX_op_qemu_ld_i32:
2540        if (TCG_TARGET_REG_BITS >= TARGET_LONG_BITS) {
2541            tcg_out_qemu_ld(s, a0, -1, a1, -1, a2, TCG_TYPE_I32);
2542        } else {
2543            tcg_out_qemu_ld(s, a0, -1, a1, a2, args[3], TCG_TYPE_I32);
2544        }
2545        break;
2546    case INDEX_op_qemu_ld_i64:
2547        if (TCG_TARGET_REG_BITS == 64) {
2548            tcg_out_qemu_ld(s, a0, -1, a1, -1, a2, TCG_TYPE_I64);
2549        } else if (TARGET_LONG_BITS == 32) {
2550            tcg_out_qemu_ld(s, a0, a1, a2, -1, args[3], TCG_TYPE_I64);
2551        } else {
2552            tcg_out_qemu_ld(s, a0, a1, a2, args[3], args[4], TCG_TYPE_I64);
2553        }
2554        break;
2555    case INDEX_op_qemu_st_i32:
2556    case INDEX_op_qemu_st8_i32:
2557        if (TCG_TARGET_REG_BITS >= TARGET_LONG_BITS) {
2558            tcg_out_qemu_st(s, a0, -1, a1, -1, a2, TCG_TYPE_I32);
2559        } else {
2560            tcg_out_qemu_st(s, a0, -1, a1, a2, args[3], TCG_TYPE_I32);
2561        }
2562        break;
2563    case INDEX_op_qemu_st_i64:
2564        if (TCG_TARGET_REG_BITS == 64) {
2565            tcg_out_qemu_st(s, a0, -1, a1, -1, a2, TCG_TYPE_I64);
2566        } else if (TARGET_LONG_BITS == 32) {
2567            tcg_out_qemu_st(s, a0, a1, a2, -1, args[3], TCG_TYPE_I64);
2568        } else {
2569            tcg_out_qemu_st(s, a0, a1, a2, args[3], args[4], TCG_TYPE_I64);
2570        }
2571        break;
2572
2573    OP_32_64(mulu2):
2574        tcg_out_modrm(s, OPC_GRP3_Ev + rexw, EXT3_MUL, args[3]);
2575        break;
2576    OP_32_64(muls2):
2577        tcg_out_modrm(s, OPC_GRP3_Ev + rexw, EXT3_IMUL, args[3]);
2578        break;
2579    OP_32_64(add2):
2580        if (const_args[4]) {
2581            tgen_arithi(s, ARITH_ADD + rexw, a0, args[4], 1);
2582        } else {
2583            tgen_arithr(s, ARITH_ADD + rexw, a0, args[4]);
2584        }
2585        if (const_args[5]) {
2586            tgen_arithi(s, ARITH_ADC + rexw, a1, args[5], 1);
2587        } else {
2588            tgen_arithr(s, ARITH_ADC + rexw, a1, args[5]);
2589        }
2590        break;
2591    OP_32_64(sub2):
2592        if (const_args[4]) {
2593            tgen_arithi(s, ARITH_SUB + rexw, a0, args[4], 1);
2594        } else {
2595            tgen_arithr(s, ARITH_SUB + rexw, a0, args[4]);
2596        }
2597        if (const_args[5]) {
2598            tgen_arithi(s, ARITH_SBB + rexw, a1, args[5], 1);
2599        } else {
2600            tgen_arithr(s, ARITH_SBB + rexw, a1, args[5]);
2601        }
2602        break;
2603
2604#if TCG_TARGET_REG_BITS == 32
2605    case INDEX_op_brcond2_i32:
2606        tcg_out_brcond2(s, args, const_args, 0);
2607        break;
2608    case INDEX_op_setcond2_i32:
2609        tcg_out_setcond2(s, args, const_args);
2610        break;
2611#else /* TCG_TARGET_REG_BITS == 64 */
2612    case INDEX_op_ld32s_i64:
2613        tcg_out_modrm_offset(s, OPC_MOVSLQ, a0, a1, a2);
2614        break;
2615    case INDEX_op_ld_i64:
2616        tcg_out_ld(s, TCG_TYPE_I64, a0, a1, a2);
2617        break;
2618    case INDEX_op_st_i64:
2619        if (const_args[0]) {
2620            tcg_out_modrm_offset(s, OPC_MOVL_EvIz | P_REXW, 0, a1, a2);
2621            tcg_out32(s, a0);
2622        } else {
2623            tcg_out_st(s, TCG_TYPE_I64, a0, a1, a2);
2624        }
2625        break;
2626
2627    case INDEX_op_brcond_i64:
2628        tcg_out_brcond64(s, a2, a0, a1, const_args[1], arg_label(args[3]), 0);
2629        break;
2630    case INDEX_op_setcond_i64:
2631        tcg_out_setcond64(s, args[3], a0, a1, a2, const_a2);
2632        break;
2633    case INDEX_op_movcond_i64:
2634        tcg_out_movcond64(s, args[5], a0, a1, a2, const_a2, args[3]);
2635        break;
2636
2637    case INDEX_op_bswap64_i64:
2638        tcg_out_bswap64(s, a0);
2639        break;
2640    case INDEX_op_extrh_i64_i32:
2641        tcg_out_shifti(s, SHIFT_SHR + P_REXW, a0, 32);
2642        break;
2643#endif
2644
2645    OP_32_64(deposit):
2646        if (args[3] == 0 && args[4] == 8) {
2647            /* load bits 0..7 */
2648            tcg_out_modrm(s, OPC_MOVB_EvGv | P_REXB_R | P_REXB_RM, a2, a0);
2649        } else if (args[3] == 8 && args[4] == 8) {
2650            /* load bits 8..15 */
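            /* a0 + 4 encodes the matching high-byte register (%ah, %ch,
               %dh, %bh); the 'Q' constraint restricts a0 to a register
               that has one. */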
2651            tcg_out_modrm(s, OPC_MOVB_EvGv, a2, a0 + 4);
2652        } else if (args[3] == 0 && args[4] == 16) {
2653            /* load bits 0..15 */
2654            tcg_out_modrm(s, OPC_MOVL_EvGv | P_DATA16, a2, a0);
2655        } else {
2656            g_assert_not_reached();
2657        }
2658        break;
2659
2660    case INDEX_op_extract_i64:
2661        if (a2 + args[3] == 32) {
2662            /* This is a 32-bit zero-extending right shift.  */
2663            tcg_out_mov(s, TCG_TYPE_I32, a0, a1);
2664            tcg_out_shifti(s, SHIFT_SHR, a0, a2);
2665            break;
2666        }
2667        /* FALLTHRU */
2668    case INDEX_op_extract_i32:
2669        /* Use the high-byte registers if we can; otherwise emit the
2670           same ext16 + shift pattern that the normal tcg-op.c
2671           expansion would have produced.  */
2672        tcg_debug_assert(a2 == 8 && args[3] == 8);
2673        if (a1 < 4 && a0 < 8) {
2674            tcg_out_modrm(s, OPC_MOVZBL, a0, a1 + 4);
2675        } else {
2676            tcg_out_ext16u(s, a0, a1);
2677            tcg_out_shifti(s, SHIFT_SHR, a0, 8);
2678        }
2679        break;
2680
2681    case INDEX_op_sextract_i32:
2682        /* We don't implement sextract_i64, as we cannot sign-extend to
2683           64-bits without using the REX prefix that explicitly excludes
2684           access to the high-byte registers.  */
2685        tcg_debug_assert(a2 == 8 && args[3] == 8);
2686        if (a1 < 4 && a0 < 8) {
2687            tcg_out_modrm(s, OPC_MOVSBL, a0, a1 + 4);
2688        } else {
2689            tcg_out_ext16s(s, TCG_TYPE_I32, a0, a1);
2690            tcg_out_shifti(s, SHIFT_SAR, a0, 8);
2691        }
2692        break;
2693
2694    OP_32_64(extract2):
2695        /* Note that SHRD outputs to the r/m operand.  */
2696        tcg_out_modrm(s, OPC_SHRD_Ib + rexw, a2, a0);
2697        tcg_out8(s, args[3]);
2698        break;
2699
2700    case INDEX_op_mb:
2701        tcg_out_mb(s, a0);
2702        break;
2703    case INDEX_op_mov_i32:  /* Always emitted via tcg_out_mov.  */
2704    case INDEX_op_mov_i64:
2705    case INDEX_op_call:     /* Always emitted via tcg_out_call.  */
2706    case INDEX_op_exit_tb:  /* Always emitted via tcg_out_exit_tb.  */
2707    case INDEX_op_goto_tb:  /* Always emitted via tcg_out_goto_tb.  */
2708    case INDEX_op_ext8s_i32:  /* Always emitted via tcg_reg_alloc_op.  */
2709    case INDEX_op_ext8s_i64:
2710    case INDEX_op_ext8u_i32:
2711    case INDEX_op_ext8u_i64:
2712    case INDEX_op_ext16s_i32:
2713    case INDEX_op_ext16s_i64:
2714    case INDEX_op_ext16u_i32:
2715    case INDEX_op_ext16u_i64:
2716    case INDEX_op_ext32s_i64:
2717    case INDEX_op_ext32u_i64:
2718    case INDEX_op_ext_i32_i64:
2719    case INDEX_op_extu_i32_i64:
2720    case INDEX_op_extrl_i64_i32:
2721    default:
2722        g_assert_not_reached();
2723    }
2724
2725#undef OP_32_64
2726}
2727
2728static void tcg_out_vec_op(TCGContext *s, TCGOpcode opc,
2729                           unsigned vecl, unsigned vece,
2730                           const TCGArg args[TCG_MAX_OP_ARGS],
2731                           const int const_args[TCG_MAX_OP_ARGS])
2732{
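    /* In the tables below, OPC_UD2 marks element sizes with no direct
       instruction; such cases must have been expanded or rejected earlier,
       which the tcg_debug_assert calls below guard against. */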
2733    static int const add_insn[4] = {
2734        OPC_PADDB, OPC_PADDW, OPC_PADDD, OPC_PADDQ
2735    };
2736    static int const ssadd_insn[4] = {
2737        OPC_PADDSB, OPC_PADDSW, OPC_UD2, OPC_UD2
2738    };
2739    static int const usadd_insn[4] = {
2740        OPC_PADDUB, OPC_PADDUW, OPC_UD2, OPC_UD2
2741    };
2742    static int const sub_insn[4] = {
2743        OPC_PSUBB, OPC_PSUBW, OPC_PSUBD, OPC_PSUBQ
2744    };
2745    static int const sssub_insn[4] = {
2746        OPC_PSUBSB, OPC_PSUBSW, OPC_UD2, OPC_UD2
2747    };
2748    static int const ussub_insn[4] = {
2749        OPC_PSUBUB, OPC_PSUBUW, OPC_UD2, OPC_UD2
2750    };
2751    static int const mul_insn[4] = {
2752        OPC_UD2, OPC_PMULLW, OPC_PMULLD, OPC_VPMULLQ
2753    };
2754    static int const shift_imm_insn[4] = {
2755        OPC_UD2, OPC_PSHIFTW_Ib, OPC_PSHIFTD_Ib, OPC_PSHIFTQ_Ib
2756    };
2757    static int const cmpeq_insn[4] = {
2758        OPC_PCMPEQB, OPC_PCMPEQW, OPC_PCMPEQD, OPC_PCMPEQQ
2759    };
2760    static int const cmpgt_insn[4] = {
2761        OPC_PCMPGTB, OPC_PCMPGTW, OPC_PCMPGTD, OPC_PCMPGTQ
2762    };
2763    static int const punpckl_insn[4] = {
2764        OPC_PUNPCKLBW, OPC_PUNPCKLWD, OPC_PUNPCKLDQ, OPC_PUNPCKLQDQ
2765    };
2766    static int const punpckh_insn[4] = {
2767        OPC_PUNPCKHBW, OPC_PUNPCKHWD, OPC_PUNPCKHDQ, OPC_PUNPCKHQDQ
2768    };
2769    static int const packss_insn[4] = {
2770        OPC_PACKSSWB, OPC_PACKSSDW, OPC_UD2, OPC_UD2
2771    };
2772    static int const packus_insn[4] = {
2773        OPC_PACKUSWB, OPC_PACKUSDW, OPC_UD2, OPC_UD2
2774    };
2775    static int const smin_insn[4] = {
2776        OPC_PMINSB, OPC_PMINSW, OPC_PMINSD, OPC_VPMINSQ
2777    };
2778    static int const smax_insn[4] = {
2779        OPC_PMAXSB, OPC_PMAXSW, OPC_PMAXSD, OPC_VPMAXSQ
2780    };
2781    static int const umin_insn[4] = {
2782        OPC_PMINUB, OPC_PMINUW, OPC_PMINUD, OPC_VPMINUQ
2783    };
2784    static int const umax_insn[4] = {
2785        OPC_PMAXUB, OPC_PMAXUW, OPC_PMAXUD, OPC_VPMAXUQ
2786    };
2787    static int const rotlv_insn[4] = {
2788        OPC_UD2, OPC_UD2, OPC_VPROLVD, OPC_VPROLVQ
2789    };
2790    static int const rotrv_insn[4] = {
2791        OPC_UD2, OPC_UD2, OPC_VPRORVD, OPC_VPRORVQ
2792    };
2793    static int const shlv_insn[4] = {
2794        OPC_UD2, OPC_VPSLLVW, OPC_VPSLLVD, OPC_VPSLLVQ
2795    };
2796    static int const shrv_insn[4] = {
2797        OPC_UD2, OPC_VPSRLVW, OPC_VPSRLVD, OPC_VPSRLVQ
2798    };
2799    static int const sarv_insn[4] = {
2800        OPC_UD2, OPC_VPSRAVW, OPC_VPSRAVD, OPC_VPSRAVQ
2801    };
2802    static int const shls_insn[4] = {
2803        OPC_UD2, OPC_PSLLW, OPC_PSLLD, OPC_PSLLQ
2804    };
2805    static int const shrs_insn[4] = {
2806        OPC_UD2, OPC_PSRLW, OPC_PSRLD, OPC_PSRLQ
2807    };
2808    static int const sars_insn[4] = {
2809        OPC_UD2, OPC_PSRAW, OPC_PSRAD, OPC_VPSRAQ
2810    };
2811    static int const vpshldi_insn[4] = {
2812        OPC_UD2, OPC_VPSHLDW, OPC_VPSHLDD, OPC_VPSHLDQ
2813    };
2814    static int const vpshldv_insn[4] = {
2815        OPC_UD2, OPC_VPSHLDVW, OPC_VPSHLDVD, OPC_VPSHLDVQ
2816    };
2817    static int const vpshrdv_insn[4] = {
2818        OPC_UD2, OPC_VPSHRDVW, OPC_VPSHRDVD, OPC_VPSHRDVQ
2819    };
2820    static int const abs_insn[4] = {
2821        OPC_PABSB, OPC_PABSW, OPC_PABSD, OPC_VPABSQ
2822    };
2823
2824    TCGType type = vecl + TCG_TYPE_V64;
2825    int insn, sub;
2826    TCGArg a0, a1, a2, a3;
2827
2828    a0 = args[0];
2829    a1 = args[1];
2830    a2 = args[2];
2831
2832    switch (opc) {
2833    case INDEX_op_add_vec:
2834        insn = add_insn[vece];
2835        goto gen_simd;
2836    case INDEX_op_ssadd_vec:
2837        insn = ssadd_insn[vece];
2838        goto gen_simd;
2839    case INDEX_op_usadd_vec:
2840        insn = usadd_insn[vece];
2841        goto gen_simd;
2842    case INDEX_op_sub_vec:
2843        insn = sub_insn[vece];
2844        goto gen_simd;
2845    case INDEX_op_sssub_vec:
2846        insn = sssub_insn[vece];
2847        goto gen_simd;
2848    case INDEX_op_ussub_vec:
2849        insn = ussub_insn[vece];
2850        goto gen_simd;
2851    case INDEX_op_mul_vec:
2852        insn = mul_insn[vece];
2853        goto gen_simd;
2854    case INDEX_op_and_vec:
2855        insn = OPC_PAND;
2856        goto gen_simd;
2857    case INDEX_op_or_vec:
2858        insn = OPC_POR;
2859        goto gen_simd;
2860    case INDEX_op_xor_vec:
2861        insn = OPC_PXOR;
2862        goto gen_simd;
2863    case INDEX_op_smin_vec:
2864        insn = smin_insn[vece];
2865        goto gen_simd;
2866    case INDEX_op_umin_vec:
2867        insn = umin_insn[vece];
2868        goto gen_simd;
2869    case INDEX_op_smax_vec:
2870        insn = smax_insn[vece];
2871        goto gen_simd;
2872    case INDEX_op_umax_vec:
2873        insn = umax_insn[vece];
2874        goto gen_simd;
2875    case INDEX_op_shlv_vec:
2876        insn = shlv_insn[vece];
2877        goto gen_simd;
2878    case INDEX_op_shrv_vec:
2879        insn = shrv_insn[vece];
2880        goto gen_simd;
2881    case INDEX_op_sarv_vec:
2882        insn = sarv_insn[vece];
2883        goto gen_simd;
2884    case INDEX_op_rotlv_vec:
2885        insn = rotlv_insn[vece];
2886        goto gen_simd;
2887    case INDEX_op_rotrv_vec:
2888        insn = rotrv_insn[vece];
2889        goto gen_simd;
2890    case INDEX_op_shls_vec:
2891        insn = shls_insn[vece];
2892        goto gen_simd;
2893    case INDEX_op_shrs_vec:
2894        insn = shrs_insn[vece];
2895        goto gen_simd;
2896    case INDEX_op_sars_vec:
2897        insn = sars_insn[vece];
2898        goto gen_simd;
2899    case INDEX_op_x86_punpckl_vec:
2900        insn = punpckl_insn[vece];
2901        goto gen_simd;
2902    case INDEX_op_x86_punpckh_vec:
2903        insn = punpckh_insn[vece];
2904        goto gen_simd;
2905    case INDEX_op_x86_packss_vec:
2906        insn = packss_insn[vece];
2907        goto gen_simd;
2908    case INDEX_op_x86_packus_vec:
2909        insn = packus_insn[vece];
2910        goto gen_simd;
2911    case INDEX_op_x86_vpshldv_vec:
2912        insn = vpshldv_insn[vece];
2913        a1 = a2;
2914        a2 = args[3];
2915        goto gen_simd;
2916    case INDEX_op_x86_vpshrdv_vec:
2917        insn = vpshrdv_insn[vece];
2918        a1 = a2;
2919        a2 = args[3];
2920        goto gen_simd;
2921#if TCG_TARGET_REG_BITS == 32
2922    case INDEX_op_dup2_vec:
2923        /* First merge the two 32-bit inputs to a single 64-bit element. */
2924        tcg_out_vex_modrm(s, OPC_PUNPCKLDQ, a0, a1, a2);
2925        /* Then replicate the 64-bit elements across the rest of the vector. */
2926        if (type != TCG_TYPE_V64) {
2927            tcg_out_dup_vec(s, type, MO_64, a0, a0);
2928        }
2929        break;
2930#endif
2931    case INDEX_op_abs_vec:
2932        insn = abs_insn[vece];
2933        a2 = a1;
2934        a1 = 0;
2935        goto gen_simd;
2936    gen_simd:
2937        tcg_debug_assert(insn != OPC_UD2);
2938        if (type == TCG_TYPE_V256) {
2939            insn |= P_VEXL;
2940        }
2941        tcg_out_vex_modrm(s, insn, a0, a1, a2);
2942        break;
2943
2944    case INDEX_op_cmp_vec:
2945        sub = args[3];
2946        if (sub == TCG_COND_EQ) {
2947            insn = cmpeq_insn[vece];
2948        } else if (sub == TCG_COND_GT) {
2949            insn = cmpgt_insn[vece];
2950        } else {
2951            g_assert_not_reached();
2952        }
2953        goto gen_simd;
2954
2955    case INDEX_op_andc_vec:
2956        insn = OPC_PANDN;
2957        if (type == TCG_TYPE_V256) {
2958            insn |= P_VEXL;
2959        }
2960        tcg_out_vex_modrm(s, insn, a0, a2, a1);
2961        break;
2962
2963    case INDEX_op_shli_vec:
2964        insn = shift_imm_insn[vece];
2965        sub = 6;
2966        goto gen_shift;
2967    case INDEX_op_shri_vec:
2968        insn = shift_imm_insn[vece];
2969        sub = 2;
2970        goto gen_shift;
2971    case INDEX_op_sari_vec:
2972        if (vece == MO_64) {
2973            insn = OPC_PSHIFTD_Ib | P_VEXW | P_EVEX;
2974        } else {
2975            insn = shift_imm_insn[vece];
2976        }
2977        sub = 4;
2978        goto gen_shift;
2979    case INDEX_op_rotli_vec:
2980        insn = OPC_PSHIFTD_Ib | P_EVEX;  /* VPROL[DQ] */
2981        if (vece == MO_64) {
2982            insn |= P_VEXW;
2983        }
2984        sub = 1;
2985        goto gen_shift;
2986    gen_shift:
2987        tcg_debug_assert(vece != MO_8);
2988        if (type == TCG_TYPE_V256) {
2989            insn |= P_VEXL;
2990        }
2991        tcg_out_vex_modrm(s, insn, sub, a0, a1);
2992        tcg_out8(s, a2);
2993        break;
2994
2995    case INDEX_op_ld_vec:
2996        tcg_out_ld(s, type, a0, a1, a2);
2997        break;
2998    case INDEX_op_st_vec:
2999        tcg_out_st(s, type, a0, a1, a2);
3000        break;
3001    case INDEX_op_dupm_vec:
3002        tcg_out_dupm_vec(s, type, vece, a0, a1, a2);
3003        break;
3004
3005    case INDEX_op_x86_shufps_vec:
3006        insn = OPC_SHUFPS;
3007        sub = args[3];
3008        goto gen_simd_imm8;
3009    case INDEX_op_x86_blend_vec:
3010        if (vece == MO_16) {
3011            insn = OPC_PBLENDW;
3012        } else if (vece == MO_32) {
3013            insn = (have_avx2 ? OPC_VPBLENDD : OPC_BLENDPS);
3014        } else {
3015            g_assert_not_reached();
3016        }
3017        sub = args[3];
3018        goto gen_simd_imm8;
3019    case INDEX_op_x86_vperm2i128_vec:
3020        insn = OPC_VPERM2I128;
3021        sub = args[3];
3022        goto gen_simd_imm8;
3023    case INDEX_op_x86_vpshldi_vec:
3024        insn = vpshldi_insn[vece];
3025        sub = args[3];
3026        goto gen_simd_imm8;
3027
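    /*
     * The VPTERNLOGQ immediate is a 3-input truth table, indexed by
     * (A << 2) | (B << 1) | C, where A is the destination and B, C the
     * sources; e.g. 0x33 sets the result exactly where B is 0 (!B).
     */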
3028    case INDEX_op_not_vec:
3029        insn = OPC_VPTERNLOGQ;
3030        a2 = a1;
3031        sub = 0x33; /* !B */
3032        goto gen_simd_imm8;
3033    case INDEX_op_nor_vec:
3034        insn = OPC_VPTERNLOGQ;
3035        sub = 0x11; /* norCB */
3036        goto gen_simd_imm8;
3037    case INDEX_op_nand_vec:
3038        insn = OPC_VPTERNLOGQ;
3039        sub = 0x77; /* nandCB */
3040        goto gen_simd_imm8;
3041    case INDEX_op_eqv_vec:
3042        insn = OPC_VPTERNLOGQ;
3043        sub = 0x99; /* xnorCB */
3044        goto gen_simd_imm8;
3045    case INDEX_op_orc_vec:
3046        insn = OPC_VPTERNLOGQ;
3047        sub = 0xdd; /* orB!C */
3048        goto gen_simd_imm8;
3049
3050    case INDEX_op_bitsel_vec:
3051        insn = OPC_VPTERNLOGQ;
3052        a3 = args[3];
3053        if (a0 == a1) {
3054            a1 = a2;
3055            a2 = a3;
3056            sub = 0xca; /* A?B:C */
3057        } else if (a0 == a2) {
3058            a2 = a3;
3059            sub = 0xe2; /* B?A:C */
3060        } else {
3061            tcg_out_mov(s, type, a0, a3);
3062            sub = 0xb8; /* B?C:A */
3063        }
3064        goto gen_simd_imm8;
3065
3066    gen_simd_imm8:
3067        tcg_debug_assert(insn != OPC_UD2);
3068        if (type == TCG_TYPE_V256) {
3069            insn |= P_VEXL;
3070        }
3071        tcg_out_vex_modrm(s, insn, a0, a1, a2);
3072        tcg_out8(s, sub);
3073        break;
3074
3075    case INDEX_op_x86_vpblendvb_vec:
3076        insn = OPC_VPBLENDVB;
3077        if (type == TCG_TYPE_V256) {
3078            insn |= P_VEXL;
3079        }
3080        tcg_out_vex_modrm(s, insn, a0, a1, a2);
3081        tcg_out8(s, args[3] << 4);
3082        break;
3083
3084    case INDEX_op_x86_psrldq_vec:
3085        tcg_out_vex_modrm(s, OPC_GRP14, 3, a0, a1);
3086        tcg_out8(s, a2);
3087        break;
3088
3089    case INDEX_op_mov_vec:  /* Always emitted via tcg_out_mov.  */
3090    case INDEX_op_dup_vec:  /* Always emitted via tcg_out_dup_vec.  */
3091    default:
3092        g_assert_not_reached();
3093    }
3094}
3095
3096static TCGConstraintSetIndex tcg_target_op_def(TCGOpcode op)
3097{
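    /*
     * Constraint letters are defined in tcg-target-con-str.h: 'r' any
     * integer register, 'x' any vector register, 'q' a register with a
     * low-byte encoding, 'Q' one with a high byte, 'L' a register that
     * survives the softmmu slow path, 'a'/'c'/'d' fixed %eax/%ecx/%edx,
     * 's' a byte-addressable qemu_st8 data register; '0'/'1' tie an
     * input to the matching output; 'i' any immediate, 'e'/'Z' a
     * sign-/zero-extended 32-bit immediate, 'I' an immediate whose
     * complement fits in 32 bits, 'W' the operation width.
     */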
3098    switch (op) {
3099    case INDEX_op_goto_ptr:
3100        return C_O0_I1(r);
3101
3102    case INDEX_op_ld8u_i32:
3103    case INDEX_op_ld8u_i64:
3104    case INDEX_op_ld8s_i32:
3105    case INDEX_op_ld8s_i64:
3106    case INDEX_op_ld16u_i32:
3107    case INDEX_op_ld16u_i64:
3108    case INDEX_op_ld16s_i32:
3109    case INDEX_op_ld16s_i64:
3110    case INDEX_op_ld_i32:
3111    case INDEX_op_ld32u_i64:
3112    case INDEX_op_ld32s_i64:
3113    case INDEX_op_ld_i64:
3114        return C_O1_I1(r, r);
3115
3116    case INDEX_op_st8_i32:
3117    case INDEX_op_st8_i64:
3118        return C_O0_I2(qi, r);
3119
3120    case INDEX_op_st16_i32:
3121    case INDEX_op_st16_i64:
3122    case INDEX_op_st_i32:
3123    case INDEX_op_st32_i64:
3124        return C_O0_I2(ri, r);
3125
3126    case INDEX_op_st_i64:
3127        return C_O0_I2(re, r);
3128
3129    case INDEX_op_add_i32:
3130    case INDEX_op_add_i64:
3131        return C_O1_I2(r, r, re);
3132
3133    case INDEX_op_sub_i32:
3134    case INDEX_op_sub_i64:
3135    case INDEX_op_mul_i32:
3136    case INDEX_op_mul_i64:
3137    case INDEX_op_or_i32:
3138    case INDEX_op_or_i64:
3139    case INDEX_op_xor_i32:
3140    case INDEX_op_xor_i64:
3141        return C_O1_I2(r, 0, re);
3142
3143    case INDEX_op_and_i32:
3144    case INDEX_op_and_i64:
3145        return C_O1_I2(r, 0, reZ);
3146
3147    case INDEX_op_andc_i32:
3148    case INDEX_op_andc_i64:
3149        return C_O1_I2(r, r, rI);
3150
3151    case INDEX_op_shl_i32:
3152    case INDEX_op_shl_i64:
3153    case INDEX_op_shr_i32:
3154    case INDEX_op_shr_i64:
3155    case INDEX_op_sar_i32:
3156    case INDEX_op_sar_i64:
3157        return have_bmi2 ? C_O1_I2(r, r, ri) : C_O1_I2(r, 0, ci);
3158
3159    case INDEX_op_rotl_i32:
3160    case INDEX_op_rotl_i64:
3161    case INDEX_op_rotr_i32:
3162    case INDEX_op_rotr_i64:
3163        return C_O1_I2(r, 0, ci);
3164
3165    case INDEX_op_brcond_i32:
3166    case INDEX_op_brcond_i64:
3167        return C_O0_I2(r, re);
3168
3169    case INDEX_op_bswap16_i32:
3170    case INDEX_op_bswap16_i64:
3171    case INDEX_op_bswap32_i32:
3172    case INDEX_op_bswap32_i64:
3173    case INDEX_op_bswap64_i64:
3174    case INDEX_op_neg_i32:
3175    case INDEX_op_neg_i64:
3176    case INDEX_op_not_i32:
3177    case INDEX_op_not_i64:
3178    case INDEX_op_extrh_i64_i32:
3179        return C_O1_I1(r, 0);
3180
3181    case INDEX_op_ext8s_i32:
3182    case INDEX_op_ext8s_i64:
3183    case INDEX_op_ext8u_i32:
3184    case INDEX_op_ext8u_i64:
3185        return C_O1_I1(r, q);
3186
3187    case INDEX_op_ext16s_i32:
3188    case INDEX_op_ext16s_i64:
3189    case INDEX_op_ext16u_i32:
3190    case INDEX_op_ext16u_i64:
3191    case INDEX_op_ext32s_i64:
3192    case INDEX_op_ext32u_i64:
3193    case INDEX_op_ext_i32_i64:
3194    case INDEX_op_extu_i32_i64:
3195    case INDEX_op_extrl_i64_i32:
3196    case INDEX_op_extract_i32:
3197    case INDEX_op_extract_i64:
3198    case INDEX_op_sextract_i32:
3199    case INDEX_op_ctpop_i32:
3200    case INDEX_op_ctpop_i64:
3201        return C_O1_I1(r, r);
3202
3203    case INDEX_op_extract2_i32:
3204    case INDEX_op_extract2_i64:
3205        return C_O1_I2(r, 0, r);
3206
3207    case INDEX_op_deposit_i32:
3208    case INDEX_op_deposit_i64:
3209        return C_O1_I2(Q, 0, Q);
3210
3211    case INDEX_op_setcond_i32:
3212    case INDEX_op_setcond_i64:
3213        return C_O1_I2(q, r, re);
3214
3215    case INDEX_op_movcond_i32:
3216    case INDEX_op_movcond_i64:
3217        return C_O1_I4(r, r, re, r, 0);
3218
3219    case INDEX_op_div2_i32:
3220    case INDEX_op_div2_i64:
3221    case INDEX_op_divu2_i32:
3222    case INDEX_op_divu2_i64:
3223        return C_O2_I3(a, d, 0, 1, r);
3224
3225    case INDEX_op_mulu2_i32:
3226    case INDEX_op_mulu2_i64:
3227    case INDEX_op_muls2_i32:
3228    case INDEX_op_muls2_i64:
3229        return C_O2_I2(a, d, a, r);
3230
3231    case INDEX_op_add2_i32:
3232    case INDEX_op_add2_i64:
3233    case INDEX_op_sub2_i32:
3234    case INDEX_op_sub2_i64:
3235        return C_O2_I4(r, r, 0, 1, re, re);
3236
3237    case INDEX_op_ctz_i32:
3238    case INDEX_op_ctz_i64:
3239        return have_bmi1 ? C_N1_I2(r, r, rW) : C_N1_I2(r, r, r);
3240
3241    case INDEX_op_clz_i32:
3242    case INDEX_op_clz_i64:
3243        return have_lzcnt ? C_N1_I2(r, r, rW) : C_N1_I2(r, r, r);
3244
3245    case INDEX_op_qemu_ld_i32:
3246        return (TARGET_LONG_BITS <= TCG_TARGET_REG_BITS
3247                ? C_O1_I1(r, L) : C_O1_I2(r, L, L));
3248
3249    case INDEX_op_qemu_st_i32:
3250        return (TARGET_LONG_BITS <= TCG_TARGET_REG_BITS
3251                ? C_O0_I2(L, L) : C_O0_I3(L, L, L));
3252    case INDEX_op_qemu_st8_i32:
3253        return (TARGET_LONG_BITS <= TCG_TARGET_REG_BITS
3254                ? C_O0_I2(s, L) : C_O0_I3(s, L, L));
3255
3256    case INDEX_op_qemu_ld_i64:
3257        return (TCG_TARGET_REG_BITS == 64 ? C_O1_I1(r, L)
3258                : TARGET_LONG_BITS <= TCG_TARGET_REG_BITS ? C_O2_I1(r, r, L)
3259                : C_O2_I2(r, r, L, L));
3260
3261    case INDEX_op_qemu_st_i64:
3262        return (TCG_TARGET_REG_BITS == 64 ? C_O0_I2(L, L)
3263                : TARGET_LONG_BITS <= TCG_TARGET_REG_BITS ? C_O0_I3(L, L, L)
3264                : C_O0_I4(L, L, L, L));
3265
3266    case INDEX_op_brcond2_i32:
3267        return C_O0_I4(r, r, ri, ri);
3268
3269    case INDEX_op_setcond2_i32:
3270        return C_O1_I4(r, r, r, ri, ri);
3271
3272    case INDEX_op_ld_vec:
3273    case INDEX_op_dupm_vec:
3274        return C_O1_I1(x, r);
3275
3276    case INDEX_op_st_vec:
3277        return C_O0_I2(x, r);
3278
3279    case INDEX_op_add_vec:
3280    case INDEX_op_sub_vec:
3281    case INDEX_op_mul_vec:
3282    case INDEX_op_and_vec:
3283    case INDEX_op_or_vec:
3284    case INDEX_op_xor_vec:
3285    case INDEX_op_andc_vec:
3286    case INDEX_op_orc_vec:
3287    case INDEX_op_nand_vec:
3288    case INDEX_op_nor_vec:
3289    case INDEX_op_eqv_vec:
3290    case INDEX_op_ssadd_vec:
3291    case INDEX_op_usadd_vec:
3292    case INDEX_op_sssub_vec:
3293    case INDEX_op_ussub_vec:
3294    case INDEX_op_smin_vec:
3295    case INDEX_op_umin_vec:
3296    case INDEX_op_smax_vec:
3297    case INDEX_op_umax_vec:
3298    case INDEX_op_shlv_vec:
3299    case INDEX_op_shrv_vec:
3300    case INDEX_op_sarv_vec:
3301    case INDEX_op_rotlv_vec:
3302    case INDEX_op_rotrv_vec:
3303    case INDEX_op_shls_vec:
3304    case INDEX_op_shrs_vec:
3305    case INDEX_op_sars_vec:
3306    case INDEX_op_cmp_vec:
3307    case INDEX_op_x86_shufps_vec:
3308    case INDEX_op_x86_blend_vec:
3309    case INDEX_op_x86_packss_vec:
3310    case INDEX_op_x86_packus_vec:
3311    case INDEX_op_x86_vperm2i128_vec:
3312    case INDEX_op_x86_punpckl_vec:
3313    case INDEX_op_x86_punpckh_vec:
3314    case INDEX_op_x86_vpshldi_vec:
3315#if TCG_TARGET_REG_BITS == 32
3316    case INDEX_op_dup2_vec:
3317#endif
3318        return C_O1_I2(x, x, x);
3319
3320    case INDEX_op_abs_vec:
3321    case INDEX_op_dup_vec:
3322    case INDEX_op_not_vec:
3323    case INDEX_op_shli_vec:
3324    case INDEX_op_shri_vec:
3325    case INDEX_op_sari_vec:
3326    case INDEX_op_rotli_vec:
3327    case INDEX_op_x86_psrldq_vec:
3328        return C_O1_I1(x, x);
3329
3330    case INDEX_op_x86_vpshldv_vec:
3331    case INDEX_op_x86_vpshrdv_vec:
3332        return C_O1_I3(x, 0, x, x);
3333
3334    case INDEX_op_bitsel_vec:
3335    case INDEX_op_x86_vpblendvb_vec:
3336        return C_O1_I3(x, x, x, x);
3337
3338    default:
3339        g_assert_not_reached();
3340    }
3341}
3342
3343int tcg_can_emit_vec_op(TCGOpcode opc, TCGType type, unsigned vece)
3344{
3345    switch (opc) {
3346    case INDEX_op_add_vec:
3347    case INDEX_op_sub_vec:
3348    case INDEX_op_and_vec:
3349    case INDEX_op_or_vec:
3350    case INDEX_op_xor_vec:
3351    case INDEX_op_andc_vec:
3352    case INDEX_op_orc_vec:
3353    case INDEX_op_nand_vec:
3354    case INDEX_op_nor_vec:
3355    case INDEX_op_eqv_vec:
3356    case INDEX_op_not_vec:
3357    case INDEX_op_bitsel_vec:
3358        return 1;
3359    case INDEX_op_cmp_vec:
3360    case INDEX_op_cmpsel_vec:
3361        return -1;
3362
3363    case INDEX_op_rotli_vec:
3364        return have_avx512vl && vece >= MO_32 ? 1 : -1;
3365
3366    case INDEX_op_shli_vec:
3367    case INDEX_op_shri_vec:
3368        /* We must expand the operation for MO_8.  */
3369        return vece == MO_8 ? -1 : 1;
3370
3371    case INDEX_op_sari_vec:
3372        switch (vece) {
3373        case MO_8:
3374            return -1;
3375        case MO_16:
3376        case MO_32:
3377            return 1;
3378        case MO_64:
3379            if (have_avx512vl) {
3380                return 1;
3381            }
3382            /*
3383             * We can emulate this for MO_64, but it does not pay off
3384             * unless we're producing at least 4 values.
3385             */
3386            return type >= TCG_TYPE_V256 ? -1 : 0;
3387        }
3388        return 0;
3389
3390    case INDEX_op_shls_vec:
3391    case INDEX_op_shrs_vec:
3392        return vece >= MO_16;
3393    case INDEX_op_sars_vec:
3394        switch (vece) {
3395        case MO_16:
3396        case MO_32:
3397            return 1;
3398        case MO_64:
3399            return have_avx512vl;
3400        }
3401        return 0;
3402    case INDEX_op_rotls_vec:
3403        return vece >= MO_16 ? -1 : 0;
3404
3405    case INDEX_op_shlv_vec:
3406    case INDEX_op_shrv_vec:
3407        switch (vece) {
3408        case MO_16:
3409            return have_avx512bw;
3410        case MO_32:
3411        case MO_64:
3412            return have_avx2;
3413        }
3414        return 0;
3415    case INDEX_op_sarv_vec:
3416        switch (vece) {
3417        case MO_16:
3418            return have_avx512bw;
3419        case MO_32:
3420            return have_avx2;
3421        case MO_64:
3422            return have_avx512vl;
3423        }
3424        return 0;
3425    case INDEX_op_rotlv_vec:
3426    case INDEX_op_rotrv_vec:
3427        switch (vece) {
3428        case MO_16:
3429            return have_avx512vbmi2 ? -1 : 0;
3430        case MO_32:
3431        case MO_64:
3432            return have_avx512vl ? 1 : have_avx2 ? -1 : 0;
3433        }
3434        return 0;
3435
3436    case INDEX_op_mul_vec:
3437        switch (vece) {
3438        case MO_8:
3439            return -1;
3440        case MO_64:
3441            return have_avx512dq;
3442        }
3443        return 1;
3444
3445    case INDEX_op_ssadd_vec:
3446    case INDEX_op_usadd_vec:
3447    case INDEX_op_sssub_vec:
3448    case INDEX_op_ussub_vec:
3449        return vece <= MO_16;
3450    case INDEX_op_smin_vec:
3451    case INDEX_op_smax_vec:
3452    case INDEX_op_umin_vec:
3453    case INDEX_op_umax_vec:
3454    case INDEX_op_abs_vec:
3455        return vece <= MO_32 || have_avx512vl;
3456
3457    default:
3458        return 0;
3459    }
3460}
3461
3462static void expand_vec_shi(TCGType type, unsigned vece, TCGOpcode opc,
3463                           TCGv_vec v0, TCGv_vec v1, TCGArg imm)
3464{
3465    TCGv_vec t1, t2;
3466
3467    tcg_debug_assert(vece == MO_8);
3468
3469    t1 = tcg_temp_new_vec(type);
3470    t2 = tcg_temp_new_vec(type);
3471
3472    /*
3473     * Unpack to W, shift, and repack.  Tricky bits:
3474     * (1) Use punpck*bw x,x to produce DDCCBBAA,
3475     *     i.e. duplicate in other half of the 16-bit lane.
3476     * (2) For right-shift, add 8 so that the high half of the lane
3477     *     becomes zero.  For left-shift and left-rotate, we must
3478     *     shift up and down again.
3479     * (3) Step 2 leaves high half zero such that PACKUSWB
3480     *     (pack with unsigned saturation) does not modify
3481     *     the quantity.
3482     */
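    /*
     * For example, rotating the byte 0x81 left by 3: the 16-bit lane
     * holds 0x8181, the left shift by 3 leaves 0x0c08, and the right
     * shift by 8 leaves 0x0c == rol8(0x81, 3).
     */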
3483    vec_gen_3(INDEX_op_x86_punpckl_vec, type, MO_8,
3484              tcgv_vec_arg(t1), tcgv_vec_arg(v1), tcgv_vec_arg(v1));
3485    vec_gen_3(INDEX_op_x86_punpckh_vec, type, MO_8,
3486              tcgv_vec_arg(t2), tcgv_vec_arg(v1), tcgv_vec_arg(v1));
3487
3488    if (opc != INDEX_op_rotli_vec) {
3489        imm += 8;
3490    }
3491    if (opc == INDEX_op_shri_vec) {
3492        tcg_gen_shri_vec(MO_16, t1, t1, imm);
3493        tcg_gen_shri_vec(MO_16, t2, t2, imm);
3494    } else {
3495        tcg_gen_shli_vec(MO_16, t1, t1, imm);
3496        tcg_gen_shli_vec(MO_16, t2, t2, imm);
3497        tcg_gen_shri_vec(MO_16, t1, t1, 8);
3498        tcg_gen_shri_vec(MO_16, t2, t2, 8);
3499    }
3500
3501    vec_gen_3(INDEX_op_x86_packus_vec, type, MO_8,
3502              tcgv_vec_arg(v0), tcgv_vec_arg(t1), tcgv_vec_arg(t2));
3503    tcg_temp_free_vec(t1);
3504    tcg_temp_free_vec(t2);
3505}
3506
3507static void expand_vec_sari(TCGType type, unsigned vece,
3508                            TCGv_vec v0, TCGv_vec v1, TCGArg imm)
3509{
3510    TCGv_vec t1, t2;
3511
3512    switch (vece) {
3513    case MO_8:
3514        /* Unpack to W, shift, and repack, as in expand_vec_shi.  */
3515        t1 = tcg_temp_new_vec(type);
3516        t2 = tcg_temp_new_vec(type);
3517        vec_gen_3(INDEX_op_x86_punpckl_vec, type, MO_8,
3518                  tcgv_vec_arg(t1), tcgv_vec_arg(v1), tcgv_vec_arg(v1));
3519        vec_gen_3(INDEX_op_x86_punpckh_vec, type, MO_8,
3520                  tcgv_vec_arg(t2), tcgv_vec_arg(v1), tcgv_vec_arg(v1));
3521        tcg_gen_sari_vec(MO_16, t1, t1, imm + 8);
3522        tcg_gen_sari_vec(MO_16, t2, t2, imm + 8);
3523        vec_gen_3(INDEX_op_x86_packss_vec, type, MO_8,
3524                  tcgv_vec_arg(v0), tcgv_vec_arg(t1), tcgv_vec_arg(t2));
3525        tcg_temp_free_vec(t1);
3526        tcg_temp_free_vec(t2);
3527        break;
3528
3529    case MO_64:
3530        t1 = tcg_temp_new_vec(type);
3531        if (imm <= 32) {
3532            /*
3533             * We can emulate a small sign extend by performing an arithmetic
3534             * 32-bit shift and overwriting the high half of a 64-bit logical
3535             * shift.  Note that the ISA says shift of 32 is valid, but TCG
3536             * does not, so we have to bound the smaller shift -- we get the
3537             * same result in the high half either way.
3538             */
3539            tcg_gen_sari_vec(MO_32, t1, v1, MIN(imm, 31));
3540            tcg_gen_shri_vec(MO_64, v0, v1, imm);
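            /*
             * Blend control 0xaa = 0b10101010 selects the odd 32-bit
             * elements, i.e. the high half of each 64-bit lane, from t1.
             */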
3541            vec_gen_4(INDEX_op_x86_blend_vec, type, MO_32,
3542                      tcgv_vec_arg(v0), tcgv_vec_arg(v0),
3543                      tcgv_vec_arg(t1), 0xaa);
3544        } else {
3545            /* Otherwise we will need to use a compare vs 0 to produce
3546             * the sign-extend, shift and merge.
3547             */
3548            tcg_gen_cmp_vec(TCG_COND_GT, MO_64, t1,
3549                            tcg_constant_vec(type, MO_64, 0), v1);
3550            tcg_gen_shri_vec(MO_64, v0, v1, imm);
3551            tcg_gen_shli_vec(MO_64, t1, t1, 64 - imm);
3552            tcg_gen_or_vec(MO_64, v0, v0, t1);
3553        }
3554        tcg_temp_free_vec(t1);
3555        break;
3556
3557    default:
3558        g_assert_not_reached();
3559    }
3560}
3561
3562static void expand_vec_rotli(TCGType type, unsigned vece,
3563                             TCGv_vec v0, TCGv_vec v1, TCGArg imm)
3564{
3565    TCGv_vec t;
3566
3567    if (vece == MO_8) {
3568        expand_vec_shi(type, vece, INDEX_op_rotli_vec, v0, v1, imm);
3569        return;
3570    }
3571
3572    if (have_avx512vbmi2) {
3573        vec_gen_4(INDEX_op_x86_vpshldi_vec, type, vece,
3574                  tcgv_vec_arg(v0), tcgv_vec_arg(v1), tcgv_vec_arg(v1), imm);
3575        return;
3576    }
3577
3578    t = tcg_temp_new_vec(type);
3579    tcg_gen_shli_vec(vece, t, v1, imm);
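    /* Compose the rotate from complementary left and right shifts.  */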
3580    tcg_gen_shri_vec(vece, v0, v1, (8 << vece) - imm);
3581    tcg_gen_or_vec(vece, v0, v0, t);
3582    tcg_temp_free_vec(t);
3583}
3584
3585static void expand_vec_rotv(TCGType type, unsigned vece, TCGv_vec v0,
3586                            TCGv_vec v1, TCGv_vec sh, bool right)
3587{
3588    TCGv_vec t;
3589
3590    if (have_avx512vbmi2) {
3591        vec_gen_4(right ? INDEX_op_x86_vpshrdv_vec : INDEX_op_x86_vpshldv_vec,
3592                  type, vece, tcgv_vec_arg(v0), tcgv_vec_arg(v1),
3593                  tcgv_vec_arg(v1), tcgv_vec_arg(sh));
3594        return;
3595    }
3596
3597    t = tcg_temp_new_vec(type);
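    /* Complement the shift count: t = (element width in bits) - sh.  */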
3598    tcg_gen_dupi_vec(vece, t, 8 << vece);
3599    tcg_gen_sub_vec(vece, t, t, sh);
3600    if (right) {
3601        tcg_gen_shlv_vec(vece, t, v1, t);
3602        tcg_gen_shrv_vec(vece, v0, v1, sh);
3603    } else {
3604        tcg_gen_shrv_vec(vece, t, v1, t);
3605        tcg_gen_shlv_vec(vece, v0, v1, sh);
3606    }
3607    tcg_gen_or_vec(vece, v0, v0, t);
3608    tcg_temp_free_vec(t);
3609}
3610
3611static void expand_vec_rotls(TCGType type, unsigned vece,
3612                             TCGv_vec v0, TCGv_vec v1, TCGv_i32 lsh)
3613{
3614    TCGv_vec t = tcg_temp_new_vec(type);
3615
3616    tcg_debug_assert(vece != MO_8);
3617
3618    if (vece >= MO_32 ? have_avx512vl : have_avx512vbmi2) {
3619        tcg_gen_dup_i32_vec(vece, t, lsh);
3620        if (vece >= MO_32) {
3621            tcg_gen_rotlv_vec(vece, v0, v1, t);
3622        } else {
3623            expand_vec_rotv(type, vece, v0, v1, t, false);
3624        }
3625    } else {
3626        TCGv_i32 rsh = tcg_temp_new_i32();
3627
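        /*
         * The right-shift count is the modular complement of the left:
         * rsh = -lsh & (width - 1), so shls/shrs OR into a rotate for
         * any lsh in range.
         */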
3628        tcg_gen_neg_i32(rsh, lsh);
3629        tcg_gen_andi_i32(rsh, rsh, (8 << vece) - 1);
3630        tcg_gen_shls_vec(vece, t, v1, lsh);
3631        tcg_gen_shrs_vec(vece, v0, v1, rsh);
3632        tcg_gen_or_vec(vece, v0, v0, t);
3633
3634        tcg_temp_free_i32(rsh);
3635    }
3636
3637    tcg_temp_free_vec(t);
3638}
3639
3640static void expand_vec_mul(TCGType type, unsigned vece,
3641                           TCGv_vec v0, TCGv_vec v1, TCGv_vec v2)
3642{
3643    TCGv_vec t1, t2, t3, t4, zero;
3644
3645    tcg_debug_assert(vece == MO_8);
3646
3647    /*
3648     * Unpack v1 bytes to words, 0 | x.
3649     * Unpack v2 bytes to words, y | 0.
3650     * This leaves the 8-bit result, x * y, with 8 bits of right padding.
3651     * Shift logical right by 8 bits to clear the high 8 bits before
3652     * using an unsigned saturated pack.
3653     *
3654     * The difference between the V64, V128 and V256 cases is merely how
3655     * we distribute the expansion between temporaries.
3656     */
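    /*
     * For example, with x = 0x40 and y = 0x05 a lane pair holds 0x0040
     * and 0x0500; the 16-bit product truncates 0x14000 to 0x4000, and
     * the shift right by 8 leaves 0x40 == (0x40 * 0x05) & 0xff.
     */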
3657    switch (type) {
3658    case TCG_TYPE_V64:
3659        t1 = tcg_temp_new_vec(TCG_TYPE_V128);
3660        t2 = tcg_temp_new_vec(TCG_TYPE_V128);
3661        zero = tcg_constant_vec(TCG_TYPE_V128, MO_8, 0);
3662        vec_gen_3(INDEX_op_x86_punpckl_vec, TCG_TYPE_V128, MO_8,
3663                  tcgv_vec_arg(t1), tcgv_vec_arg(v1), tcgv_vec_arg(zero));
3664        vec_gen_3(INDEX_op_x86_punpckl_vec, TCG_TYPE_V128, MO_8,
3665                  tcgv_vec_arg(t2), tcgv_vec_arg(zero), tcgv_vec_arg(v2));
3666        tcg_gen_mul_vec(MO_16, t1, t1, t2);
3667        tcg_gen_shri_vec(MO_16, t1, t1, 8);
3668        vec_gen_3(INDEX_op_x86_packus_vec, TCG_TYPE_V128, MO_8,
3669                  tcgv_vec_arg(v0), tcgv_vec_arg(t1), tcgv_vec_arg(t1));
3670        tcg_temp_free_vec(t1);
3671        tcg_temp_free_vec(t2);
3672        break;
3673
3674    case TCG_TYPE_V128:
3675    case TCG_TYPE_V256:
3676        t1 = tcg_temp_new_vec(type);
3677        t2 = tcg_temp_new_vec(type);
3678        t3 = tcg_temp_new_vec(type);
3679        t4 = tcg_temp_new_vec(type);
3680        zero = tcg_constant_vec(type, MO_8, 0);
3681        vec_gen_3(INDEX_op_x86_punpckl_vec, type, MO_8,
3682                  tcgv_vec_arg(t1), tcgv_vec_arg(v1), tcgv_vec_arg(zero));
3683        vec_gen_3(INDEX_op_x86_punpckl_vec, type, MO_8,
3684                  tcgv_vec_arg(t2), tcgv_vec_arg(zero), tcgv_vec_arg(v2));
3685        vec_gen_3(INDEX_op_x86_punpckh_vec, type, MO_8,
3686                  tcgv_vec_arg(t3), tcgv_vec_arg(v1), tcgv_vec_arg(zero));
3687        vec_gen_3(INDEX_op_x86_punpckh_vec, type, MO_8,
3688                  tcgv_vec_arg(t4), tcgv_vec_arg(zero), tcgv_vec_arg(v2));
3689        tcg_gen_mul_vec(MO_16, t1, t1, t2);
3690        tcg_gen_mul_vec(MO_16, t3, t3, t4);
3691        tcg_gen_shri_vec(MO_16, t1, t1, 8);
3692        tcg_gen_shri_vec(MO_16, t3, t3, 8);
3693        vec_gen_3(INDEX_op_x86_packus_vec, type, MO_8,
3694                  tcgv_vec_arg(v0), tcgv_vec_arg(t1), tcgv_vec_arg(t3));
3695        tcg_temp_free_vec(t1);
3696        tcg_temp_free_vec(t2);
3697        tcg_temp_free_vec(t3);
3698        tcg_temp_free_vec(t4);
3699        break;
3700
3701    default:
3702        g_assert_not_reached();
3703    }
3704}
3705
3706static bool expand_vec_cmp_noinv(TCGType type, unsigned vece, TCGv_vec v0,
3707                                 TCGv_vec v1, TCGv_vec v2, TCGCond cond)
3708{
3709    enum {
3710        NEED_INV  = 1,
3711        NEED_SWAP = 2,
3712        NEED_BIAS = 4,
3713        NEED_UMIN = 8,
3714        NEED_UMAX = 16,
3715    };
3716    TCGv_vec t1, t2, t3;
3717    uint8_t fixup;
3718
3719    switch (cond) {
3720    case TCG_COND_EQ:
3721    case TCG_COND_GT:
3722        fixup = 0;
3723        break;
3724    case TCG_COND_NE:
3725    case TCG_COND_LE:
3726        fixup = NEED_INV;
3727        break;
3728    case TCG_COND_LT:
3729        fixup = NEED_SWAP;
3730        break;
3731    case TCG_COND_GE:
3732        fixup = NEED_SWAP | NEED_INV;
3733        break;
3734    case TCG_COND_LEU:
3735        if (tcg_can_emit_vec_op(INDEX_op_umin_vec, type, vece)) {
3736            fixup = NEED_UMIN;
3737        } else {
3738            fixup = NEED_BIAS | NEED_INV;
3739        }
3740        break;
3741    case TCG_COND_GTU:
3742        if (tcg_can_emit_vec_op(INDEX_op_umin_vec, type, vece)) {
3743            fixup = NEED_UMIN | NEED_INV;
3744        } else {
3745            fixup = NEED_BIAS;
3746        }
3747        break;
3748    case TCG_COND_GEU:
3749        if (tcg_can_emit_vec_op(INDEX_op_umax_vec, type, vece)) {
3750            fixup = NEED_UMAX;
3751        } else {
3752            fixup = NEED_BIAS | NEED_SWAP | NEED_INV;
3753        }
3754        break;
3755    case TCG_COND_LTU:
3756        if (tcg_can_emit_vec_op(INDEX_op_umax_vec, type, vece)) {
3757            fixup = NEED_UMAX | NEED_INV;
3758        } else {
3759            fixup = NEED_BIAS | NEED_SWAP;
3760        }
3761        break;
3762    default:
3763        g_assert_not_reached();
3764    }
3765
3766    if (fixup & NEED_INV) {
3767        cond = tcg_invert_cond(cond);
3768    }
3769    if (fixup & NEED_SWAP) {
3770        t1 = v1, v1 = v2, v2 = t1;
3771        cond = tcg_swap_cond(cond);
3772    }
3773
3774    t1 = t2 = NULL;
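    /*
     * Two standard tricks reduce the remaining unsigned comparisons to
     * the available EQ/GT: x <= y iff umin(x, y) == x (likewise GEU via
     * umax), and subtracting the per-element sign bit maps unsigned
     * order onto signed order (e.g. for MO_8, 0x00..0xff becomes
     * -128..127 with the relative order preserved).
     */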
3775    if (fixup & (NEED_UMIN | NEED_UMAX)) {
3776        t1 = tcg_temp_new_vec(type);
3777        if (fixup & NEED_UMIN) {
3778            tcg_gen_umin_vec(vece, t1, v1, v2);
3779        } else {
3780            tcg_gen_umax_vec(vece, t1, v1, v2);
3781        }
3782        v2 = t1;
3783        cond = TCG_COND_EQ;
3784    } else if (fixup & NEED_BIAS) {
3785        t1 = tcg_temp_new_vec(type);
3786        t2 = tcg_temp_new_vec(type);
3787        t3 = tcg_constant_vec(type, vece, 1ull << ((8 << vece) - 1));
3788        tcg_gen_sub_vec(vece, t1, v1, t3);
3789        tcg_gen_sub_vec(vece, t2, v2, t3);
3790        v1 = t1;
3791        v2 = t2;
3792        cond = tcg_signed_cond(cond);
3793    }
3794
3795    tcg_debug_assert(cond == TCG_COND_EQ || cond == TCG_COND_GT);
3796    /* Expand directly; do not recurse.  */
3797    vec_gen_4(INDEX_op_cmp_vec, type, vece,
3798              tcgv_vec_arg(v0), tcgv_vec_arg(v1), tcgv_vec_arg(v2), cond);
3799
3800    if (t1) {
3801        tcg_temp_free_vec(t1);
3802        if (t2) {
3803            tcg_temp_free_vec(t2);
3804        }
3805    }
3806    return fixup & NEED_INV;
3807}
3808
3809static void expand_vec_cmp(TCGType type, unsigned vece, TCGv_vec v0,
3810                           TCGv_vec v1, TCGv_vec v2, TCGCond cond)
3811{
3812    if (expand_vec_cmp_noinv(type, vece, v0, v1, v2, cond)) {
3813        tcg_gen_not_vec(vece, v0, v0);
3814    }
3815}
3816
3817static void expand_vec_cmpsel(TCGType type, unsigned vece, TCGv_vec v0,
3818                              TCGv_vec c1, TCGv_vec c2,
3819                              TCGv_vec v3, TCGv_vec v4, TCGCond cond)
3820{
3821    TCGv_vec t = tcg_temp_new_vec(type);
3822
3823    if (expand_vec_cmp_noinv(type, vece, t, c1, c2, cond)) {
3824        /* Invert the sense of the compare by swapping arguments.  */
3825        TCGv_vec x;
3826        x = v3, v3 = v4, v4 = x;
3827    }
3828    vec_gen_4(INDEX_op_x86_vpblendvb_vec, type, vece,
3829              tcgv_vec_arg(v0), tcgv_vec_arg(v4),
3830              tcgv_vec_arg(v3), tcgv_vec_arg(t));
3831    tcg_temp_free_vec(t);
3832}
3833
3834void tcg_expand_vec_op(TCGOpcode opc, TCGType type, unsigned vece,
3835                       TCGArg a0, ...)
3836{
3837    va_list va;
3838    TCGArg a2;
3839    TCGv_vec v0, v1, v2, v3, v4;
3840
3841    va_start(va, a0);
3842    v0 = temp_tcgv_vec(arg_temp(a0));
3843    v1 = temp_tcgv_vec(arg_temp(va_arg(va, TCGArg)));
3844    a2 = va_arg(va, TCGArg);
3845
3846    switch (opc) {
3847    case INDEX_op_shli_vec:
3848    case INDEX_op_shri_vec:
3849        expand_vec_shi(type, vece, opc, v0, v1, a2);
3850        break;
3851
3852    case INDEX_op_sari_vec:
3853        expand_vec_sari(type, vece, v0, v1, a2);
3854        break;
3855
3856    case INDEX_op_rotli_vec:
3857        expand_vec_rotli(type, vece, v0, v1, a2);
3858        break;
3859
3860    case INDEX_op_rotls_vec:
3861        expand_vec_rotls(type, vece, v0, v1, temp_tcgv_i32(arg_temp(a2)));
3862        break;
3863
3864    case INDEX_op_rotlv_vec:
3865        v2 = temp_tcgv_vec(arg_temp(a2));
3866        expand_vec_rotv(type, vece, v0, v1, v2, false);
3867        break;
3868    case INDEX_op_rotrv_vec:
3869        v2 = temp_tcgv_vec(arg_temp(a2));
3870        expand_vec_rotv(type, vece, v0, v1, v2, true);
3871        break;
3872
3873    case INDEX_op_mul_vec:
3874        v2 = temp_tcgv_vec(arg_temp(a2));
3875        expand_vec_mul(type, vece, v0, v1, v2);
3876        break;
3877
3878    case INDEX_op_cmp_vec:
3879        v2 = temp_tcgv_vec(arg_temp(a2));
3880        expand_vec_cmp(type, vece, v0, v1, v2, va_arg(va, TCGArg));
3881        break;
3882
3883    case INDEX_op_cmpsel_vec:
3884        v2 = temp_tcgv_vec(arg_temp(a2));
3885        v3 = temp_tcgv_vec(arg_temp(va_arg(va, TCGArg)));
3886        v4 = temp_tcgv_vec(arg_temp(va_arg(va, TCGArg)));
3887        expand_vec_cmpsel(type, vece, v0, v1, v2, v3, v4, va_arg(va, TCGArg));
3888        break;
3889
3890    default:
3891        break;
3892    }
3893
3894    va_end(va);
3895}
3896
3897static const int tcg_target_callee_save_regs[] = {
3898#if TCG_TARGET_REG_BITS == 64
3899    TCG_REG_RBP,
3900    TCG_REG_RBX,
3901#if defined(_WIN64)
3902    TCG_REG_RDI,
3903    TCG_REG_RSI,
3904#endif
3905    TCG_REG_R12,
3906    TCG_REG_R13,
3907    TCG_REG_R14, /* Currently used for the global env. */
3908    TCG_REG_R15,
3909#else
3910    TCG_REG_EBP, /* Currently used for the global env. */
3911    TCG_REG_EBX,
3912    TCG_REG_ESI,
3913    TCG_REG_EDI,
3914#endif
3915};
3916
3917/* Compute frame size via macros, to share between tcg_target_qemu_prologue
3918   and tcg_register_jit.  */
3919
3920#define PUSH_SIZE \
3921    ((1 + ARRAY_SIZE(tcg_target_callee_save_regs)) \
3922     * (TCG_TARGET_REG_BITS / 8))
3923
3924#define FRAME_SIZE \
3925    ((PUSH_SIZE \
3926      + TCG_STATIC_CALL_ARGS_SIZE \
3927      + CPU_TEMP_BUF_NLONGS * sizeof(long) \
3928      + TCG_TARGET_STACK_ALIGN - 1) \
3929     & ~(TCG_TARGET_STACK_ALIGN - 1))
3930
3931/* Generate global QEMU prologue and epilogue code */
3932static void tcg_target_qemu_prologue(TCGContext *s)
3933{
3934    int i, stack_addend;
3935
3936    /* TB prologue */
3937
3938    /* Reserve some stack space, also for TCG temps.  */
3939    stack_addend = FRAME_SIZE - PUSH_SIZE;
3940    tcg_set_frame(s, TCG_REG_CALL_STACK, TCG_STATIC_CALL_ARGS_SIZE,
3941                  CPU_TEMP_BUF_NLONGS * sizeof(long));
3942
3943    /* Save all callee saved registers.  */
3944    for (i = 0; i < ARRAY_SIZE(tcg_target_callee_save_regs); i++) {
3945        tcg_out_push(s, tcg_target_callee_save_regs[i]);
3946    }
3947
3948#if TCG_TARGET_REG_BITS == 32
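    /*
     * With the 32-bit stack-based calling convention, the two arguments
     * (env, tb) sit just above the return address: env at (N+1)*4 and
     * tb at (N+2)*4 after the N register pushes; the tb slot is read
     * after ESP has additionally been lowered by stack_addend.
     */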
3949    tcg_out_ld(s, TCG_TYPE_PTR, TCG_AREG0, TCG_REG_ESP,
3950               (ARRAY_SIZE(tcg_target_callee_save_regs) + 1) * 4);
3951    tcg_out_addi(s, TCG_REG_ESP, -stack_addend);
3952    /* jmp *tb.  */
3953    tcg_out_modrm_offset(s, OPC_GRP5, EXT5_JMPN_Ev, TCG_REG_ESP,
3954                         (ARRAY_SIZE(tcg_target_callee_save_regs) + 2) * 4
3955                         + stack_addend);
3956#else
3957# if !defined(CONFIG_SOFTMMU)
3958    if (guest_base) {
3959        int seg = setup_guest_base_seg();
3960        if (seg != 0) {
3961            x86_guest_base.seg = seg;
3962        } else if (guest_base == (int32_t)guest_base) {
3963            x86_guest_base.ofs = guest_base;
3964        } else {
3965            /* Choose R12 because, as a base, it requires a SIB byte. */
3966            x86_guest_base.index = TCG_REG_R12;
3967            tcg_out_movi(s, TCG_TYPE_PTR, x86_guest_base.index, guest_base);
3968            tcg_regset_set_reg(s->reserved_regs, x86_guest_base.index);
3969        }
3970    }
3971# endif
3972    tcg_out_mov(s, TCG_TYPE_PTR, TCG_AREG0, tcg_target_call_iarg_regs[0]);
3973    tcg_out_addi(s, TCG_REG_ESP, -stack_addend);
3974    /* jmp *tb.  */
3975    tcg_out_modrm(s, OPC_GRP5, EXT5_JMPN_Ev, tcg_target_call_iarg_regs[1]);
3976#endif
3977
3978    /*
3979     * Return path for goto_ptr. Set return value to 0, a-la exit_tb,
3980     * and fall through to the rest of the epilogue.
3981     */
3982    tcg_code_gen_epilogue = tcg_splitwx_to_rx(s->code_ptr);
3983    tcg_out_movi(s, TCG_TYPE_REG, TCG_REG_EAX, 0);
3984
3985    /* TB epilogue */
3986    tb_ret_addr = tcg_splitwx_to_rx(s->code_ptr);
3987
3988    tcg_out_addi(s, TCG_REG_CALL_STACK, stack_addend);
3989
3990    if (have_avx2) {
3991        tcg_out_vex_opc(s, OPC_VZEROUPPER, 0, 0, 0, 0);
3992    }
3993    for (i = ARRAY_SIZE(tcg_target_callee_save_regs) - 1; i >= 0; i--) {
3994        tcg_out_pop(s, tcg_target_callee_save_regs[i]);
3995    }
3996    tcg_out_opc(s, OPC_RET, 0, 0, 0);
3997}
3998
3999static void tcg_out_nop_fill(tcg_insn_unit *p, int count)
4000{
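    /* 0x90 is the one-byte x86 NOP.  */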
4001    memset(p, 0x90, count);
4002}
4003
4004static void tcg_target_init(TCGContext *s)
4005{
4006#ifdef CONFIG_CPUID_H
4007    unsigned a, b, c, d, b7 = 0, c7 = 0;
4008    unsigned max = __get_cpuid_max(0, 0);
4009
4010    if (max >= 7) {
4011        /* BMI1 is available on AMD Piledriver and Intel Haswell CPUs.  */
4012        __cpuid_count(7, 0, a, b7, c7, d);
4013        have_bmi1 = (b7 & bit_BMI) != 0;
4014        have_bmi2 = (b7 & bit_BMI2) != 0;
4015    }
4016
4017    if (max >= 1) {
4018        __cpuid(1, a, b, c, d);
4019#ifndef have_cmov
4020        /* For 32-bit, 99% certainty that we're running on hardware that
4021           supports cmov, but we still need to check.  In case cmov is not
4022           available, we'll use a small forward branch.  */
4023        have_cmov = (d & bit_CMOV) != 0;
4024#endif
4025
4026        /* MOVBE is only available on Intel Atom and Haswell CPUs, so we
4027           need to probe for it.  */
4028        have_movbe = (c & bit_MOVBE) != 0;
4029        have_popcnt = (c & bit_POPCNT) != 0;
4030
4031        /* There are a number of things we must check before we can be
4032           sure of not hitting invalid opcode.  */
4033        if (c & bit_OSXSAVE) {
4034            unsigned bv = xgetbv_low(0);
4035
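            /* XCR0 bits 1 and 2: SSE and AVX state enabled by the OS.  */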
4036            if ((bv & 6) == 6) {
4037                have_avx1 = (c & bit_AVX) != 0;
4038                have_avx2 = (b7 & bit_AVX2) != 0;
4039
4040                /*
4041                 * There are interesting instructions in AVX512, so long
4042                 * as we have AVX512VL, which indicates support for EVEX
4043                 * on sizes smaller than 512 bits.  We are required to
4044                 * check that OPMASK and all extended ZMM state are enabled
4045                 * even if we're not using them -- the insns will fault.
4046                 */
4047                if ((bv & 0xe0) == 0xe0
4048                    && (b7 & bit_AVX512F)
4049                    && (b7 & bit_AVX512VL)) {
4050                    have_avx512vl = true;
4051                    have_avx512bw = (b7 & bit_AVX512BW) != 0;
4052                    have_avx512dq = (b7 & bit_AVX512DQ) != 0;
4053                    have_avx512vbmi2 = (c7 & bit_AVX512VBMI2) != 0;
4054                }
4055            }
4056        }
4057    }
4058
4059    max = __get_cpuid_max(0x80000000, 0);
4060    if (max >= 0x80000001) {
4061        __cpuid(0x80000001, a, b, c, d);
4062        /* LZCNT was introduced with AMD Barcelona and Intel Haswell CPUs.  */
4063        have_lzcnt = (c & bit_LZCNT) != 0;
4064    }
4065#endif /* CONFIG_CPUID_H */
4066
4067    tcg_target_available_regs[TCG_TYPE_I32] = ALL_GENERAL_REGS;
4068    if (TCG_TARGET_REG_BITS == 64) {
4069        tcg_target_available_regs[TCG_TYPE_I64] = ALL_GENERAL_REGS;
4070    }
4071    if (have_avx1) {
4072        tcg_target_available_regs[TCG_TYPE_V64] = ALL_VECTOR_REGS;
4073        tcg_target_available_regs[TCG_TYPE_V128] = ALL_VECTOR_REGS;
4074    }
4075    if (have_avx2) {
4076        tcg_target_available_regs[TCG_TYPE_V256] = ALL_VECTOR_REGS;
4077    }
4078
4079    tcg_target_call_clobber_regs = ALL_VECTOR_REGS;
4080    tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_EAX);
4081    tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_EDX);
4082    tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_ECX);
4083    if (TCG_TARGET_REG_BITS == 64) {
4084#if !defined(_WIN64)
4085        tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_RDI);
4086        tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_RSI);
4087#endif
4088        tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_R8);
4089        tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_R9);
4090        tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_R10);
4091        tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_R11);
4092    }
4093
4094    s->reserved_regs = 0;
4095    tcg_regset_set_reg(s->reserved_regs, TCG_REG_CALL_STACK);
4096#ifdef _WIN64
4097    /* These are call saved, and we don't save them, so don't use them. */
4098    tcg_regset_set_reg(s->reserved_regs, TCG_REG_XMM6);
4099    tcg_regset_set_reg(s->reserved_regs, TCG_REG_XMM7);
4100    tcg_regset_set_reg(s->reserved_regs, TCG_REG_XMM8);
4101    tcg_regset_set_reg(s->reserved_regs, TCG_REG_XMM9);
4102    tcg_regset_set_reg(s->reserved_regs, TCG_REG_XMM10);
4103    tcg_regset_set_reg(s->reserved_regs, TCG_REG_XMM11);
4104    tcg_regset_set_reg(s->reserved_regs, TCG_REG_XMM12);
4105    tcg_regset_set_reg(s->reserved_regs, TCG_REG_XMM13);
4106    tcg_regset_set_reg(s->reserved_regs, TCG_REG_XMM14);
4107    tcg_regset_set_reg(s->reserved_regs, TCG_REG_XMM15);
4108#endif
4109}
4110
4111typedef struct {
4112    DebugFrameHeader h;
4113    uint8_t fde_def_cfa[4];
4114    uint8_t fde_reg_ofs[14];
4115} DebugFrame;
4116
4117/* We're expecting a 2 byte uleb128 encoded value.  */
4118QEMU_BUILD_BUG_ON(FRAME_SIZE >= (1 << 14));
4119
4120#if !defined(__ELF__)
4121    /* Host machine without ELF. */
4122#elif TCG_TARGET_REG_BITS == 64
4123#define ELF_HOST_MACHINE EM_X86_64
4124static const DebugFrame debug_frame = {
4125    .h.cie.len = sizeof(DebugFrameCIE)-4, /* length after .len member */
4126    .h.cie.id = -1,
4127    .h.cie.version = 1,
4128    .h.cie.code_align = 1,
4129    .h.cie.data_align = 0x78,             /* sleb128 -8 */
4130    .h.cie.return_column = 16,
4131
4132    /* Total FDE size does not include the "len" member.  */
4133    .h.fde.len = sizeof(DebugFrame) - offsetof(DebugFrame, h.fde.cie_offset),
4134
4135    .fde_def_cfa = {
4136        12, 7,                          /* DW_CFA_def_cfa %rsp, ... */
4137        (FRAME_SIZE & 0x7f) | 0x80,     /* ... uleb128 FRAME_SIZE */
4138        (FRAME_SIZE >> 7)
4139    },
4140    .fde_reg_ofs = {
4141        0x90, 1,                        /* DW_CFA_offset, %rip, -8 */
4142        /* The following ordering must match tcg_target_callee_save_regs.  */
4143        0x86, 2,                        /* DW_CFA_offset, %rbp, -16 */
4144        0x83, 3,                        /* DW_CFA_offset, %rbx, -24 */
4145        0x8c, 4,                        /* DW_CFA_offset, %r12, -32 */
4146        0x8d, 5,                        /* DW_CFA_offset, %r13, -40 */
4147        0x8e, 6,                        /* DW_CFA_offset, %r14, -48 */
4148        0x8f, 7,                        /* DW_CFA_offset, %r15, -56 */
4149    }
4150};
4151#else
4152#define ELF_HOST_MACHINE EM_386
4153static const DebugFrame debug_frame = {
4154    .h.cie.len = sizeof(DebugFrameCIE)-4, /* length after .len member */
4155    .h.cie.id = -1,
4156    .h.cie.version = 1,
4157    .h.cie.code_align = 1,
4158    .h.cie.data_align = 0x7c,             /* sleb128 -4 */
4159    .h.cie.return_column = 8,
4160
4161    /* Total FDE size does not include the "len" member.  */
4162    .h.fde.len = sizeof(DebugFrame) - offsetof(DebugFrame, h.fde.cie_offset),
4163
4164    .fde_def_cfa = {
4165        12, 4,                          /* DW_CFA_def_cfa %esp, ... */
4166        (FRAME_SIZE & 0x7f) | 0x80,     /* ... uleb128 FRAME_SIZE */
4167        (FRAME_SIZE >> 7)
4168    },
4169    .fde_reg_ofs = {
4170        0x88, 1,                        /* DW_CFA_offset, %eip, -4 */
4171        /* The following ordering must match tcg_target_callee_save_regs.  */
4172        0x85, 2,                        /* DW_CFA_offset, %ebp, -8 */
4173        0x83, 3,                        /* DW_CFA_offset, %ebx, -12 */
4174        0x86, 4,                        /* DW_CFA_offset, %esi, -16 */
4175        0x87, 5,                        /* DW_CFA_offset, %edi, -20 */
4176    }
4177};
4178#endif
4179
4180#if defined(ELF_HOST_MACHINE)
4181void tcg_register_jit(const void *buf, size_t buf_size)
4182{
4183    tcg_register_jit_int(buf, buf_size, &debug_frame, sizeof(debug_frame));
4184}
4185#endif
4186