xref: /qemu/tcg/i386/tcg-target.c.inc (revision 370ed600)
1/*
2 * Tiny Code Generator for QEMU
3 *
4 * Copyright (c) 2008 Fabrice Bellard
5 *
6 * Permission is hereby granted, free of charge, to any person obtaining a copy
7 * of this software and associated documentation files (the "Software"), to deal
8 * in the Software without restriction, including without limitation the rights
9 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
10 * copies of the Software, and to permit persons to whom the Software is
11 * furnished to do so, subject to the following conditions:
12 *
13 * The above copyright notice and this permission notice shall be included in
14 * all copies or substantial portions of the Software.
15 *
16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
19 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
22 * THE SOFTWARE.
23 */
24
25#include "../tcg-ldst.c.inc"
26#include "../tcg-pool.c.inc"
27
#ifdef CONFIG_DEBUG_TCG
/*
 * Printable names of the host registers; built only when CONFIG_DEBUG_TCG
 * is defined.  The table holds TCG_TARGET_NB_REGS entries and is presumably
 * indexed by TCGReg number (the consumer is outside this chunk).
 *
 * The first eight entries use the 64-bit or the 32-bit spelling of the
 * legacy general registers depending on TCG_TARGET_REG_BITS.  The %r8-%r15
 * names are listed for both widths, whereas %xmm8-%xmm15 are only listed
 * for 64-bit hosts.
 */
static const char * const tcg_target_reg_names[TCG_TARGET_NB_REGS] = {
#if TCG_TARGET_REG_BITS == 64
    "%rax", "%rcx", "%rdx", "%rbx", "%rsp", "%rbp", "%rsi", "%rdi",
#else
    "%eax", "%ecx", "%edx", "%ebx", "%esp", "%ebp", "%esi", "%edi",
#endif
    "%r8",  "%r9",  "%r10", "%r11", "%r12", "%r13", "%r14", "%r15",
    "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7",
#if TCG_TARGET_REG_BITS == 64
    "%xmm8", "%xmm9", "%xmm10", "%xmm11",
    "%xmm12", "%xmm13", "%xmm14", "%xmm15",
#endif
};
#endif
43
/*
 * Register allocation preference list (NOTE(review): the allocator that
 * walks this table lives outside this chunk; presumably earlier entries are
 * preferred).  The stack pointer is never listed.  On 64-bit hosts the
 * general registers start with RBP, RBX and R12-R15 and end with the
 * registers that also appear in the call argument list, followed by RAX.
 * The vector registers follow the general registers.
 */
static const int tcg_target_reg_alloc_order[] = {
#if TCG_TARGET_REG_BITS == 64
    TCG_REG_RBP,
    TCG_REG_RBX,
    TCG_REG_R12,
    TCG_REG_R13,
    TCG_REG_R14,
    TCG_REG_R15,
    TCG_REG_R10,
    TCG_REG_R11,
    TCG_REG_R9,
    TCG_REG_R8,
    TCG_REG_RCX,
    TCG_REG_RDX,
    TCG_REG_RSI,
    TCG_REG_RDI,
    TCG_REG_RAX,
#else
    TCG_REG_EBX,
    TCG_REG_ESI,
    TCG_REG_EDI,
    TCG_REG_EBP,
    TCG_REG_ECX,
    TCG_REG_EDX,
    TCG_REG_EAX,
#endif
    TCG_REG_XMM0,
    TCG_REG_XMM1,
    TCG_REG_XMM2,
    TCG_REG_XMM3,
    TCG_REG_XMM4,
    TCG_REG_XMM5,
#ifndef _WIN64
    /* The Win64 ABI has xmm6-xmm15 as callee-saved (nonvolatile) registers,
       and we do not save any of them.  Therefore on Win64 only xmm0-xmm5
       are allowed to be allocated; the registers below are listed for all
       other hosts.  */
    TCG_REG_XMM6,
    TCG_REG_XMM7,
#if TCG_TARGET_REG_BITS == 64
    TCG_REG_XMM8,
    TCG_REG_XMM9,
    TCG_REG_XMM10,
    TCG_REG_XMM11,
    TCG_REG_XMM12,
    TCG_REG_XMM13,
    TCG_REG_XMM14,
    TCG_REG_XMM15,
#endif
#endif
};
93
/*
 * Registers that carry integer call arguments, in argument order.
 *   Win64:         RCX, RDX, R8, R9
 *   other 64-bit:  RDI, RSI, RDX, RCX, R8, R9
 *   32-bit:        none (the table is empty; arguments go on the stack)
 */
static const int tcg_target_call_iarg_regs[] = {
#if TCG_TARGET_REG_BITS == 64
#if defined(_WIN64)
    TCG_REG_RCX,
    TCG_REG_RDX,
#else
    TCG_REG_RDI,
    TCG_REG_RSI,
    TCG_REG_RDX,
    TCG_REG_RCX,
#endif
    /* R8 and R9 close the list for both 64-bit conventions.  */
    TCG_REG_R8,
    TCG_REG_R9,
#else
    /* 32 bit mode uses stack based calling convention (GCC default). */
#endif
};
111
112static TCGReg tcg_target_call_oarg_reg(TCGCallReturnKind kind, int slot)
113{
114    switch (kind) {
115    case TCG_CALL_RET_NORMAL:
116        tcg_debug_assert(slot >= 0 && slot <= 1);
117        return slot ? TCG_REG_EDX : TCG_REG_EAX;
118#ifdef _WIN64
119    case TCG_CALL_RET_BY_VEC:
120        tcg_debug_assert(slot == 0);
121        return TCG_REG_XMM0;
122#endif
123    default:
124        g_assert_not_reached();
125    }
126}
127
/* Constants we accept.  These are constraint bits tested by
   tcg_target_const_match(); for TCG_TYPE_I32 operands the first three
   accept any value.  */
#define TCG_CT_CONST_S32 0x100  /* value equals its sign-extended low 32 bits */
#define TCG_CT_CONST_U32 0x200  /* value equals its zero-extended low 32 bits */
#define TCG_CT_CONST_I32 0x400  /* ~value equals its sign-extended low 32 bits */
#define TCG_CT_CONST_WSZ 0x800  /* value equals the operation width, 32 or 64 */

/* Registers used with L constraint, which are the first argument
   registers on x86_64, and two random call clobbered registers on
   i386. */
#if TCG_TARGET_REG_BITS == 64
# define TCG_REG_L0 tcg_target_call_iarg_regs[0]
# define TCG_REG_L1 tcg_target_call_iarg_regs[1]
#else
# define TCG_REG_L0 TCG_REG_EAX
# define TCG_REG_L1 TCG_REG_EDX
#endif

/* Register-set bit masks; SOFTMMU_RESERVE_REGS below shows the convention
   that bit N stands for register number N.  On both widths the vector
   registers start at bit 16.  ALL_BYTEH_REGS covers registers 0-3;
   NOTE(review): presumably the ones with %ah-style high-byte forms.  */
#define ALL_BYTEH_REGS         0x0000000fu
#if TCG_TARGET_REG_BITS == 64
# define ALL_GENERAL_REGS      0x0000ffffu
# define ALL_VECTOR_REGS       0xffff0000u
# define ALL_BYTEL_REGS        ALL_GENERAL_REGS
#else
# define ALL_GENERAL_REGS      0x000000ffu
# define ALL_VECTOR_REGS       0x00ff0000u
# define ALL_BYTEL_REGS        ALL_BYTEH_REGS
#endif
/* With CONFIG_SOFTMMU the two L registers form a reserved set; otherwise
   the set is empty.  */
#ifdef CONFIG_SOFTMMU
# define SOFTMMU_RESERVE_REGS  ((1 << TCG_REG_L0) | (1 << TCG_REG_L1))
#else
# define SOFTMMU_RESERVE_REGS  0
#endif
160
161/* The host compiler should supply <cpuid.h> to enable runtime features
162   detection, as we're not going to go so far as our own inline assembly.
163   If not available, default values will be assumed.  */
164#if defined(CONFIG_CPUID_H)
165#include "qemu/cpuid.h"
166#endif
167
/* For 64-bit, we always know that CMOV is available.  */
#if TCG_TARGET_REG_BITS == 64
# define have_cmov 1
#elif defined(CONFIG_CPUID_H)
/* 32-bit host with <cpuid.h>: a variable, presumably set by runtime
   detection code outside this chunk.  */
static bool have_cmov;
#else
/* 32-bit host without <cpuid.h>: assume CMOV is not available.  */
# define have_cmov 0
#endif

/* We need these symbols in tcg-target.h, and we can't properly conditionalize
   it there.  Therefore we always define the variable.  Being file-scope
   objects without an initializer, they start out false.  */
bool have_bmi1;
bool have_popcnt;
bool have_avx1;
bool have_avx2;
bool have_avx512bw;
bool have_avx512dq;
bool have_avx512vbmi2;
bool have_avx512vl;
bool have_movbe;

/* Flags private to this file: variables when <cpuid.h> is available,
   otherwise compile-time zero.  */
#ifdef CONFIG_CPUID_H
static bool have_bmi2;
static bool have_lzcnt;
#else
# define have_bmi2 0
# define have_lzcnt 0
#endif
196
/* NOTE(review): presumably the address in generated code that translated
   blocks jump to in order to return to their caller; it is assigned and
   used outside this chunk -- confirm there.  */
static const tcg_insn_unit *tb_ret_addr;
198
199static bool patch_reloc(tcg_insn_unit *code_ptr, int type,
200                        intptr_t value, intptr_t addend)
201{
202    value += addend;
203    switch(type) {
204    case R_386_PC32:
205        value -= (uintptr_t)tcg_splitwx_to_rx(code_ptr);
206        if (value != (int32_t)value) {
207            return false;
208        }
209        /* FALLTHRU */
210    case R_386_32:
211        tcg_patch32(code_ptr, value);
212        break;
213    case R_386_PC8:
214        value -= (uintptr_t)tcg_splitwx_to_rx(code_ptr);
215        if (value != (int8_t)value) {
216            return false;
217        }
218        tcg_patch8(code_ptr, value);
219        break;
220    default:
221        g_assert_not_reached();
222    }
223    return true;
224}
225
226/* test if a constant matches the constraint */
227static bool tcg_target_const_match(int64_t val, TCGType type, int ct)
228{
229    if (ct & TCG_CT_CONST) {
230        return 1;
231    }
232    if (type == TCG_TYPE_I32) {
233        if (ct & (TCG_CT_CONST_S32 | TCG_CT_CONST_U32 | TCG_CT_CONST_I32)) {
234            return 1;
235        }
236    } else {
237        if ((ct & TCG_CT_CONST_S32) && val == (int32_t)val) {
238            return 1;
239        }
240        if ((ct & TCG_CT_CONST_U32) && val == (uint32_t)val) {
241            return 1;
242        }
243        if ((ct & TCG_CT_CONST_I32) && ~val == (int32_t)~val) {
244            return 1;
245        }
246    }
247    if ((ct & TCG_CT_CONST_WSZ) && val == (type == TCG_TYPE_I32 ? 32 : 64)) {
248        return 1;
249    }
250    return 0;
251}
252
/* Low three bits of a register number, i.e. the part that fits in a
   ModRM or SIB register field.  */
# define LOWREGMASK(x)	((x) & 7)

/* Flag bits OR'ed into an OPC_* value above its 8-bit opcode byte.  They
   tell the tcg_out_*opc() emitters which prefix and escape bytes (or
   VEX/EVEX fields) to generate.  On 32-bit hosts the REX and gs related
   flags are defined as 0, so tests against them are always false.  */
#define P_EXT		0x100		/* 0x0f opcode prefix */
#define P_EXT38         0x200           /* 0x0f 0x38 opcode prefix */
#define P_DATA16        0x400           /* 0x66 opcode prefix */
#define P_VEXW          0x1000          /* Set VEX.W = 1 */
#if TCG_TARGET_REG_BITS == 64
# define P_REXW         P_VEXW          /* Set REX.W = 1; match VEXW */
# define P_REXB_R       0x2000          /* REG field as byte register */
# define P_REXB_RM      0x4000          /* R/M field as byte register */
# define P_GS           0x8000          /* gs segment override */
#else
# define P_REXW		0
# define P_REXB_R	0
# define P_REXB_RM	0
# define P_GS           0
#endif
#define P_EXT3A         0x10000         /* 0x0f 0x3a opcode prefix */
#define P_SIMDF3        0x20000         /* 0xf3 opcode prefix */
#define P_SIMDF2        0x40000         /* 0xf2 opcode prefix */
#define P_VEXL          0x80000         /* Set VEX.L = 1 */
#define P_EVEX          0x100000        /* Requires EVEX encoding */
275
/* Opcode definitions.  Each OPC_* value holds the final opcode byte in its
   low 8 bits, optionally OR'ed with P_* flags selecting the prefix/escape
   bytes or VEX/EVEX fields emitted ahead of it.  Entries commented
   "... plus" are base values to which a sub-opcode or condition code is
   added by the user.  */
#define OPC_ARITH_EvIz	(0x81)
#define OPC_ARITH_EvIb	(0x83)
#define OPC_ARITH_GvEv	(0x03)		/* ... plus (ARITH_FOO << 3) */
#define OPC_ANDN        (0xf2 | P_EXT38)
#define OPC_ADD_GvEv	(OPC_ARITH_GvEv | (ARITH_ADD << 3))
#define OPC_AND_GvEv    (OPC_ARITH_GvEv | (ARITH_AND << 3))
#define OPC_BLENDPS     (0x0c | P_EXT3A | P_DATA16)
#define OPC_BSF         (0xbc | P_EXT)
#define OPC_BSR         (0xbd | P_EXT)
#define OPC_BSWAP	(0xc8 | P_EXT)
#define OPC_CALL_Jz	(0xe8)
#define OPC_CMOVCC      (0x40 | P_EXT)  /* ... plus condition code */
#define OPC_CMP_GvEv	(OPC_ARITH_GvEv | (ARITH_CMP << 3))
#define OPC_DEC_r32	(0x48)
#define OPC_IMUL_GvEv	(0xaf | P_EXT)
#define OPC_IMUL_GvEvIb	(0x6b)
#define OPC_IMUL_GvEvIz	(0x69)
#define OPC_INC_r32	(0x40)
#define OPC_JCC_long	(0x80 | P_EXT)	/* ... plus condition code */
#define OPC_JCC_short	(0x70)		/* ... plus condition code */
#define OPC_JMP_long	(0xe9)
#define OPC_JMP_short	(0xeb)
#define OPC_LEA         (0x8d)
#define OPC_LZCNT       (0xbd | P_EXT | P_SIMDF3)
#define OPC_MOVB_EvGv	(0x88)		/* stores, more or less */
#define OPC_MOVL_EvGv	(0x89)		/* stores, more or less */
#define OPC_MOVL_GvEv	(0x8b)		/* loads, more or less */
#define OPC_MOVB_EvIz   (0xc6)
#define OPC_MOVL_EvIz	(0xc7)
#define OPC_MOVL_Iv     (0xb8)
#define OPC_MOVBE_GyMy  (0xf0 | P_EXT38)
#define OPC_MOVBE_MyGy  (0xf1 | P_EXT38)
#define OPC_MOVD_VyEy   (0x6e | P_EXT | P_DATA16)
#define OPC_MOVD_EyVy   (0x7e | P_EXT | P_DATA16)
#define OPC_MOVDDUP     (0x12 | P_EXT | P_SIMDF2)
#define OPC_MOVDQA_VxWx (0x6f | P_EXT | P_DATA16)
#define OPC_MOVDQA_WxVx (0x7f | P_EXT | P_DATA16)
#define OPC_MOVDQU_VxWx (0x6f | P_EXT | P_SIMDF3)
#define OPC_MOVDQU_WxVx (0x7f | P_EXT | P_SIMDF3)
#define OPC_MOVQ_VqWq   (0x7e | P_EXT | P_SIMDF3)
#define OPC_MOVQ_WqVq   (0xd6 | P_EXT | P_DATA16)
#define OPC_MOVSBL	(0xbe | P_EXT)
#define OPC_MOVSWL	(0xbf | P_EXT)
#define OPC_MOVSLQ	(0x63 | P_REXW)
#define OPC_MOVZBL	(0xb6 | P_EXT)
#define OPC_MOVZWL	(0xb7 | P_EXT)
#define OPC_PABSB       (0x1c | P_EXT38 | P_DATA16)
#define OPC_PABSW       (0x1d | P_EXT38 | P_DATA16)
#define OPC_PABSD       (0x1e | P_EXT38 | P_DATA16)
#define OPC_VPABSQ      (0x1f | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX)
#define OPC_PACKSSDW    (0x6b | P_EXT | P_DATA16)
#define OPC_PACKSSWB    (0x63 | P_EXT | P_DATA16)
#define OPC_PACKUSDW    (0x2b | P_EXT38 | P_DATA16)
#define OPC_PACKUSWB    (0x67 | P_EXT | P_DATA16)
#define OPC_PADDB       (0xfc | P_EXT | P_DATA16)
#define OPC_PADDW       (0xfd | P_EXT | P_DATA16)
#define OPC_PADDD       (0xfe | P_EXT | P_DATA16)
#define OPC_PADDQ       (0xd4 | P_EXT | P_DATA16)
#define OPC_PADDSB      (0xec | P_EXT | P_DATA16)
#define OPC_PADDSW      (0xed | P_EXT | P_DATA16)
#define OPC_PADDUB      (0xdc | P_EXT | P_DATA16)
#define OPC_PADDUW      (0xdd | P_EXT | P_DATA16)
#define OPC_PAND        (0xdb | P_EXT | P_DATA16)
#define OPC_PANDN       (0xdf | P_EXT | P_DATA16)
#define OPC_PBLENDW     (0x0e | P_EXT3A | P_DATA16)
#define OPC_PCMPEQB     (0x74 | P_EXT | P_DATA16)
#define OPC_PCMPEQW     (0x75 | P_EXT | P_DATA16)
#define OPC_PCMPEQD     (0x76 | P_EXT | P_DATA16)
#define OPC_PCMPEQQ     (0x29 | P_EXT38 | P_DATA16)
#define OPC_PCMPGTB     (0x64 | P_EXT | P_DATA16)
#define OPC_PCMPGTW     (0x65 | P_EXT | P_DATA16)
#define OPC_PCMPGTD     (0x66 | P_EXT | P_DATA16)
#define OPC_PCMPGTQ     (0x37 | P_EXT38 | P_DATA16)
#define OPC_PMAXSB      (0x3c | P_EXT38 | P_DATA16)
#define OPC_PMAXSW      (0xee | P_EXT | P_DATA16)
#define OPC_PMAXSD      (0x3d | P_EXT38 | P_DATA16)
#define OPC_VPMAXSQ     (0x3d | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX)
#define OPC_PMAXUB      (0xde | P_EXT | P_DATA16)
#define OPC_PMAXUW      (0x3e | P_EXT38 | P_DATA16)
#define OPC_PMAXUD      (0x3f | P_EXT38 | P_DATA16)
#define OPC_VPMAXUQ     (0x3f | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX)
#define OPC_PMINSB      (0x38 | P_EXT38 | P_DATA16)
#define OPC_PMINSW      (0xea | P_EXT | P_DATA16)
#define OPC_PMINSD      (0x39 | P_EXT38 | P_DATA16)
#define OPC_VPMINSQ     (0x39 | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX)
#define OPC_PMINUB      (0xda | P_EXT | P_DATA16)
#define OPC_PMINUW      (0x3a | P_EXT38 | P_DATA16)
#define OPC_PMINUD      (0x3b | P_EXT38 | P_DATA16)
#define OPC_VPMINUQ     (0x3b | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX)
#define OPC_PMOVSXBW    (0x20 | P_EXT38 | P_DATA16)
#define OPC_PMOVSXWD    (0x23 | P_EXT38 | P_DATA16)
#define OPC_PMOVSXDQ    (0x25 | P_EXT38 | P_DATA16)
#define OPC_PMOVZXBW    (0x30 | P_EXT38 | P_DATA16)
#define OPC_PMOVZXWD    (0x33 | P_EXT38 | P_DATA16)
#define OPC_PMOVZXDQ    (0x35 | P_EXT38 | P_DATA16)
#define OPC_PMULLW      (0xd5 | P_EXT | P_DATA16)
#define OPC_PMULLD      (0x40 | P_EXT38 | P_DATA16)
#define OPC_VPMULLQ     (0x40 | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX)
#define OPC_POR         (0xeb | P_EXT | P_DATA16)
#define OPC_PSHUFB      (0x00 | P_EXT38 | P_DATA16)
#define OPC_PSHUFD      (0x70 | P_EXT | P_DATA16)
#define OPC_PSHUFLW     (0x70 | P_EXT | P_SIMDF2)
#define OPC_PSHUFHW     (0x70 | P_EXT | P_SIMDF3)
/* Immediate-count vector shifts; the trailing "/n" notes list the ModRM
   reg-field sub-opcodes each one is used with.  */
#define OPC_PSHIFTW_Ib  (0x71 | P_EXT | P_DATA16) /* /2 /6 /4 */
#define OPC_PSHIFTD_Ib  (0x72 | P_EXT | P_DATA16) /* /1 /2 /6 /4 */
#define OPC_PSHIFTQ_Ib  (0x73 | P_EXT | P_DATA16) /* /2 /6 /4 */
#define OPC_PSLLW       (0xf1 | P_EXT | P_DATA16)
#define OPC_PSLLD       (0xf2 | P_EXT | P_DATA16)
#define OPC_PSLLQ       (0xf3 | P_EXT | P_DATA16)
#define OPC_PSRAW       (0xe1 | P_EXT | P_DATA16)
#define OPC_PSRAD       (0xe2 | P_EXT | P_DATA16)
#define OPC_VPSRAQ      (0xe2 | P_EXT | P_DATA16 | P_VEXW | P_EVEX)
#define OPC_PSRLW       (0xd1 | P_EXT | P_DATA16)
#define OPC_PSRLD       (0xd2 | P_EXT | P_DATA16)
#define OPC_PSRLQ       (0xd3 | P_EXT | P_DATA16)
#define OPC_PSUBB       (0xf8 | P_EXT | P_DATA16)
#define OPC_PSUBW       (0xf9 | P_EXT | P_DATA16)
#define OPC_PSUBD       (0xfa | P_EXT | P_DATA16)
#define OPC_PSUBQ       (0xfb | P_EXT | P_DATA16)
#define OPC_PSUBSB      (0xe8 | P_EXT | P_DATA16)
#define OPC_PSUBSW      (0xe9 | P_EXT | P_DATA16)
#define OPC_PSUBUB      (0xd8 | P_EXT | P_DATA16)
#define OPC_PSUBUW      (0xd9 | P_EXT | P_DATA16)
#define OPC_PUNPCKLBW   (0x60 | P_EXT | P_DATA16)
#define OPC_PUNPCKLWD   (0x61 | P_EXT | P_DATA16)
#define OPC_PUNPCKLDQ   (0x62 | P_EXT | P_DATA16)
#define OPC_PUNPCKLQDQ  (0x6c | P_EXT | P_DATA16)
#define OPC_PUNPCKHBW   (0x68 | P_EXT | P_DATA16)
#define OPC_PUNPCKHWD   (0x69 | P_EXT | P_DATA16)
#define OPC_PUNPCKHDQ   (0x6a | P_EXT | P_DATA16)
#define OPC_PUNPCKHQDQ  (0x6d | P_EXT | P_DATA16)
#define OPC_PXOR        (0xef | P_EXT | P_DATA16)
#define OPC_POP_r32	(0x58)
#define OPC_POPCNT      (0xb8 | P_EXT | P_SIMDF3)
#define OPC_PUSH_r32	(0x50)
#define OPC_PUSH_Iv	(0x68)
#define OPC_PUSH_Ib	(0x6a)
#define OPC_RET		(0xc3)
#define OPC_SETCC	(0x90 | P_EXT | P_REXB_RM) /* ... plus cc */
#define OPC_SHIFT_1	(0xd1)
#define OPC_SHIFT_Ib	(0xc1)
#define OPC_SHIFT_cl	(0xd3)
#define OPC_SARX        (0xf7 | P_EXT38 | P_SIMDF3)
#define OPC_SHUFPS      (0xc6 | P_EXT)
#define OPC_SHLX        (0xf7 | P_EXT38 | P_DATA16)
#define OPC_SHRX        (0xf7 | P_EXT38 | P_SIMDF2)
#define OPC_SHRD_Ib     (0xac | P_EXT)
#define OPC_TESTL	(0x85)
#define OPC_TZCNT       (0xbc | P_EXT | P_SIMDF3)
#define OPC_UD2         (0x0b | P_EXT)
#define OPC_VPBLENDD    (0x02 | P_EXT3A | P_DATA16)
#define OPC_VPBLENDVB   (0x4c | P_EXT3A | P_DATA16)
#define OPC_VPINSRB     (0x20 | P_EXT3A | P_DATA16)
#define OPC_VPINSRW     (0xc4 | P_EXT | P_DATA16)
#define OPC_VBROADCASTSS (0x18 | P_EXT38 | P_DATA16)
#define OPC_VBROADCASTSD (0x19 | P_EXT38 | P_DATA16)
#define OPC_VPBROADCASTB (0x78 | P_EXT38 | P_DATA16)
#define OPC_VPBROADCASTW (0x79 | P_EXT38 | P_DATA16)
#define OPC_VPBROADCASTD (0x58 | P_EXT38 | P_DATA16)
#define OPC_VPBROADCASTQ (0x59 | P_EXT38 | P_DATA16)
#define OPC_VPERMQ      (0x00 | P_EXT3A | P_DATA16 | P_VEXW)
#define OPC_VPERM2I128  (0x46 | P_EXT3A | P_DATA16 | P_VEXL)
#define OPC_VPROLVD     (0x15 | P_EXT38 | P_DATA16 | P_EVEX)
#define OPC_VPROLVQ     (0x15 | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX)
#define OPC_VPRORVD     (0x14 | P_EXT38 | P_DATA16 | P_EVEX)
#define OPC_VPRORVQ     (0x14 | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX)
#define OPC_VPSHLDW     (0x70 | P_EXT3A | P_DATA16 | P_VEXW | P_EVEX)
#define OPC_VPSHLDD     (0x71 | P_EXT3A | P_DATA16 | P_EVEX)
#define OPC_VPSHLDQ     (0x71 | P_EXT3A | P_DATA16 | P_VEXW | P_EVEX)
#define OPC_VPSHLDVW    (0x70 | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX)
#define OPC_VPSHLDVD    (0x71 | P_EXT38 | P_DATA16 | P_EVEX)
#define OPC_VPSHLDVQ    (0x71 | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX)
#define OPC_VPSHRDVW    (0x72 | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX)
#define OPC_VPSHRDVD    (0x73 | P_EXT38 | P_DATA16 | P_EVEX)
#define OPC_VPSHRDVQ    (0x73 | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX)
#define OPC_VPSLLVW     (0x12 | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX)
#define OPC_VPSLLVD     (0x47 | P_EXT38 | P_DATA16)
#define OPC_VPSLLVQ     (0x47 | P_EXT38 | P_DATA16 | P_VEXW)
#define OPC_VPSRAVW     (0x11 | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX)
#define OPC_VPSRAVD     (0x46 | P_EXT38 | P_DATA16)
#define OPC_VPSRAVQ     (0x46 | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX)
#define OPC_VPSRLVW     (0x10 | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX)
#define OPC_VPSRLVD     (0x45 | P_EXT38 | P_DATA16)
#define OPC_VPSRLVQ     (0x45 | P_EXT38 | P_DATA16 | P_VEXW)
#define OPC_VPTERNLOGQ  (0x25 | P_EXT3A | P_DATA16 | P_VEXW | P_EVEX)
#define OPC_VZEROUPPER  (0x77 | P_EXT)
#define OPC_XCHG_ax_r32	(0x90)
#define OPC_XCHG_EvGv   (0x87)

/* Group opcodes: the operation is chosen by an extension code (the EXT3_*
   and EXT5_* values).  OPC_GRP14 has the same encoding as
   OPC_PSHIFTQ_Ib.  */
#define OPC_GRP3_Eb     (0xf6)
#define OPC_GRP3_Ev     (0xf7)
#define OPC_GRP5        (0xff)
#define OPC_GRP14       (0x73 | P_EXT | P_DATA16)
468#define OPC_GRP14       (0x73 | P_EXT | P_DATA16)
469
/* Group 1 opcode extensions for 0x80-0x83.
   These are also used as modifiers for OPC_ARITH_GvEv, shifted left by
   three bits (see OPC_ADD_GvEv, OPC_AND_GvEv and OPC_CMP_GvEv).  */
#define ARITH_ADD 0
#define ARITH_OR  1
#define ARITH_ADC 2
#define ARITH_SBB 3
#define ARITH_AND 4
#define ARITH_SUB 5
#define ARITH_XOR 6
#define ARITH_CMP 7

/* Group 2 opcode extensions for 0xc0, 0xc1, 0xd0-0xd3.  */
#define SHIFT_ROL 0
#define SHIFT_ROR 1
#define SHIFT_SHL 4
#define SHIFT_SHR 5
#define SHIFT_SAR 7

/* Group 3 opcode extensions for 0xf6, 0xf7.  To be used with OPC_GRP3_Eb
   and OPC_GRP3_Ev.  */
#define EXT3_TESTi 0
#define EXT3_NOT   2
#define EXT3_NEG   3
#define EXT3_MUL   4
#define EXT3_IMUL  5
#define EXT3_DIV   6
#define EXT3_IDIV  7

/* Group 5 opcode extensions for 0xff.  To be used with OPC_GRP5.  */
#define EXT5_INC_Ev	0
#define EXT5_DEC_Ev	1
#define EXT5_CALLN_Ev	2
#define EXT5_JMPN_Ev	4
502
/* Condition codes to be added to OPC_JCC_{long,short}; the OPC_CMOVCC and
   OPC_SETCC definitions are annotated as taking a condition code as well.
   JCC_JMP is not an encodable condition; NOTE(review): presumably a
   sentinel for "unconditional jump" interpreted by code outside this
   chunk.  */
#define JCC_JMP (-1)
#define JCC_JO  0x0
#define JCC_JNO 0x1
#define JCC_JB  0x2
#define JCC_JAE 0x3
#define JCC_JE  0x4
#define JCC_JNE 0x5
#define JCC_JBE 0x6
#define JCC_JA  0x7
#define JCC_JS  0x8
#define JCC_JNS 0x9
#define JCC_JP  0xa
#define JCC_JNP 0xb
#define JCC_JL  0xc
#define JCC_JGE 0xd
#define JCC_JLE 0xe
#define JCC_JG  0xf
521
/* Map a TCG comparison condition to the x86 condition code (JCC_*) that
   tests it.  Signed orderings map to JL/JGE/JLE/JG and unsigned orderings
   to JB/JAE/JBE/JA.  Conditions without an initializer read as 0, which is
   also the value of JCC_JO.  */
static const uint8_t tcg_cond_to_jcc[] = {
    [TCG_COND_EQ] = JCC_JE,
    [TCG_COND_NE] = JCC_JNE,
    [TCG_COND_LT] = JCC_JL,
    [TCG_COND_GE] = JCC_JGE,
    [TCG_COND_LE] = JCC_JLE,
    [TCG_COND_GT] = JCC_JG,
    [TCG_COND_LTU] = JCC_JB,
    [TCG_COND_GEU] = JCC_JAE,
    [TCG_COND_LEU] = JCC_JBE,
    [TCG_COND_GTU] = JCC_JA,
};
534
535#if TCG_TARGET_REG_BITS == 64
536static void tcg_out_opc(TCGContext *s, int opc, int r, int rm, int x)
537{
538    int rex;
539
540    if (opc & P_GS) {
541        tcg_out8(s, 0x65);
542    }
543    if (opc & P_DATA16) {
544        /* We should never be asking for both 16 and 64-bit operation.  */
545        tcg_debug_assert((opc & P_REXW) == 0);
546        tcg_out8(s, 0x66);
547    }
548    if (opc & P_SIMDF3) {
549        tcg_out8(s, 0xf3);
550    } else if (opc & P_SIMDF2) {
551        tcg_out8(s, 0xf2);
552    }
553
554    rex = 0;
555    rex |= (opc & P_REXW) ? 0x8 : 0x0;  /* REX.W */
556    rex |= (r & 8) >> 1;                /* REX.R */
557    rex |= (x & 8) >> 2;                /* REX.X */
558    rex |= (rm & 8) >> 3;               /* REX.B */
559
560    /* P_REXB_{R,RM} indicates that the given register is the low byte.
561       For %[abcd]l we need no REX prefix, but for %{si,di,bp,sp}l we do,
562       as otherwise the encoding indicates %[abcd]h.  Note that the values
563       that are ORed in merely indicate that the REX byte must be present;
564       those bits get discarded in output.  */
565    rex |= opc & (r >= 4 ? P_REXB_R : 0);
566    rex |= opc & (rm >= 4 ? P_REXB_RM : 0);
567
568    if (rex) {
569        tcg_out8(s, (uint8_t)(rex | 0x40));
570    }
571
572    if (opc & (P_EXT | P_EXT38 | P_EXT3A)) {
573        tcg_out8(s, 0x0f);
574        if (opc & P_EXT38) {
575            tcg_out8(s, 0x38);
576        } else if (opc & P_EXT3A) {
577            tcg_out8(s, 0x3a);
578        }
579    }
580
581    tcg_out8(s, opc);
582}
583#else
584static void tcg_out_opc(TCGContext *s, int opc)
585{
586    if (opc & P_DATA16) {
587        tcg_out8(s, 0x66);
588    }
589    if (opc & P_SIMDF3) {
590        tcg_out8(s, 0xf3);
591    } else if (opc & P_SIMDF2) {
592        tcg_out8(s, 0xf2);
593    }
594    if (opc & (P_EXT | P_EXT38 | P_EXT3A)) {
595        tcg_out8(s, 0x0f);
596        if (opc & P_EXT38) {
597            tcg_out8(s, 0x38);
598        } else if (opc & P_EXT3A) {
599            tcg_out8(s, 0x3a);
600        }
601    }
602    tcg_out8(s, opc);
603}
604/* Discard the register arguments to tcg_out_opc early, so as not to penalize
605   the 32-bit compilation paths.  This method works with all versions of gcc,
606   whereas relying on optimization may not be able to exclude them.  */
607#define tcg_out_opc(s, opc, r, rm, x)  (tcg_out_opc)(s, opc)
608#endif
609
610static void tcg_out_modrm(TCGContext *s, int opc, int r, int rm)
611{
612    tcg_out_opc(s, opc, r, rm, 0);
613    tcg_out8(s, 0xc0 | (LOWREGMASK(r) << 3) | LOWREGMASK(rm));
614}
615
616static void tcg_out_vex_opc(TCGContext *s, int opc, int r, int v,
617                            int rm, int index)
618{
619    int tmp;
620
621    /* Use the two byte form if possible, which cannot encode
622       VEX.W, VEX.B, VEX.X, or an m-mmmm field other than P_EXT.  */
623    if ((opc & (P_EXT | P_EXT38 | P_EXT3A | P_VEXW)) == P_EXT
624        && ((rm | index) & 8) == 0) {
625        /* Two byte VEX prefix.  */
626        tcg_out8(s, 0xc5);
627
628        tmp = (r & 8 ? 0 : 0x80);              /* VEX.R */
629    } else {
630        /* Three byte VEX prefix.  */
631        tcg_out8(s, 0xc4);
632
633        /* VEX.m-mmmm */
634        if (opc & P_EXT3A) {
635            tmp = 3;
636        } else if (opc & P_EXT38) {
637            tmp = 2;
638        } else if (opc & P_EXT) {
639            tmp = 1;
640        } else {
641            g_assert_not_reached();
642        }
643        tmp |= (r & 8 ? 0 : 0x80);             /* VEX.R */
644        tmp |= (index & 8 ? 0 : 0x40);         /* VEX.X */
645        tmp |= (rm & 8 ? 0 : 0x20);            /* VEX.B */
646        tcg_out8(s, tmp);
647
648        tmp = (opc & P_VEXW ? 0x80 : 0);       /* VEX.W */
649    }
650
651    tmp |= (opc & P_VEXL ? 0x04 : 0);      /* VEX.L */
652    /* VEX.pp */
653    if (opc & P_DATA16) {
654        tmp |= 1;                          /* 0x66 */
655    } else if (opc & P_SIMDF3) {
656        tmp |= 2;                          /* 0xf3 */
657    } else if (opc & P_SIMDF2) {
658        tmp |= 3;                          /* 0xf2 */
659    }
660    tmp |= (~v & 15) << 3;                 /* VEX.vvvv */
661    tcg_out8(s, tmp);
662    tcg_out8(s, opc);
663}
664
665static void tcg_out_evex_opc(TCGContext *s, int opc, int r, int v,
666                             int rm, int index)
667{
668    /* The entire 4-byte evex prefix; with R' and V' set. */
669    uint32_t p = 0x08041062;
670    int mm, pp;
671
672    tcg_debug_assert(have_avx512vl);
673
674    /* EVEX.mm */
675    if (opc & P_EXT3A) {
676        mm = 3;
677    } else if (opc & P_EXT38) {
678        mm = 2;
679    } else if (opc & P_EXT) {
680        mm = 1;
681    } else {
682        g_assert_not_reached();
683    }
684
685    /* EVEX.pp */
686    if (opc & P_DATA16) {
687        pp = 1;                          /* 0x66 */
688    } else if (opc & P_SIMDF3) {
689        pp = 2;                          /* 0xf3 */
690    } else if (opc & P_SIMDF2) {
691        pp = 3;                          /* 0xf2 */
692    } else {
693        pp = 0;
694    }
695
696    p = deposit32(p, 8, 2, mm);
697    p = deposit32(p, 13, 1, (rm & 8) == 0);             /* EVEX.RXB.B */
698    p = deposit32(p, 14, 1, (index & 8) == 0);          /* EVEX.RXB.X */
699    p = deposit32(p, 15, 1, (r & 8) == 0);              /* EVEX.RXB.R */
700    p = deposit32(p, 16, 2, pp);
701    p = deposit32(p, 19, 4, ~v);
702    p = deposit32(p, 23, 1, (opc & P_VEXW) != 0);
703    p = deposit32(p, 29, 2, (opc & P_VEXL) != 0);
704
705    tcg_out32(s, p);
706    tcg_out8(s, opc);
707}
708
709static void tcg_out_vex_modrm(TCGContext *s, int opc, int r, int v, int rm)
710{
711    if (opc & P_EVEX) {
712        tcg_out_evex_opc(s, opc, r, v, rm, 0);
713    } else {
714        tcg_out_vex_opc(s, opc, r, v, rm, 0);
715    }
716    tcg_out8(s, 0xc0 | (LOWREGMASK(r) << 3) | LOWREGMASK(rm));
717}
718
719/* Output an opcode with a full "rm + (index<<shift) + offset" address mode.
720   We handle either RM and INDEX missing with a negative value.  In 64-bit
721   mode for absolute addresses, ~RM is the size of the immediate operand
722   that will follow the instruction.  */
723
724static void tcg_out_sib_offset(TCGContext *s, int r, int rm, int index,
725                               int shift, intptr_t offset)
726{
727    int mod, len;
728
729    if (index < 0 && rm < 0) {
730        if (TCG_TARGET_REG_BITS == 64) {
731            /* Try for a rip-relative addressing mode.  This has replaced
732               the 32-bit-mode absolute addressing encoding.  */
733            intptr_t pc = (intptr_t)s->code_ptr + 5 + ~rm;
734            intptr_t disp = offset - pc;
735            if (disp == (int32_t)disp) {
736                tcg_out8(s, (LOWREGMASK(r) << 3) | 5);
737                tcg_out32(s, disp);
738                return;
739            }
740
741            /* Try for an absolute address encoding.  This requires the
742               use of the MODRM+SIB encoding and is therefore larger than
743               rip-relative addressing.  */
744            if (offset == (int32_t)offset) {
745                tcg_out8(s, (LOWREGMASK(r) << 3) | 4);
746                tcg_out8(s, (4 << 3) | 5);
747                tcg_out32(s, offset);
748                return;
749            }
750
751            /* ??? The memory isn't directly addressable.  */
752            g_assert_not_reached();
753        } else {
754            /* Absolute address.  */
755            tcg_out8(s, (r << 3) | 5);
756            tcg_out32(s, offset);
757            return;
758        }
759    }
760
761    /* Find the length of the immediate addend.  Note that the encoding
762       that would be used for (%ebp) indicates absolute addressing.  */
763    if (rm < 0) {
764        mod = 0, len = 4, rm = 5;
765    } else if (offset == 0 && LOWREGMASK(rm) != TCG_REG_EBP) {
766        mod = 0, len = 0;
767    } else if (offset == (int8_t)offset) {
768        mod = 0x40, len = 1;
769    } else {
770        mod = 0x80, len = 4;
771    }
772
773    /* Use a single byte MODRM format if possible.  Note that the encoding
774       that would be used for %esp is the escape to the two byte form.  */
775    if (index < 0 && LOWREGMASK(rm) != TCG_REG_ESP) {
776        /* Single byte MODRM format.  */
777        tcg_out8(s, mod | (LOWREGMASK(r) << 3) | LOWREGMASK(rm));
778    } else {
779        /* Two byte MODRM+SIB format.  */
780
781        /* Note that the encoding that would place %esp into the index
782           field indicates no index register.  In 64-bit mode, the REX.X
783           bit counts, so %r12 can be used as the index.  */
784        if (index < 0) {
785            index = 4;
786        } else {
787            tcg_debug_assert(index != TCG_REG_ESP);
788        }
789
790        tcg_out8(s, mod | (LOWREGMASK(r) << 3) | 4);
791        tcg_out8(s, (shift << 6) | (LOWREGMASK(index) << 3) | LOWREGMASK(rm));
792    }
793
794    if (len == 1) {
795        tcg_out8(s, offset);
796    } else if (len == 4) {
797        tcg_out32(s, offset);
798    }
799}
800
801static void tcg_out_modrm_sib_offset(TCGContext *s, int opc, int r, int rm,
802                                     int index, int shift, intptr_t offset)
803{
804    tcg_out_opc(s, opc, r, rm < 0 ? 0 : rm, index < 0 ? 0 : index);
805    tcg_out_sib_offset(s, r, rm, index, shift, offset);
806}
807
808static void tcg_out_vex_modrm_sib_offset(TCGContext *s, int opc, int r, int v,
809                                         int rm, int index, int shift,
810                                         intptr_t offset)
811{
812    tcg_out_vex_opc(s, opc, r, v, rm < 0 ? 0 : rm, index < 0 ? 0 : index);
813    tcg_out_sib_offset(s, r, rm, index, shift, offset);
814}
815
816/* A simplification of the above with no index or shift.  */
817static inline void tcg_out_modrm_offset(TCGContext *s, int opc, int r,
818                                        int rm, intptr_t offset)
819{
820    tcg_out_modrm_sib_offset(s, opc, r, rm, -1, 0, offset);
821}
822
823static inline void tcg_out_vex_modrm_offset(TCGContext *s, int opc, int r,
824                                            int v, int rm, intptr_t offset)
825{
826    tcg_out_vex_modrm_sib_offset(s, opc, r, v, rm, -1, 0, offset);
827}
828
829/* Output an opcode with an expected reference to the constant pool.  */
830static inline void tcg_out_modrm_pool(TCGContext *s, int opc, int r)
831{
832    tcg_out_opc(s, opc, r, 0, 0);
833    /* Absolute for 32-bit, pc-relative for 64-bit.  */
834    tcg_out8(s, LOWREGMASK(r) << 3 | 5);
835    tcg_out32(s, 0);
836}
837
838/* Output an opcode with an expected reference to the constant pool.  */
839static inline void tcg_out_vex_modrm_pool(TCGContext *s, int opc, int r)
840{
841    tcg_out_vex_opc(s, opc, r, 0, 0, 0);
842    /* Absolute for 32-bit, pc-relative for 64-bit.  */
843    tcg_out8(s, LOWREGMASK(r) << 3 | 5);
844    tcg_out32(s, 0);
845}
846
847/* Generate dest op= src.  Uses the same ARITH_* codes as tgen_arithi.  */
848static inline void tgen_arithr(TCGContext *s, int subop, int dest, int src)
849{
850    /* Propagate an opcode prefix, such as P_REXW.  */
851    int ext = subop & ~0x7;
852    subop &= 0x7;
853
854    tcg_out_modrm(s, OPC_ARITH_GvEv + (subop << 3) + ext, dest, src);
855}
856
857static bool tcg_out_mov(TCGContext *s, TCGType type, TCGReg ret, TCGReg arg)
858{
859    int rexw = 0;
860
861    if (arg == ret) {
862        return true;
863    }
864    switch (type) {
865    case TCG_TYPE_I64:
866        rexw = P_REXW;
867        /* fallthru */
868    case TCG_TYPE_I32:
869        if (ret < 16) {
870            if (arg < 16) {
871                tcg_out_modrm(s, OPC_MOVL_GvEv + rexw, ret, arg);
872            } else {
873                tcg_out_vex_modrm(s, OPC_MOVD_EyVy + rexw, arg, 0, ret);
874            }
875        } else {
876            if (arg < 16) {
877                tcg_out_vex_modrm(s, OPC_MOVD_VyEy + rexw, ret, 0, arg);
878            } else {
879                tcg_out_vex_modrm(s, OPC_MOVQ_VqWq, ret, 0, arg);
880            }
881        }
882        break;
883
884    case TCG_TYPE_V64:
885        tcg_debug_assert(ret >= 16 && arg >= 16);
886        tcg_out_vex_modrm(s, OPC_MOVQ_VqWq, ret, 0, arg);
887        break;
888    case TCG_TYPE_V128:
889        tcg_debug_assert(ret >= 16 && arg >= 16);
890        tcg_out_vex_modrm(s, OPC_MOVDQA_VxWx, ret, 0, arg);
891        break;
892    case TCG_TYPE_V256:
893        tcg_debug_assert(ret >= 16 && arg >= 16);
894        tcg_out_vex_modrm(s, OPC_MOVDQA_VxWx | P_VEXL, ret, 0, arg);
895        break;
896
897    default:
898        g_assert_not_reached();
899    }
900    return true;
901}
902
903static const int avx2_dup_insn[4] = {
904    OPC_VPBROADCASTB, OPC_VPBROADCASTW,
905    OPC_VPBROADCASTD, OPC_VPBROADCASTQ,
906};
907
908static bool tcg_out_dup_vec(TCGContext *s, TCGType type, unsigned vece,
909                            TCGReg r, TCGReg a)
910{
911    if (have_avx2) {
912        int vex_l = (type == TCG_TYPE_V256 ? P_VEXL : 0);
913        tcg_out_vex_modrm(s, avx2_dup_insn[vece] + vex_l, r, 0, a);
914    } else {
915        switch (vece) {
916        case MO_8:
917            /* ??? With zero in a register, use PSHUFB.  */
918            tcg_out_vex_modrm(s, OPC_PUNPCKLBW, r, a, a);
919            a = r;
920            /* FALLTHRU */
921        case MO_16:
922            tcg_out_vex_modrm(s, OPC_PUNPCKLWD, r, a, a);
923            a = r;
924            /* FALLTHRU */
925        case MO_32:
926            tcg_out_vex_modrm(s, OPC_PSHUFD, r, 0, a);
927            /* imm8 operand: all output lanes selected from input lane 0.  */
928            tcg_out8(s, 0);
929            break;
930        case MO_64:
931            tcg_out_vex_modrm(s, OPC_PUNPCKLQDQ, r, a, a);
932            break;
933        default:
934            g_assert_not_reached();
935        }
936    }
937    return true;
938}
939
940static bool tcg_out_dupm_vec(TCGContext *s, TCGType type, unsigned vece,
941                             TCGReg r, TCGReg base, intptr_t offset)
942{
943    if (have_avx2) {
944        int vex_l = (type == TCG_TYPE_V256 ? P_VEXL : 0);
945        tcg_out_vex_modrm_offset(s, avx2_dup_insn[vece] + vex_l,
946                                 r, 0, base, offset);
947    } else {
948        switch (vece) {
949        case MO_64:
950            tcg_out_vex_modrm_offset(s, OPC_MOVDDUP, r, 0, base, offset);
951            break;
952        case MO_32:
953            tcg_out_vex_modrm_offset(s, OPC_VBROADCASTSS, r, 0, base, offset);
954            break;
955        case MO_16:
956            tcg_out_vex_modrm_offset(s, OPC_VPINSRW, r, r, base, offset);
957            tcg_out8(s, 0); /* imm8 */
958            tcg_out_dup_vec(s, type, vece, r, r);
959            break;
960        case MO_8:
961            tcg_out_vex_modrm_offset(s, OPC_VPINSRB, r, r, base, offset);
962            tcg_out8(s, 0); /* imm8 */
963            tcg_out_dup_vec(s, type, vece, r, r);
964            break;
965        default:
966            g_assert_not_reached();
967        }
968    }
969    return true;
970}
971
972static void tcg_out_dupi_vec(TCGContext *s, TCGType type, unsigned vece,
973                             TCGReg ret, int64_t arg)
974{
975    int vex_l = (type == TCG_TYPE_V256 ? P_VEXL : 0);
976
977    if (arg == 0) {
978        tcg_out_vex_modrm(s, OPC_PXOR, ret, ret, ret);
979        return;
980    }
981    if (arg == -1) {
982        tcg_out_vex_modrm(s, OPC_PCMPEQB + vex_l, ret, ret, ret);
983        return;
984    }
985
986    if (TCG_TARGET_REG_BITS == 32 && vece < MO_64) {
987        if (have_avx2) {
988            tcg_out_vex_modrm_pool(s, OPC_VPBROADCASTD + vex_l, ret);
989        } else {
990            tcg_out_vex_modrm_pool(s, OPC_VBROADCASTSS, ret);
991        }
992        new_pool_label(s, arg, R_386_32, s->code_ptr - 4, 0);
993    } else {
994        if (type == TCG_TYPE_V64) {
995            tcg_out_vex_modrm_pool(s, OPC_MOVQ_VqWq, ret);
996        } else if (have_avx2) {
997            tcg_out_vex_modrm_pool(s, OPC_VPBROADCASTQ + vex_l, ret);
998        } else {
999            tcg_out_vex_modrm_pool(s, OPC_MOVDDUP, ret);
1000        }
1001        if (TCG_TARGET_REG_BITS == 64) {
1002            new_pool_label(s, arg, R_386_PC32, s->code_ptr - 4, -4);
1003        } else {
1004            new_pool_l2(s, R_386_32, s->code_ptr - 4, 0, arg, arg >> 32);
1005        }
1006    }
1007}
1008
1009static void tcg_out_movi_vec(TCGContext *s, TCGType type,
1010                             TCGReg ret, tcg_target_long arg)
1011{
1012    if (arg == 0) {
1013        tcg_out_vex_modrm(s, OPC_PXOR, ret, ret, ret);
1014        return;
1015    }
1016    if (arg == -1) {
1017        tcg_out_vex_modrm(s, OPC_PCMPEQB, ret, ret, ret);
1018        return;
1019    }
1020
1021    int rexw = (type == TCG_TYPE_I32 ? 0 : P_REXW);
1022    tcg_out_vex_modrm_pool(s, OPC_MOVD_VyEy + rexw, ret);
1023    if (TCG_TARGET_REG_BITS == 64) {
1024        new_pool_label(s, arg, R_386_PC32, s->code_ptr - 4, -4);
1025    } else {
1026        new_pool_label(s, arg, R_386_32, s->code_ptr - 4, 0);
1027    }
1028}
1029
1030static void tcg_out_movi_int(TCGContext *s, TCGType type,
1031                             TCGReg ret, tcg_target_long arg)
1032{
1033    tcg_target_long diff;
1034
1035    if (arg == 0) {
1036        tgen_arithr(s, ARITH_XOR, ret, ret);
1037        return;
1038    }
1039    if (arg == (uint32_t)arg || type == TCG_TYPE_I32) {
1040        tcg_out_opc(s, OPC_MOVL_Iv + LOWREGMASK(ret), 0, ret, 0);
1041        tcg_out32(s, arg);
1042        return;
1043    }
1044    if (arg == (int32_t)arg) {
1045        tcg_out_modrm(s, OPC_MOVL_EvIz + P_REXW, 0, ret);
1046        tcg_out32(s, arg);
1047        return;
1048    }
1049
1050    /* Try a 7 byte pc-relative lea before the 10 byte movq.  */
1051    diff = tcg_pcrel_diff(s, (const void *)arg) - 7;
1052    if (diff == (int32_t)diff) {
1053        tcg_out_opc(s, OPC_LEA | P_REXW, ret, 0, 0);
1054        tcg_out8(s, (LOWREGMASK(ret) << 3) | 5);
1055        tcg_out32(s, diff);
1056        return;
1057    }
1058
1059    tcg_out_opc(s, OPC_MOVL_Iv + P_REXW + LOWREGMASK(ret), 0, ret, 0);
1060    tcg_out64(s, arg);
1061}
1062
1063static void tcg_out_movi(TCGContext *s, TCGType type,
1064                         TCGReg ret, tcg_target_long arg)
1065{
1066    switch (type) {
1067    case TCG_TYPE_I32:
1068#if TCG_TARGET_REG_BITS == 64
1069    case TCG_TYPE_I64:
1070#endif
1071        if (ret < 16) {
1072            tcg_out_movi_int(s, type, ret, arg);
1073        } else {
1074            tcg_out_movi_vec(s, type, ret, arg);
1075        }
1076        break;
1077    default:
1078        g_assert_not_reached();
1079    }
1080}
1081
1082static bool tcg_out_xchg(TCGContext *s, TCGType type, TCGReg r1, TCGReg r2)
1083{
1084    int rexw = type == TCG_TYPE_I32 ? 0 : P_REXW;
1085    tcg_out_modrm(s, OPC_XCHG_EvGv + rexw, r1, r2);
1086    return true;
1087}
1088
1089static void tcg_out_addi_ptr(TCGContext *s, TCGReg rd, TCGReg rs,
1090                             tcg_target_long imm)
1091{
1092    /* This function is only used for passing structs by reference. */
1093    tcg_debug_assert(imm == (int32_t)imm);
1094    tcg_out_modrm_offset(s, OPC_LEA, rd, rs, imm);
1095}
1096
1097static inline void tcg_out_pushi(TCGContext *s, tcg_target_long val)
1098{
1099    if (val == (int8_t)val) {
1100        tcg_out_opc(s, OPC_PUSH_Ib, 0, 0, 0);
1101        tcg_out8(s, val);
1102    } else if (val == (int32_t)val) {
1103        tcg_out_opc(s, OPC_PUSH_Iv, 0, 0, 0);
1104        tcg_out32(s, val);
1105    } else {
1106        g_assert_not_reached();
1107    }
1108}
1109
1110static inline void tcg_out_mb(TCGContext *s, TCGArg a0)
1111{
1112    /* Given the strength of x86 memory ordering, we only need care for
1113       store-load ordering.  Experimentally, "lock orl $0,0(%esp)" is
1114       faster than "mfence", so don't bother with the sse insn.  */
1115    if (a0 & TCG_MO_ST_LD) {
1116        tcg_out8(s, 0xf0);
1117        tcg_out_modrm_offset(s, OPC_ARITH_EvIb, ARITH_OR, TCG_REG_ESP, 0);
1118        tcg_out8(s, 0);
1119    }
1120}
1121
1122static inline void tcg_out_push(TCGContext *s, int reg)
1123{
1124    tcg_out_opc(s, OPC_PUSH_r32 + LOWREGMASK(reg), 0, reg, 0);
1125}
1126
1127static inline void tcg_out_pop(TCGContext *s, int reg)
1128{
1129    tcg_out_opc(s, OPC_POP_r32 + LOWREGMASK(reg), 0, reg, 0);
1130}
1131
1132static void tcg_out_ld(TCGContext *s, TCGType type, TCGReg ret,
1133                       TCGReg arg1, intptr_t arg2)
1134{
1135    switch (type) {
1136    case TCG_TYPE_I32:
1137        if (ret < 16) {
1138            tcg_out_modrm_offset(s, OPC_MOVL_GvEv, ret, arg1, arg2);
1139        } else {
1140            tcg_out_vex_modrm_offset(s, OPC_MOVD_VyEy, ret, 0, arg1, arg2);
1141        }
1142        break;
1143    case TCG_TYPE_I64:
1144        if (ret < 16) {
1145            tcg_out_modrm_offset(s, OPC_MOVL_GvEv | P_REXW, ret, arg1, arg2);
1146            break;
1147        }
1148        /* FALLTHRU */
1149    case TCG_TYPE_V64:
1150        /* There is no instruction that can validate 8-byte alignment.  */
1151        tcg_debug_assert(ret >= 16);
1152        tcg_out_vex_modrm_offset(s, OPC_MOVQ_VqWq, ret, 0, arg1, arg2);
1153        break;
1154    case TCG_TYPE_V128:
1155        /*
1156         * The gvec infrastructure is asserts that v128 vector loads
1157         * and stores use a 16-byte aligned offset.  Validate that the
1158         * final pointer is aligned by using an insn that will SIGSEGV.
1159         */
1160        tcg_debug_assert(ret >= 16);
1161        tcg_out_vex_modrm_offset(s, OPC_MOVDQA_VxWx, ret, 0, arg1, arg2);
1162        break;
1163    case TCG_TYPE_V256:
1164        /*
1165         * The gvec infrastructure only requires 16-byte alignment,
1166         * so here we must use an unaligned load.
1167         */
1168        tcg_debug_assert(ret >= 16);
1169        tcg_out_vex_modrm_offset(s, OPC_MOVDQU_VxWx | P_VEXL,
1170                                 ret, 0, arg1, arg2);
1171        break;
1172    default:
1173        g_assert_not_reached();
1174    }
1175}
1176
1177static void tcg_out_st(TCGContext *s, TCGType type, TCGReg arg,
1178                       TCGReg arg1, intptr_t arg2)
1179{
1180    switch (type) {
1181    case TCG_TYPE_I32:
1182        if (arg < 16) {
1183            tcg_out_modrm_offset(s, OPC_MOVL_EvGv, arg, arg1, arg2);
1184        } else {
1185            tcg_out_vex_modrm_offset(s, OPC_MOVD_EyVy, arg, 0, arg1, arg2);
1186        }
1187        break;
1188    case TCG_TYPE_I64:
1189        if (arg < 16) {
1190            tcg_out_modrm_offset(s, OPC_MOVL_EvGv | P_REXW, arg, arg1, arg2);
1191            break;
1192        }
1193        /* FALLTHRU */
1194    case TCG_TYPE_V64:
1195        /* There is no instruction that can validate 8-byte alignment.  */
1196        tcg_debug_assert(arg >= 16);
1197        tcg_out_vex_modrm_offset(s, OPC_MOVQ_WqVq, arg, 0, arg1, arg2);
1198        break;
1199    case TCG_TYPE_V128:
1200        /*
1201         * The gvec infrastructure is asserts that v128 vector loads
1202         * and stores use a 16-byte aligned offset.  Validate that the
1203         * final pointer is aligned by using an insn that will SIGSEGV.
1204         *
1205         * This specific instance is also used by TCG_CALL_RET_BY_VEC,
1206         * for _WIN64, which must have SSE2 but may not have AVX.
1207         */
1208        tcg_debug_assert(arg >= 16);
1209        if (have_avx1) {
1210            tcg_out_vex_modrm_offset(s, OPC_MOVDQA_WxVx, arg, 0, arg1, arg2);
1211        } else {
1212            tcg_out_modrm_offset(s, OPC_MOVDQA_WxVx, arg, arg1, arg2);
1213        }
1214        break;
1215    case TCG_TYPE_V256:
1216        /*
1217         * The gvec infrastructure only requires 16-byte alignment,
1218         * so here we must use an unaligned store.
1219         */
1220        tcg_debug_assert(arg >= 16);
1221        tcg_out_vex_modrm_offset(s, OPC_MOVDQU_WxVx | P_VEXL,
1222                                 arg, 0, arg1, arg2);
1223        break;
1224    default:
1225        g_assert_not_reached();
1226    }
1227}
1228
1229static bool tcg_out_sti(TCGContext *s, TCGType type, TCGArg val,
1230                        TCGReg base, intptr_t ofs)
1231{
1232    int rexw = 0;
1233    if (TCG_TARGET_REG_BITS == 64 && type == TCG_TYPE_I64) {
1234        if (val != (int32_t)val) {
1235            return false;
1236        }
1237        rexw = P_REXW;
1238    } else if (type != TCG_TYPE_I32) {
1239        return false;
1240    }
1241    tcg_out_modrm_offset(s, OPC_MOVL_EvIz | rexw, 0, base, ofs);
1242    tcg_out32(s, val);
1243    return true;
1244}
1245
1246static void tcg_out_shifti(TCGContext *s, int subopc, int reg, int count)
1247{
1248    /* Propagate an opcode prefix, such as P_DATA16.  */
1249    int ext = subopc & ~0x7;
1250    subopc &= 0x7;
1251
1252    if (count == 1) {
1253        tcg_out_modrm(s, OPC_SHIFT_1 + ext, subopc, reg);
1254    } else {
1255        tcg_out_modrm(s, OPC_SHIFT_Ib + ext, subopc, reg);
1256        tcg_out8(s, count);
1257    }
1258}
1259
1260static inline void tcg_out_bswap32(TCGContext *s, int reg)
1261{
1262    tcg_out_opc(s, OPC_BSWAP + LOWREGMASK(reg), 0, reg, 0);
1263}
1264
1265static inline void tcg_out_rolw_8(TCGContext *s, int reg)
1266{
1267    tcg_out_shifti(s, SHIFT_ROL + P_DATA16, reg, 8);
1268}
1269
1270static void tcg_out_ext8u(TCGContext *s, TCGReg dest, TCGReg src)
1271{
1272    /* movzbl */
1273    tcg_debug_assert(src < 4 || TCG_TARGET_REG_BITS == 64);
1274    tcg_out_modrm(s, OPC_MOVZBL + P_REXB_RM, dest, src);
1275}
1276
1277static void tcg_out_ext8s(TCGContext *s, TCGType type, TCGReg dest, TCGReg src)
1278{
1279    int rexw = type == TCG_TYPE_I32 ? 0 : P_REXW;
1280    /* movsbl */
1281    tcg_debug_assert(src < 4 || TCG_TARGET_REG_BITS == 64);
1282    tcg_out_modrm(s, OPC_MOVSBL + P_REXB_RM + rexw, dest, src);
1283}
1284
1285static void tcg_out_ext16u(TCGContext *s, TCGReg dest, TCGReg src)
1286{
1287    /* movzwl */
1288    tcg_out_modrm(s, OPC_MOVZWL, dest, src);
1289}
1290
1291static void tcg_out_ext16s(TCGContext *s, TCGType type, TCGReg dest, TCGReg src)
1292{
1293    int rexw = type == TCG_TYPE_I32 ? 0 : P_REXW;
1294    /* movsw[lq] */
1295    tcg_out_modrm(s, OPC_MOVSWL + rexw, dest, src);
1296}
1297
1298static void tcg_out_ext32u(TCGContext *s, TCGReg dest, TCGReg src)
1299{
1300    /* 32-bit mov zero extends.  */
1301    tcg_out_modrm(s, OPC_MOVL_GvEv, dest, src);
1302}
1303
1304static void tcg_out_ext32s(TCGContext *s, TCGReg dest, TCGReg src)
1305{
1306    tcg_debug_assert(TCG_TARGET_REG_BITS == 64);
1307    tcg_out_modrm(s, OPC_MOVSLQ, dest, src);
1308}
1309
1310static void tcg_out_exts_i32_i64(TCGContext *s, TCGReg dest, TCGReg src)
1311{
1312    tcg_out_ext32s(s, dest, src);
1313}
1314
1315static void tcg_out_extu_i32_i64(TCGContext *s, TCGReg dest, TCGReg src)
1316{
1317    tcg_out_ext32u(s, dest, src);
1318}
1319
1320static void tcg_out_extrl_i64_i32(TCGContext *s, TCGReg dest, TCGReg src)
1321{
1322    tcg_out_ext32u(s, dest, src);
1323}
1324
1325static inline void tcg_out_bswap64(TCGContext *s, int reg)
1326{
1327    tcg_out_opc(s, OPC_BSWAP + P_REXW + LOWREGMASK(reg), 0, reg, 0);
1328}
1329
1330static void tgen_arithi(TCGContext *s, int c, int r0,
1331                        tcg_target_long val, int cf)
1332{
1333    int rexw = 0;
1334
1335    if (TCG_TARGET_REG_BITS == 64) {
1336        rexw = c & -8;
1337        c &= 7;
1338    }
1339
1340    /* ??? While INC is 2 bytes shorter than ADDL $1, they also induce
1341       partial flags update stalls on Pentium4 and are not recommended
1342       by current Intel optimization manuals.  */
1343    if (!cf && (c == ARITH_ADD || c == ARITH_SUB) && (val == 1 || val == -1)) {
1344        int is_inc = (c == ARITH_ADD) ^ (val < 0);
1345        if (TCG_TARGET_REG_BITS == 64) {
1346            /* The single-byte increment encodings are re-tasked as the
1347               REX prefixes.  Use the MODRM encoding.  */
1348            tcg_out_modrm(s, OPC_GRP5 + rexw,
1349                          (is_inc ? EXT5_INC_Ev : EXT5_DEC_Ev), r0);
1350        } else {
1351            tcg_out8(s, (is_inc ? OPC_INC_r32 : OPC_DEC_r32) + r0);
1352        }
1353        return;
1354    }
1355
1356    if (c == ARITH_AND) {
1357        if (TCG_TARGET_REG_BITS == 64) {
1358            if (val == 0xffffffffu) {
1359                tcg_out_ext32u(s, r0, r0);
1360                return;
1361            }
1362            if (val == (uint32_t)val) {
1363                /* AND with no high bits set can use a 32-bit operation.  */
1364                rexw = 0;
1365            }
1366        }
1367        if (val == 0xffu && (r0 < 4 || TCG_TARGET_REG_BITS == 64)) {
1368            tcg_out_ext8u(s, r0, r0);
1369            return;
1370        }
1371        if (val == 0xffffu) {
1372            tcg_out_ext16u(s, r0, r0);
1373            return;
1374        }
1375    }
1376
1377    if (val == (int8_t)val) {
1378        tcg_out_modrm(s, OPC_ARITH_EvIb + rexw, c, r0);
1379        tcg_out8(s, val);
1380        return;
1381    }
1382    if (rexw == 0 || val == (int32_t)val) {
1383        tcg_out_modrm(s, OPC_ARITH_EvIz + rexw, c, r0);
1384        tcg_out32(s, val);
1385        return;
1386    }
1387
1388    g_assert_not_reached();
1389}
1390
1391static void tcg_out_addi(TCGContext *s, int reg, tcg_target_long val)
1392{
1393    if (val != 0) {
1394        tgen_arithi(s, ARITH_ADD + P_REXW, reg, val, 0);
1395    }
1396}
1397
1398/* Set SMALL to force a short forward branch.  */
1399static void tcg_out_jxx(TCGContext *s, int opc, TCGLabel *l, bool small)
1400{
1401    int32_t val, val1;
1402
1403    if (l->has_value) {
1404        val = tcg_pcrel_diff(s, l->u.value_ptr);
1405        val1 = val - 2;
1406        if ((int8_t)val1 == val1) {
1407            if (opc == -1) {
1408                tcg_out8(s, OPC_JMP_short);
1409            } else {
1410                tcg_out8(s, OPC_JCC_short + opc);
1411            }
1412            tcg_out8(s, val1);
1413        } else {
1414            tcg_debug_assert(!small);
1415            if (opc == -1) {
1416                tcg_out8(s, OPC_JMP_long);
1417                tcg_out32(s, val - 5);
1418            } else {
1419                tcg_out_opc(s, OPC_JCC_long + opc, 0, 0, 0);
1420                tcg_out32(s, val - 6);
1421            }
1422        }
1423    } else if (small) {
1424        if (opc == -1) {
1425            tcg_out8(s, OPC_JMP_short);
1426        } else {
1427            tcg_out8(s, OPC_JCC_short + opc);
1428        }
1429        tcg_out_reloc(s, s->code_ptr, R_386_PC8, l, -1);
1430        s->code_ptr += 1;
1431    } else {
1432        if (opc == -1) {
1433            tcg_out8(s, OPC_JMP_long);
1434        } else {
1435            tcg_out_opc(s, OPC_JCC_long + opc, 0, 0, 0);
1436        }
1437        tcg_out_reloc(s, s->code_ptr, R_386_PC32, l, -4);
1438        s->code_ptr += 4;
1439    }
1440}
1441
1442static void tcg_out_cmp(TCGContext *s, TCGArg arg1, TCGArg arg2,
1443                        int const_arg2, int rexw)
1444{
1445    if (const_arg2) {
1446        if (arg2 == 0) {
1447            /* test r, r */
1448            tcg_out_modrm(s, OPC_TESTL + rexw, arg1, arg1);
1449        } else {
1450            tgen_arithi(s, ARITH_CMP + rexw, arg1, arg2, 0);
1451        }
1452    } else {
1453        tgen_arithr(s, ARITH_CMP + rexw, arg1, arg2);
1454    }
1455}
1456
1457static void tcg_out_brcond32(TCGContext *s, TCGCond cond,
1458                             TCGArg arg1, TCGArg arg2, int const_arg2,
1459                             TCGLabel *label, int small)
1460{
1461    tcg_out_cmp(s, arg1, arg2, const_arg2, 0);
1462    tcg_out_jxx(s, tcg_cond_to_jcc[cond], label, small);
1463}
1464
1465#if TCG_TARGET_REG_BITS == 64
1466static void tcg_out_brcond64(TCGContext *s, TCGCond cond,
1467                             TCGArg arg1, TCGArg arg2, int const_arg2,
1468                             TCGLabel *label, int small)
1469{
1470    tcg_out_cmp(s, arg1, arg2, const_arg2, P_REXW);
1471    tcg_out_jxx(s, tcg_cond_to_jcc[cond], label, small);
1472}
1473#else
1474/* XXX: we implement it at the target level to avoid having to
1475   handle cross basic blocks temporaries */
1476static void tcg_out_brcond2(TCGContext *s, const TCGArg *args,
1477                            const int *const_args, int small)
1478{
1479    TCGLabel *label_next = gen_new_label();
1480    TCGLabel *label_this = arg_label(args[5]);
1481
1482    switch(args[4]) {
1483    case TCG_COND_EQ:
1484        tcg_out_brcond32(s, TCG_COND_NE, args[0], args[2], const_args[2],
1485                         label_next, 1);
1486        tcg_out_brcond32(s, TCG_COND_EQ, args[1], args[3], const_args[3],
1487                         label_this, small);
1488        break;
1489    case TCG_COND_NE:
1490        tcg_out_brcond32(s, TCG_COND_NE, args[0], args[2], const_args[2],
1491                         label_this, small);
1492        tcg_out_brcond32(s, TCG_COND_NE, args[1], args[3], const_args[3],
1493                         label_this, small);
1494        break;
1495    case TCG_COND_LT:
1496        tcg_out_brcond32(s, TCG_COND_LT, args[1], args[3], const_args[3],
1497                         label_this, small);
1498        tcg_out_jxx(s, JCC_JNE, label_next, 1);
1499        tcg_out_brcond32(s, TCG_COND_LTU, args[0], args[2], const_args[2],
1500                         label_this, small);
1501        break;
1502    case TCG_COND_LE:
1503        tcg_out_brcond32(s, TCG_COND_LT, args[1], args[3], const_args[3],
1504                         label_this, small);
1505        tcg_out_jxx(s, JCC_JNE, label_next, 1);
1506        tcg_out_brcond32(s, TCG_COND_LEU, args[0], args[2], const_args[2],
1507                         label_this, small);
1508        break;
1509    case TCG_COND_GT:
1510        tcg_out_brcond32(s, TCG_COND_GT, args[1], args[3], const_args[3],
1511                         label_this, small);
1512        tcg_out_jxx(s, JCC_JNE, label_next, 1);
1513        tcg_out_brcond32(s, TCG_COND_GTU, args[0], args[2], const_args[2],
1514                         label_this, small);
1515        break;
1516    case TCG_COND_GE:
1517        tcg_out_brcond32(s, TCG_COND_GT, args[1], args[3], const_args[3],
1518                         label_this, small);
1519        tcg_out_jxx(s, JCC_JNE, label_next, 1);
1520        tcg_out_brcond32(s, TCG_COND_GEU, args[0], args[2], const_args[2],
1521                         label_this, small);
1522        break;
1523    case TCG_COND_LTU:
1524        tcg_out_brcond32(s, TCG_COND_LTU, args[1], args[3], const_args[3],
1525                         label_this, small);
1526        tcg_out_jxx(s, JCC_JNE, label_next, 1);
1527        tcg_out_brcond32(s, TCG_COND_LTU, args[0], args[2], const_args[2],
1528                         label_this, small);
1529        break;
1530    case TCG_COND_LEU:
1531        tcg_out_brcond32(s, TCG_COND_LTU, args[1], args[3], const_args[3],
1532                         label_this, small);
1533        tcg_out_jxx(s, JCC_JNE, label_next, 1);
1534        tcg_out_brcond32(s, TCG_COND_LEU, args[0], args[2], const_args[2],
1535                         label_this, small);
1536        break;
1537    case TCG_COND_GTU:
1538        tcg_out_brcond32(s, TCG_COND_GTU, args[1], args[3], const_args[3],
1539                         label_this, small);
1540        tcg_out_jxx(s, JCC_JNE, label_next, 1);
1541        tcg_out_brcond32(s, TCG_COND_GTU, args[0], args[2], const_args[2],
1542                         label_this, small);
1543        break;
1544    case TCG_COND_GEU:
1545        tcg_out_brcond32(s, TCG_COND_GTU, args[1], args[3], const_args[3],
1546                         label_this, small);
1547        tcg_out_jxx(s, JCC_JNE, label_next, 1);
1548        tcg_out_brcond32(s, TCG_COND_GEU, args[0], args[2], const_args[2],
1549                         label_this, small);
1550        break;
1551    default:
1552        g_assert_not_reached();
1553    }
1554    tcg_out_label(s, label_next);
1555}
1556#endif
1557
1558static void tcg_out_setcond32(TCGContext *s, TCGCond cond, TCGArg dest,
1559                              TCGArg arg1, TCGArg arg2, int const_arg2)
1560{
1561    tcg_out_cmp(s, arg1, arg2, const_arg2, 0);
1562    tcg_out_modrm(s, OPC_SETCC | tcg_cond_to_jcc[cond], 0, dest);
1563    tcg_out_ext8u(s, dest, dest);
1564}
1565
1566#if TCG_TARGET_REG_BITS == 64
1567static void tcg_out_setcond64(TCGContext *s, TCGCond cond, TCGArg dest,
1568                              TCGArg arg1, TCGArg arg2, int const_arg2)
1569{
1570    tcg_out_cmp(s, arg1, arg2, const_arg2, P_REXW);
1571    tcg_out_modrm(s, OPC_SETCC | tcg_cond_to_jcc[cond], 0, dest);
1572    tcg_out_ext8u(s, dest, dest);
1573}
1574#else
1575static void tcg_out_setcond2(TCGContext *s, const TCGArg *args,
1576                             const int *const_args)
1577{
1578    TCGArg new_args[6];
1579    TCGLabel *label_true, *label_over;
1580
1581    memcpy(new_args, args+1, 5*sizeof(TCGArg));
1582
1583    if (args[0] == args[1] || args[0] == args[2]
1584        || (!const_args[3] && args[0] == args[3])
1585        || (!const_args[4] && args[0] == args[4])) {
1586        /* When the destination overlaps with one of the argument
1587           registers, don't do anything tricky.  */
1588        label_true = gen_new_label();
1589        label_over = gen_new_label();
1590
1591        new_args[5] = label_arg(label_true);
1592        tcg_out_brcond2(s, new_args, const_args+1, 1);
1593
1594        tcg_out_movi(s, TCG_TYPE_I32, args[0], 0);
1595        tcg_out_jxx(s, JCC_JMP, label_over, 1);
1596        tcg_out_label(s, label_true);
1597
1598        tcg_out_movi(s, TCG_TYPE_I32, args[0], 1);
1599        tcg_out_label(s, label_over);
1600    } else {
1601        /* When the destination does not overlap one of the arguments,
1602           clear the destination first, jump if cond false, and emit an
1603           increment in the true case.  This results in smaller code.  */
1604
1605        tcg_out_movi(s, TCG_TYPE_I32, args[0], 0);
1606
1607        label_over = gen_new_label();
1608        new_args[4] = tcg_invert_cond(new_args[4]);
1609        new_args[5] = label_arg(label_over);
1610        tcg_out_brcond2(s, new_args, const_args+1, 1);
1611
1612        tgen_arithi(s, ARITH_ADD, args[0], 1, 0);
1613        tcg_out_label(s, label_over);
1614    }
1615}
1616#endif
1617
1618static void tcg_out_cmov(TCGContext *s, TCGCond cond, int rexw,
1619                         TCGReg dest, TCGReg v1)
1620{
1621    if (have_cmov) {
1622        tcg_out_modrm(s, OPC_CMOVCC | tcg_cond_to_jcc[cond] | rexw, dest, v1);
1623    } else {
1624        TCGLabel *over = gen_new_label();
1625        tcg_out_jxx(s, tcg_cond_to_jcc[tcg_invert_cond(cond)], over, 1);
1626        tcg_out_mov(s, TCG_TYPE_I32, dest, v1);
1627        tcg_out_label(s, over);
1628    }
1629}
1630
1631static void tcg_out_movcond32(TCGContext *s, TCGCond cond, TCGReg dest,
1632                              TCGReg c1, TCGArg c2, int const_c2,
1633                              TCGReg v1)
1634{
1635    tcg_out_cmp(s, c1, c2, const_c2, 0);
1636    tcg_out_cmov(s, cond, 0, dest, v1);
1637}
1638
1639#if TCG_TARGET_REG_BITS == 64
1640static void tcg_out_movcond64(TCGContext *s, TCGCond cond, TCGReg dest,
1641                              TCGReg c1, TCGArg c2, int const_c2,
1642                              TCGReg v1)
1643{
1644    tcg_out_cmp(s, c1, c2, const_c2, P_REXW);
1645    tcg_out_cmov(s, cond, P_REXW, dest, v1);
1646}
1647#endif
1648
1649static void tcg_out_ctz(TCGContext *s, int rexw, TCGReg dest, TCGReg arg1,
1650                        TCGArg arg2, bool const_a2)
1651{
1652    if (have_bmi1) {
1653        tcg_out_modrm(s, OPC_TZCNT + rexw, dest, arg1);
1654        if (const_a2) {
1655            tcg_debug_assert(arg2 == (rexw ? 64 : 32));
1656        } else {
1657            tcg_debug_assert(dest != arg2);
1658            tcg_out_cmov(s, TCG_COND_LTU, rexw, dest, arg2);
1659        }
1660    } else {
1661        tcg_debug_assert(dest != arg2);
1662        tcg_out_modrm(s, OPC_BSF + rexw, dest, arg1);
1663        tcg_out_cmov(s, TCG_COND_EQ, rexw, dest, arg2);
1664    }
1665}
1666
1667static void tcg_out_clz(TCGContext *s, int rexw, TCGReg dest, TCGReg arg1,
1668                        TCGArg arg2, bool const_a2)
1669{
1670    if (have_lzcnt) {
1671        tcg_out_modrm(s, OPC_LZCNT + rexw, dest, arg1);
1672        if (const_a2) {
1673            tcg_debug_assert(arg2 == (rexw ? 64 : 32));
1674        } else {
1675            tcg_debug_assert(dest != arg2);
1676            tcg_out_cmov(s, TCG_COND_LTU, rexw, dest, arg2);
1677        }
1678    } else {
1679        tcg_debug_assert(!const_a2);
1680        tcg_debug_assert(dest != arg1);
1681        tcg_debug_assert(dest != arg2);
1682
1683        /* Recall that the output of BSR is the index not the count.  */
1684        tcg_out_modrm(s, OPC_BSR + rexw, dest, arg1);
1685        tgen_arithi(s, ARITH_XOR + rexw, dest, rexw ? 63 : 31, 0);
1686
1687        /* Since we have destroyed the flags from BSR, we have to re-test.  */
1688        tcg_out_cmp(s, arg1, 0, 1, rexw);
1689        tcg_out_cmov(s, TCG_COND_EQ, rexw, dest, arg2);
1690    }
1691}
1692
1693static void tcg_out_branch(TCGContext *s, int call, const tcg_insn_unit *dest)
1694{
1695    intptr_t disp = tcg_pcrel_diff(s, dest) - 5;
1696
1697    if (disp == (int32_t)disp) {
1698        tcg_out_opc(s, call ? OPC_CALL_Jz : OPC_JMP_long, 0, 0, 0);
1699        tcg_out32(s, disp);
1700    } else {
1701        /* rip-relative addressing into the constant pool.
1702           This is 6 + 8 = 14 bytes, as compared to using an
1703           immediate load 10 + 6 = 16 bytes, plus we may
1704           be able to re-use the pool constant for more calls.  */
1705        tcg_out_opc(s, OPC_GRP5, 0, 0, 0);
1706        tcg_out8(s, (call ? EXT5_CALLN_Ev : EXT5_JMPN_Ev) << 3 | 5);
1707        new_pool_label(s, (uintptr_t)dest, R_386_PC32, s->code_ptr, -4);
1708        tcg_out32(s, 0);
1709    }
1710}
1711
1712static void tcg_out_call(TCGContext *s, const tcg_insn_unit *dest,
1713                         const TCGHelperInfo *info)
1714{
1715    tcg_out_branch(s, 1, dest);
1716
1717#ifndef _WIN32
1718    if (TCG_TARGET_REG_BITS == 32 && info->out_kind == TCG_CALL_RET_BY_REF) {
1719        /*
1720         * The sysv i386 abi for struct return places a reference as the
1721         * first argument of the stack, and pops that argument with the
1722         * return statement.  Since we want to retain the aligned stack
1723         * pointer for the callee, we do not want to actually push that
1724         * argument before the call but rely on the normal store to the
1725         * stack slot.  But we do need to compensate for the pop in order
1726         * to reset our correct stack pointer value.
1727         * Pushing a garbage value back onto the stack is quickest.
1728         */
1729        tcg_out_push(s, TCG_REG_EAX);
1730    }
1731#endif
1732}
1733
1734static void tcg_out_jmp(TCGContext *s, const tcg_insn_unit *dest)
1735{
1736    tcg_out_branch(s, 0, dest);
1737}
1738
1739static void tcg_out_nopn(TCGContext *s, int n)
1740{
1741    int i;
1742    /* Emit 1 or 2 operand size prefixes for the standard one byte nop,
1743     * "xchg %eax,%eax", forming "xchg %ax,%ax". All cores accept the
1744     * duplicate prefix, and all of the interesting recent cores can
1745     * decode and discard the duplicates in a single cycle.
1746     */
1747    tcg_debug_assert(n >= 1);
1748    for (i = 1; i < n; ++i) {
1749        tcg_out8(s, 0x66);
1750    }
1751    tcg_out8(s, 0x90);
1752}
1753
1754/* Test register R vs immediate bits I, setting Z flag for EQ/NE. */
1755static void __attribute__((unused))
1756tcg_out_testi(TCGContext *s, TCGReg r, uint32_t i)
1757{
1758    /*
1759     * This is used for testing alignment, so we can usually use testb.
1760     * For i686, we have to use testl for %esi/%edi.
1761     */
1762    if (i <= 0xff && (TCG_TARGET_REG_BITS == 64 || r < 4)) {
1763        tcg_out_modrm(s, OPC_GRP3_Eb | P_REXB_RM, EXT3_TESTi, r);
1764        tcg_out8(s, i);
1765    } else {
1766        tcg_out_modrm(s, OPC_GRP3_Ev, EXT3_TESTi, r);
1767        tcg_out32(s, i);
1768    }
1769}
1770
/*
 * Host memory operand handed to the direct qemu_ld/st emitters, which
 * forward base, index and ofs to tcg_out_modrm_sib_offset() with a
 * shift of 0.
 */
typedef struct {
    TCGReg base;    /* base register of the access */
    int index;      /* index register; both users in this file set it to -1 */
    int ofs;        /* constant displacement */
    int seg;        /* added to the opcode by the emitters; presumably a
                       segment prefix flag such as P_GS, or 0 -- confirm */
} HostAddress;
1777
1778#if defined(CONFIG_SOFTMMU)
/*
 * Out-of-line load helpers, indexed by the access MemOp masked with
 * (MO_BSWAP | MO_SIZE).  Slots without an initializer are NULL.
 *
 * helper signature: helper_ret_ld_mmu(CPUState *env, target_ulong addr,
 *                                     int mmu_idx, uintptr_t ra)
 *
 * NOTE(review): the slow path in this file passes the full MemOpIdx (oi)
 * in the third argument position, not a bare mmu index -- confirm the
 * prototype against the helper declarations.
 */
static void * const qemu_ld_helpers[(MO_SIZE | MO_BSWAP) + 1] = {
    [MO_UB]   = helper_ret_ldub_mmu,
    [MO_LEUW] = helper_le_lduw_mmu,
    [MO_LEUL] = helper_le_ldul_mmu,
    [MO_LEUQ] = helper_le_ldq_mmu,
    [MO_BEUW] = helper_be_lduw_mmu,
    [MO_BEUL] = helper_be_ldul_mmu,
    [MO_BEUQ] = helper_be_ldq_mmu,
};
1791
/*
 * Out-of-line store helpers, indexed by the access MemOp masked with
 * (MO_BSWAP | MO_SIZE).  Slots without an initializer are NULL.
 *
 * helper signature: helper_ret_st_mmu(CPUState *env, target_ulong addr,
 *                                     uintxx_t val, int mmu_idx, uintptr_t ra)
 *
 * NOTE(review): the slow path in this file passes the full MemOpIdx (oi)
 * in the fourth argument position, not a bare mmu index -- confirm the
 * prototype against the helper declarations.
 */
static void * const qemu_st_helpers[(MO_SIZE | MO_BSWAP) + 1] = {
    [MO_UB]   = helper_ret_stb_mmu,
    [MO_LEUW] = helper_le_stw_mmu,
    [MO_LEUL] = helper_le_stl_mmu,
    [MO_LEUQ] = helper_le_stq_mmu,
    [MO_BEUW] = helper_be_stw_mmu,
    [MO_BEUL] = helper_be_stl_mmu,
    [MO_BEUQ] = helper_be_stq_mmu,
};
1804
1805/* Perform the TLB load and compare.
1806
1807   Inputs:
1808   ADDRLO and ADDRHI contain the low and high part of the address.
1809
1810   MEM_INDEX and S_BITS are the memory context and log2 size of the load.
1811
1812   WHICH is the offset into the CPUTLBEntry structure of the slot to read.
1813   This should be offsetof addr_read or addr_write.
1814
1815   Outputs:
1816   LABEL_PTRS is filled with 1 (32-bit addresses) or 2 (64-bit addresses)
1817   positions of the displacements of forward jumps to the TLB miss case.
1818
1819   Second argument register is loaded with the low part of the address.
1820   In the TLB hit case, it has been adjusted as indicated by the TLB
1821   and so is a host address.  In the TLB miss case, it continues to
1822   hold a guest address.
1823
1824   First argument register is clobbered.  */
1825
1826static inline void tcg_out_tlb_load(TCGContext *s, TCGReg addrlo, TCGReg addrhi,
1827                                    int mem_index, MemOp opc,
1828                                    tcg_insn_unit **label_ptr, int which)
1829{
1830    TCGType ttype = TCG_TYPE_I32;
1831    TCGType tlbtype = TCG_TYPE_I32;
1832    int trexw = 0, hrexw = 0, tlbrexw = 0;
1833    unsigned a_bits = get_alignment_bits(opc);
1834    unsigned s_bits = opc & MO_SIZE;
1835    unsigned a_mask = (1 << a_bits) - 1;
1836    unsigned s_mask = (1 << s_bits) - 1;
1837    target_ulong tlb_mask;
1838
1839    if (TCG_TARGET_REG_BITS == 64) {
1840        if (TARGET_LONG_BITS == 64) {
1841            ttype = TCG_TYPE_I64;
1842            trexw = P_REXW;
1843        }
1844        if (TCG_TYPE_PTR == TCG_TYPE_I64) {
1845            hrexw = P_REXW;
1846            if (TARGET_PAGE_BITS + CPU_TLB_DYN_MAX_BITS > 32) {
1847                tlbtype = TCG_TYPE_I64;
1848                tlbrexw = P_REXW;
1849            }
1850        }
1851    }
1852
1853    tcg_out_mov(s, tlbtype, TCG_REG_L0, addrlo);
1854    tcg_out_shifti(s, SHIFT_SHR + tlbrexw, TCG_REG_L0,
1855                   TARGET_PAGE_BITS - CPU_TLB_ENTRY_BITS);
1856
1857    tcg_out_modrm_offset(s, OPC_AND_GvEv + trexw, TCG_REG_L0, TCG_AREG0,
1858                         TLB_MASK_TABLE_OFS(mem_index) +
1859                         offsetof(CPUTLBDescFast, mask));
1860
1861    tcg_out_modrm_offset(s, OPC_ADD_GvEv + hrexw, TCG_REG_L0, TCG_AREG0,
1862                         TLB_MASK_TABLE_OFS(mem_index) +
1863                         offsetof(CPUTLBDescFast, table));
1864
1865    /* If the required alignment is at least as large as the access, simply
1866       copy the address and mask.  For lesser alignments, check that we don't
1867       cross pages for the complete access.  */
1868    if (a_bits >= s_bits) {
1869        tcg_out_mov(s, ttype, TCG_REG_L1, addrlo);
1870    } else {
1871        tcg_out_modrm_offset(s, OPC_LEA + trexw, TCG_REG_L1,
1872                             addrlo, s_mask - a_mask);
1873    }
1874    tlb_mask = (target_ulong)TARGET_PAGE_MASK | a_mask;
1875    tgen_arithi(s, ARITH_AND + trexw, TCG_REG_L1, tlb_mask, 0);
1876
1877    /* cmp 0(TCG_REG_L0), TCG_REG_L1 */
1878    tcg_out_modrm_offset(s, OPC_CMP_GvEv + trexw,
1879                         TCG_REG_L1, TCG_REG_L0, which);
1880
1881    /* Prepare for both the fast path add of the tlb addend, and the slow
1882       path function argument setup.  */
1883    tcg_out_mov(s, ttype, TCG_REG_L1, addrlo);
1884
1885    /* jne slow_path */
1886    tcg_out_opc(s, OPC_JCC_long + JCC_JNE, 0, 0, 0);
1887    label_ptr[0] = s->code_ptr;
1888    s->code_ptr += 4;
1889
1890    if (TARGET_LONG_BITS > TCG_TARGET_REG_BITS) {
1891        /* cmp 4(TCG_REG_L0), addrhi */
1892        tcg_out_modrm_offset(s, OPC_CMP_GvEv, addrhi, TCG_REG_L0, which + 4);
1893
1894        /* jne slow_path */
1895        tcg_out_opc(s, OPC_JCC_long + JCC_JNE, 0, 0, 0);
1896        label_ptr[1] = s->code_ptr;
1897        s->code_ptr += 4;
1898    }
1899
1900    /* TLB Hit.  */
1901
1902    /* add addend(TCG_REG_L0), TCG_REG_L1 */
1903    tcg_out_modrm_offset(s, OPC_ADD_GvEv + hrexw, TCG_REG_L1, TCG_REG_L0,
1904                         offsetof(CPUTLBEntry, addend));
1905}
1906
1907/*
1908 * Record the context of a call to the out of line helper code for the slow path
1909 * for a load or store, so that we can later generate the correct helper code
1910 */
1911static void add_qemu_ldst_label(TCGContext *s, bool is_ld,
1912                                TCGType type, MemOpIdx oi,
1913                                TCGReg datalo, TCGReg datahi,
1914                                TCGReg addrlo, TCGReg addrhi,
1915                                tcg_insn_unit *raddr,
1916                                tcg_insn_unit **label_ptr)
1917{
1918    TCGLabelQemuLdst *label = new_ldst_label(s);
1919
1920    label->is_ld = is_ld;
1921    label->oi = oi;
1922    label->type = type;
1923    label->datalo_reg = datalo;
1924    label->datahi_reg = datahi;
1925    label->addrlo_reg = addrlo;
1926    label->addrhi_reg = addrhi;
1927    label->raddr = tcg_splitwx_to_rx(raddr);
1928    label->label_ptr[0] = label_ptr[0];
1929    if (TARGET_LONG_BITS > TCG_TARGET_REG_BITS) {
1930        label->label_ptr[1] = label_ptr[1];
1931    }
1932}
1933
1934/*
1935 * Generate code for the slow path for a load at the end of block
1936 */
1937static bool tcg_out_qemu_ld_slow_path(TCGContext *s, TCGLabelQemuLdst *l)
1938{
1939    MemOpIdx oi = l->oi;
1940    MemOp opc = get_memop(oi);
1941    tcg_insn_unit **label_ptr = &l->label_ptr[0];
1942
1943    /* resolve label address */
1944    tcg_patch32(label_ptr[0], s->code_ptr - label_ptr[0] - 4);
1945    if (TARGET_LONG_BITS > TCG_TARGET_REG_BITS) {
1946        tcg_patch32(label_ptr[1], s->code_ptr - label_ptr[1] - 4);
1947    }
1948
1949    if (TCG_TARGET_REG_BITS == 32) {
1950        int ofs = 0;
1951
1952        tcg_out_st(s, TCG_TYPE_PTR, TCG_AREG0, TCG_REG_ESP, ofs);
1953        ofs += 4;
1954
1955        tcg_out_st(s, TCG_TYPE_I32, l->addrlo_reg, TCG_REG_ESP, ofs);
1956        ofs += 4;
1957
1958        if (TARGET_LONG_BITS == 64) {
1959            tcg_out_st(s, TCG_TYPE_I32, l->addrhi_reg, TCG_REG_ESP, ofs);
1960            ofs += 4;
1961        }
1962
1963        tcg_out_sti(s, TCG_TYPE_I32, oi, TCG_REG_ESP, ofs);
1964        ofs += 4;
1965
1966        tcg_out_sti(s, TCG_TYPE_PTR, (uintptr_t)l->raddr, TCG_REG_ESP, ofs);
1967    } else {
1968        tcg_out_mov(s, TCG_TYPE_PTR, tcg_target_call_iarg_regs[0], TCG_AREG0);
1969        /* The second argument is already loaded with addrlo.  */
1970        tcg_out_movi(s, TCG_TYPE_I32, tcg_target_call_iarg_regs[2], oi);
1971        tcg_out_movi(s, TCG_TYPE_PTR, tcg_target_call_iarg_regs[3],
1972                     (uintptr_t)l->raddr);
1973    }
1974
1975    tcg_out_branch(s, 1, qemu_ld_helpers[opc & (MO_BSWAP | MO_SIZE)]);
1976
1977    if (TCG_TARGET_REG_BITS == 32 && (opc & MO_SIZE) == MO_64) {
1978        TCGMovExtend ext[2] = {
1979            { .dst = l->datalo_reg, .dst_type = TCG_TYPE_I32,
1980              .src = TCG_REG_EAX, .src_type = TCG_TYPE_I32, .src_ext = MO_UL },
1981            { .dst = l->datahi_reg, .dst_type = TCG_TYPE_I32,
1982              .src = TCG_REG_EDX, .src_type = TCG_TYPE_I32, .src_ext = MO_UL },
1983        };
1984        tcg_out_movext2(s, &ext[0], &ext[1], -1);
1985    } else {
1986        tcg_out_movext(s, l->type, l->datalo_reg,
1987                       TCG_TYPE_REG, opc & MO_SSIZE, TCG_REG_EAX);
1988    }
1989
1990    /* Jump to the code corresponding to next IR of qemu_st */
1991    tcg_out_jmp(s, l->raddr);
1992    return true;
1993}
1994
1995/*
1996 * Generate code for the slow path for a store at the end of block
1997 */
1998static bool tcg_out_qemu_st_slow_path(TCGContext *s, TCGLabelQemuLdst *l)
1999{
2000    MemOpIdx oi = l->oi;
2001    MemOp opc = get_memop(oi);
2002    MemOp s_bits = opc & MO_SIZE;
2003    tcg_insn_unit **label_ptr = &l->label_ptr[0];
2004    TCGReg retaddr;
2005
2006    /* resolve label address */
2007    tcg_patch32(label_ptr[0], s->code_ptr - label_ptr[0] - 4);
2008    if (TARGET_LONG_BITS > TCG_TARGET_REG_BITS) {
2009        tcg_patch32(label_ptr[1], s->code_ptr - label_ptr[1] - 4);
2010    }
2011
2012    if (TCG_TARGET_REG_BITS == 32) {
2013        int ofs = 0;
2014
2015        tcg_out_st(s, TCG_TYPE_PTR, TCG_AREG0, TCG_REG_ESP, ofs);
2016        ofs += 4;
2017
2018        tcg_out_st(s, TCG_TYPE_I32, l->addrlo_reg, TCG_REG_ESP, ofs);
2019        ofs += 4;
2020
2021        if (TARGET_LONG_BITS == 64) {
2022            tcg_out_st(s, TCG_TYPE_I32, l->addrhi_reg, TCG_REG_ESP, ofs);
2023            ofs += 4;
2024        }
2025
2026        tcg_out_st(s, TCG_TYPE_I32, l->datalo_reg, TCG_REG_ESP, ofs);
2027        ofs += 4;
2028
2029        if (s_bits == MO_64) {
2030            tcg_out_st(s, TCG_TYPE_I32, l->datahi_reg, TCG_REG_ESP, ofs);
2031            ofs += 4;
2032        }
2033
2034        tcg_out_sti(s, TCG_TYPE_I32, oi, TCG_REG_ESP, ofs);
2035        ofs += 4;
2036
2037        retaddr = TCG_REG_EAX;
2038        tcg_out_movi(s, TCG_TYPE_PTR, retaddr, (uintptr_t)l->raddr);
2039        tcg_out_st(s, TCG_TYPE_PTR, retaddr, TCG_REG_ESP, ofs);
2040    } else {
2041        tcg_out_mov(s, TCG_TYPE_PTR, tcg_target_call_iarg_regs[0], TCG_AREG0);
2042        /* The second argument is already loaded with addrlo.  */
2043        tcg_out_mov(s, (s_bits == MO_64 ? TCG_TYPE_I64 : TCG_TYPE_I32),
2044                    tcg_target_call_iarg_regs[2], l->datalo_reg);
2045        tcg_out_movi(s, TCG_TYPE_I32, tcg_target_call_iarg_regs[3], oi);
2046
2047        if (ARRAY_SIZE(tcg_target_call_iarg_regs) > 4) {
2048            retaddr = tcg_target_call_iarg_regs[4];
2049            tcg_out_movi(s, TCG_TYPE_PTR, retaddr, (uintptr_t)l->raddr);
2050        } else {
2051            retaddr = TCG_REG_RAX;
2052            tcg_out_movi(s, TCG_TYPE_PTR, retaddr, (uintptr_t)l->raddr);
2053            tcg_out_st(s, TCG_TYPE_PTR, retaddr, TCG_REG_ESP,
2054                       TCG_TARGET_CALL_STACK_OFFSET);
2055        }
2056    }
2057
2058    /* "Tail call" to the helper, with the return address back inline.  */
2059    tcg_out_push(s, retaddr);
2060    tcg_out_jmp(s, qemu_st_helpers[opc & (MO_BSWAP | MO_SIZE)]);
2061    return true;
2062}
2063#else
2064
2065static void tcg_out_test_alignment(TCGContext *s, bool is_ld, TCGReg addrlo,
2066                                   TCGReg addrhi, unsigned a_bits)
2067{
2068    unsigned a_mask = (1 << a_bits) - 1;
2069    TCGLabelQemuLdst *label;
2070
2071    tcg_out_testi(s, addrlo, a_mask);
2072    /* jne slow_path */
2073    tcg_out_opc(s, OPC_JCC_long + JCC_JNE, 0, 0, 0);
2074
2075    label = new_ldst_label(s);
2076    label->is_ld = is_ld;
2077    label->addrlo_reg = addrlo;
2078    label->addrhi_reg = addrhi;
2079    label->raddr = tcg_splitwx_to_rx(s->code_ptr + 4);
2080    label->label_ptr[0] = s->code_ptr;
2081
2082    s->code_ptr += 4;
2083}
2084
2085static bool tcg_out_fail_alignment(TCGContext *s, TCGLabelQemuLdst *l)
2086{
2087    /* resolve label address */
2088    tcg_patch32(l->label_ptr[0], s->code_ptr - l->label_ptr[0] - 4);
2089
2090    if (TCG_TARGET_REG_BITS == 32) {
2091        int ofs = 0;
2092
2093        tcg_out_st(s, TCG_TYPE_PTR, TCG_AREG0, TCG_REG_ESP, ofs);
2094        ofs += 4;
2095
2096        tcg_out_st(s, TCG_TYPE_I32, l->addrlo_reg, TCG_REG_ESP, ofs);
2097        ofs += 4;
2098        if (TARGET_LONG_BITS == 64) {
2099            tcg_out_st(s, TCG_TYPE_I32, l->addrhi_reg, TCG_REG_ESP, ofs);
2100            ofs += 4;
2101        }
2102
2103        tcg_out_pushi(s, (uintptr_t)l->raddr);
2104    } else {
2105        tcg_out_mov(s, TCG_TYPE_TL, tcg_target_call_iarg_regs[1],
2106                    l->addrlo_reg);
2107        tcg_out_mov(s, TCG_TYPE_PTR, tcg_target_call_iarg_regs[0], TCG_AREG0);
2108
2109        tcg_out_movi(s, TCG_TYPE_PTR, TCG_REG_RAX, (uintptr_t)l->raddr);
2110        tcg_out_push(s, TCG_REG_RAX);
2111    }
2112
2113    /* "Tail call" to the helper, with the return address back inline. */
2114    tcg_out_jmp(s, (const void *)(l->is_ld ? helper_unaligned_ld
2115                                  : helper_unaligned_st));
2116    return true;
2117}
2118
2119static bool tcg_out_qemu_ld_slow_path(TCGContext *s, TCGLabelQemuLdst *l)
2120{
2121    return tcg_out_fail_alignment(s, l);
2122}
2123
2124static bool tcg_out_qemu_st_slow_path(TCGContext *s, TCGLabelQemuLdst *l)
2125{
2126    return tcg_out_fail_alignment(s, l);
2127}
2128
/*
 * Template HostAddress for user-only guest accesses: tcg_out_qemu_ld/st
 * copy it and then overwrite .base with the guest address register.
 * Only .index is given an explicit value (-1) here; the other members
 * start out zero-initialized.
 * NOTE(review): the object is not const, so it is presumably adjusted
 * elsewhere once the guest base is known -- confirm.
 */
static HostAddress x86_guest_base = {
    .index = -1
};
2132
2133#if defined(__x86_64__) && defined(__linux__)
2134# include <asm/prctl.h>
2135# include <sys/prctl.h>
2136int arch_prctl(int code, unsigned long addr);
2137static inline int setup_guest_base_seg(void)
2138{
2139    if (arch_prctl(ARCH_SET_GS, guest_base) == 0) {
2140        return P_GS;
2141    }
2142    return 0;
2143}
2144#elif defined(__x86_64__) && \
2145      (defined (__FreeBSD__) || defined (__FreeBSD_kernel__))
2146# include <machine/sysarch.h>
2147static inline int setup_guest_base_seg(void)
2148{
2149    if (sysarch(AMD64_SET_GSBASE, &guest_base) == 0) {
2150        return P_GS;
2151    }
2152    return 0;
2153}
2154#else
2155static inline int setup_guest_base_seg(void)
2156{
2157    return 0;
2158}
2159#endif /* setup_guest_base_seg */
2160#endif /* SOFTMMU */
2161
2162static void tcg_out_qemu_ld_direct(TCGContext *s, TCGReg datalo, TCGReg datahi,
2163                                   HostAddress h, TCGType type, MemOp memop)
2164{
2165    bool use_movbe = false;
2166    int rexw = (type == TCG_TYPE_I32 ? 0 : P_REXW);
2167    int movop = OPC_MOVL_GvEv;
2168
2169    /* Do big-endian loads with movbe.  */
2170    if (memop & MO_BSWAP) {
2171        tcg_debug_assert(have_movbe);
2172        use_movbe = true;
2173        movop = OPC_MOVBE_GyMy;
2174    }
2175
2176    switch (memop & MO_SSIZE) {
2177    case MO_UB:
2178        tcg_out_modrm_sib_offset(s, OPC_MOVZBL + h.seg, datalo,
2179                                 h.base, h.index, 0, h.ofs);
2180        break;
2181    case MO_SB:
2182        tcg_out_modrm_sib_offset(s, OPC_MOVSBL + rexw + h.seg, datalo,
2183                                 h.base, h.index, 0, h.ofs);
2184        break;
2185    case MO_UW:
2186        if (use_movbe) {
2187            /* There is no extending movbe; only low 16-bits are modified.  */
2188            if (datalo != h.base && datalo != h.index) {
2189                /* XOR breaks dependency chains.  */
2190                tgen_arithr(s, ARITH_XOR, datalo, datalo);
2191                tcg_out_modrm_sib_offset(s, OPC_MOVBE_GyMy + P_DATA16 + h.seg,
2192                                         datalo, h.base, h.index, 0, h.ofs);
2193            } else {
2194                tcg_out_modrm_sib_offset(s, OPC_MOVBE_GyMy + P_DATA16 + h.seg,
2195                                         datalo, h.base, h.index, 0, h.ofs);
2196                tcg_out_ext16u(s, datalo, datalo);
2197            }
2198        } else {
2199            tcg_out_modrm_sib_offset(s, OPC_MOVZWL + h.seg, datalo,
2200                                     h.base, h.index, 0, h.ofs);
2201        }
2202        break;
2203    case MO_SW:
2204        if (use_movbe) {
2205            tcg_out_modrm_sib_offset(s, OPC_MOVBE_GyMy + P_DATA16 + h.seg,
2206                                     datalo, h.base, h.index, 0, h.ofs);
2207            tcg_out_ext16s(s, type, datalo, datalo);
2208        } else {
2209            tcg_out_modrm_sib_offset(s, OPC_MOVSWL + rexw + h.seg,
2210                                     datalo, h.base, h.index, 0, h.ofs);
2211        }
2212        break;
2213    case MO_UL:
2214        tcg_out_modrm_sib_offset(s, movop + h.seg, datalo,
2215                                 h.base, h.index, 0, h.ofs);
2216        break;
2217#if TCG_TARGET_REG_BITS == 64
2218    case MO_SL:
2219        if (use_movbe) {
2220            tcg_out_modrm_sib_offset(s, OPC_MOVBE_GyMy + h.seg, datalo,
2221                                     h.base, h.index, 0, h.ofs);
2222            tcg_out_ext32s(s, datalo, datalo);
2223        } else {
2224            tcg_out_modrm_sib_offset(s, OPC_MOVSLQ + h.seg, datalo,
2225                                     h.base, h.index, 0, h.ofs);
2226        }
2227        break;
2228#endif
2229    case MO_UQ:
2230        if (TCG_TARGET_REG_BITS == 64) {
2231            tcg_out_modrm_sib_offset(s, movop + P_REXW + h.seg, datalo,
2232                                     h.base, h.index, 0, h.ofs);
2233            break;
2234        }
2235        if (use_movbe) {
2236            TCGReg t = datalo;
2237            datalo = datahi;
2238            datahi = t;
2239        }
2240        if (h.base == datalo || h.index == datalo) {
2241            tcg_out_modrm_sib_offset(s, OPC_LEA, datahi,
2242                                     h.base, h.index, 0, h.ofs);
2243            tcg_out_modrm_offset(s, movop + h.seg, datalo, datahi, 0);
2244            tcg_out_modrm_offset(s, movop + h.seg, datahi, datahi, 4);
2245        } else {
2246            tcg_out_modrm_sib_offset(s, movop + h.seg, datalo,
2247                                     h.base, h.index, 0, h.ofs);
2248            tcg_out_modrm_sib_offset(s, movop + h.seg, datahi,
2249                                     h.base, h.index, 0, h.ofs + 4);
2250        }
2251        break;
2252    default:
2253        g_assert_not_reached();
2254    }
2255}
2256
2257static void tcg_out_qemu_ld(TCGContext *s, TCGReg datalo, TCGReg datahi,
2258                            TCGReg addrlo, TCGReg addrhi,
2259                            MemOpIdx oi, TCGType data_type)
2260{
2261    MemOp opc = get_memop(oi);
2262    HostAddress h;
2263
2264#if defined(CONFIG_SOFTMMU)
2265    tcg_insn_unit *label_ptr[2];
2266
2267    tcg_out_tlb_load(s, addrlo, addrhi, get_mmuidx(oi), opc,
2268                     label_ptr, offsetof(CPUTLBEntry, addr_read));
2269
2270    /* TLB Hit.  */
2271    h.base = TCG_REG_L1;
2272    h.index = -1;
2273    h.ofs = 0;
2274    h.seg = 0;
2275    tcg_out_qemu_ld_direct(s, datalo, datahi, h, data_type, opc);
2276
2277    /* Record the current context of a load into ldst label */
2278    add_qemu_ldst_label(s, true, data_type, oi, datalo, datahi,
2279                        addrlo, addrhi, s->code_ptr, label_ptr);
2280#else
2281    unsigned a_bits = get_alignment_bits(opc);
2282    if (a_bits) {
2283        tcg_out_test_alignment(s, true, addrlo, addrhi, a_bits);
2284    }
2285
2286    h = x86_guest_base;
2287    h.base = addrlo;
2288    tcg_out_qemu_ld_direct(s, datalo, datahi, h, data_type, opc);
2289#endif
2290}
2291
2292static void tcg_out_qemu_st_direct(TCGContext *s, TCGReg datalo, TCGReg datahi,
2293                                   HostAddress h, MemOp memop)
2294{
2295    bool use_movbe = false;
2296    int movop = OPC_MOVL_EvGv;
2297
2298    /*
2299     * Do big-endian stores with movbe or softmmu.
2300     * User-only without movbe will have its swapping done generically.
2301     */
2302    if (memop & MO_BSWAP) {
2303        tcg_debug_assert(have_movbe);
2304        use_movbe = true;
2305        movop = OPC_MOVBE_MyGy;
2306    }
2307
2308    switch (memop & MO_SIZE) {
2309    case MO_8:
2310        /* This is handled with constraints on INDEX_op_qemu_st8_i32. */
2311        tcg_debug_assert(TCG_TARGET_REG_BITS == 64 || datalo < 4);
2312        tcg_out_modrm_sib_offset(s, OPC_MOVB_EvGv + P_REXB_R + h.seg,
2313                                 datalo, h.base, h.index, 0, h.ofs);
2314        break;
2315    case MO_16:
2316        tcg_out_modrm_sib_offset(s, movop + P_DATA16 + h.seg, datalo,
2317                                 h.base, h.index, 0, h.ofs);
2318        break;
2319    case MO_32:
2320        tcg_out_modrm_sib_offset(s, movop + h.seg, datalo,
2321                                 h.base, h.index, 0, h.ofs);
2322        break;
2323    case MO_64:
2324        if (TCG_TARGET_REG_BITS == 64) {
2325            tcg_out_modrm_sib_offset(s, movop + P_REXW + h.seg, datalo,
2326                                     h.base, h.index, 0, h.ofs);
2327        } else {
2328            if (use_movbe) {
2329                TCGReg t = datalo;
2330                datalo = datahi;
2331                datahi = t;
2332            }
2333            tcg_out_modrm_sib_offset(s, movop + h.seg, datalo,
2334                                     h.base, h.index, 0, h.ofs);
2335            tcg_out_modrm_sib_offset(s, movop + h.seg, datahi,
2336                                     h.base, h.index, 0, h.ofs + 4);
2337        }
2338        break;
2339    default:
2340        g_assert_not_reached();
2341    }
2342}
2343
2344static void tcg_out_qemu_st(TCGContext *s, TCGReg datalo, TCGReg datahi,
2345                            TCGReg addrlo, TCGReg addrhi,
2346                            MemOpIdx oi, TCGType data_type)
2347{
2348    MemOp opc = get_memop(oi);
2349    HostAddress h;
2350
2351#if defined(CONFIG_SOFTMMU)
2352    tcg_insn_unit *label_ptr[2];
2353
2354    tcg_out_tlb_load(s, addrlo, addrhi, get_mmuidx(oi), opc,
2355                     label_ptr, offsetof(CPUTLBEntry, addr_write));
2356
2357    /* TLB Hit.  */
2358    h.base = TCG_REG_L1;
2359    h.index = -1;
2360    h.ofs = 0;
2361    h.seg = 0;
2362    tcg_out_qemu_st_direct(s, datalo, datahi, h, opc);
2363
2364    /* Record the current context of a store into ldst label */
2365    add_qemu_ldst_label(s, false, data_type, oi, datalo, datahi,
2366                        addrlo, addrhi, s->code_ptr, label_ptr);
2367#else
2368    unsigned a_bits = get_alignment_bits(opc);
2369    if (a_bits) {
2370        tcg_out_test_alignment(s, false, addrlo, addrhi, a_bits);
2371    }
2372
2373    h = x86_guest_base;
2374    h.base = addrlo;
2375
2376    tcg_out_qemu_st_direct(s, datalo, datahi, h, opc);
2377#endif
2378}
2379
2380static void tcg_out_exit_tb(TCGContext *s, uintptr_t a0)
2381{
2382    /* Reuse the zeroing that exists for goto_ptr.  */
2383    if (a0 == 0) {
2384        tcg_out_jmp(s, tcg_code_gen_epilogue);
2385    } else {
2386        tcg_out_movi(s, TCG_TYPE_PTR, TCG_REG_EAX, a0);
2387        tcg_out_jmp(s, tb_ret_addr);
2388    }
2389}
2390
2391static void tcg_out_goto_tb(TCGContext *s, int which)
2392{
2393    /*
2394     * Jump displacement must be aligned for atomic patching;
2395     * see if we need to add extra nops before jump
2396     */
2397    int gap = QEMU_ALIGN_PTR_UP(s->code_ptr + 1, 4) - s->code_ptr;
2398    if (gap != 1) {
2399        tcg_out_nopn(s, gap - 1);
2400    }
2401    tcg_out8(s, OPC_JMP_long); /* jmp im */
2402    set_jmp_insn_offset(s, which);
2403    tcg_out32(s, 0);
2404    set_jmp_reset_offset(s, which);
2405}
2406
2407void tb_target_set_jmp_target(const TranslationBlock *tb, int n,
2408                              uintptr_t jmp_rx, uintptr_t jmp_rw)
2409{
2410    /* patch the branch destination */
2411    uintptr_t addr = tb->jmp_target_addr[n];
2412    qatomic_set((int32_t *)jmp_rw, addr - (jmp_rx + 4));
2413    /* no need to flush icache explicitly */
2414}
2415
2416static inline void tcg_out_op(TCGContext *s, TCGOpcode opc,
2417                              const TCGArg args[TCG_MAX_OP_ARGS],
2418                              const int const_args[TCG_MAX_OP_ARGS])
2419{
2420    TCGArg a0, a1, a2;
2421    int c, const_a2, vexop, rexw = 0;
2422
2423#if TCG_TARGET_REG_BITS == 64
2424# define OP_32_64(x) \
2425        case glue(glue(INDEX_op_, x), _i64): \
2426            rexw = P_REXW; /* FALLTHRU */    \
2427        case glue(glue(INDEX_op_, x), _i32)
2428#else
2429# define OP_32_64(x) \
2430        case glue(glue(INDEX_op_, x), _i32)
2431#endif
2432
2433    /* Hoist the loads of the most common arguments.  */
2434    a0 = args[0];
2435    a1 = args[1];
2436    a2 = args[2];
2437    const_a2 = const_args[2];
2438
2439    switch (opc) {
2440    case INDEX_op_goto_ptr:
2441        /* jmp to the given host address (could be epilogue) */
2442        tcg_out_modrm(s, OPC_GRP5, EXT5_JMPN_Ev, a0);
2443        break;
2444    case INDEX_op_br:
2445        tcg_out_jxx(s, JCC_JMP, arg_label(a0), 0);
2446        break;
2447    OP_32_64(ld8u):
2448        /* Note that we can ignore REXW for the zero-extend to 64-bit.  */
2449        tcg_out_modrm_offset(s, OPC_MOVZBL, a0, a1, a2);
2450        break;
2451    OP_32_64(ld8s):
2452        tcg_out_modrm_offset(s, OPC_MOVSBL + rexw, a0, a1, a2);
2453        break;
2454    OP_32_64(ld16u):
2455        /* Note that we can ignore REXW for the zero-extend to 64-bit.  */
2456        tcg_out_modrm_offset(s, OPC_MOVZWL, a0, a1, a2);
2457        break;
2458    OP_32_64(ld16s):
2459        tcg_out_modrm_offset(s, OPC_MOVSWL + rexw, a0, a1, a2);
2460        break;
2461#if TCG_TARGET_REG_BITS == 64
2462    case INDEX_op_ld32u_i64:
2463#endif
2464    case INDEX_op_ld_i32:
2465        tcg_out_ld(s, TCG_TYPE_I32, a0, a1, a2);
2466        break;
2467
2468    OP_32_64(st8):
2469        if (const_args[0]) {
2470            tcg_out_modrm_offset(s, OPC_MOVB_EvIz, 0, a1, a2);
2471            tcg_out8(s, a0);
2472        } else {
2473            tcg_out_modrm_offset(s, OPC_MOVB_EvGv | P_REXB_R, a0, a1, a2);
2474        }
2475        break;
2476    OP_32_64(st16):
2477        if (const_args[0]) {
2478            tcg_out_modrm_offset(s, OPC_MOVL_EvIz | P_DATA16, 0, a1, a2);
2479            tcg_out16(s, a0);
2480        } else {
2481            tcg_out_modrm_offset(s, OPC_MOVL_EvGv | P_DATA16, a0, a1, a2);
2482        }
2483        break;
2484#if TCG_TARGET_REG_BITS == 64
2485    case INDEX_op_st32_i64:
2486#endif
2487    case INDEX_op_st_i32:
2488        if (const_args[0]) {
2489            tcg_out_modrm_offset(s, OPC_MOVL_EvIz, 0, a1, a2);
2490            tcg_out32(s, a0);
2491        } else {
2492            tcg_out_st(s, TCG_TYPE_I32, a0, a1, a2);
2493        }
2494        break;
2495
2496    OP_32_64(add):
2497        /* For 3-operand addition, use LEA.  */
2498        if (a0 != a1) {
2499            TCGArg c3 = 0;
2500            if (const_a2) {
2501                c3 = a2, a2 = -1;
2502            } else if (a0 == a2) {
2503                /* Watch out for dest = src + dest, since we've removed
2504                   the matching constraint on the add.  */
2505                tgen_arithr(s, ARITH_ADD + rexw, a0, a1);
2506                break;
2507            }
2508
2509            tcg_out_modrm_sib_offset(s, OPC_LEA + rexw, a0, a1, a2, 0, c3);
2510            break;
2511        }
2512        c = ARITH_ADD;
2513        goto gen_arith;
2514    OP_32_64(sub):
2515        c = ARITH_SUB;
2516        goto gen_arith;
2517    OP_32_64(and):
2518        c = ARITH_AND;
2519        goto gen_arith;
2520    OP_32_64(or):
2521        c = ARITH_OR;
2522        goto gen_arith;
2523    OP_32_64(xor):
2524        c = ARITH_XOR;
2525        goto gen_arith;
2526    gen_arith:
2527        if (const_a2) {
2528            tgen_arithi(s, c + rexw, a0, a2, 0);
2529        } else {
2530            tgen_arithr(s, c + rexw, a0, a2);
2531        }
2532        break;
2533
2534    OP_32_64(andc):
2535        if (const_a2) {
2536            tcg_out_mov(s, rexw ? TCG_TYPE_I64 : TCG_TYPE_I32, a0, a1);
2537            tgen_arithi(s, ARITH_AND + rexw, a0, ~a2, 0);
2538        } else {
2539            tcg_out_vex_modrm(s, OPC_ANDN + rexw, a0, a2, a1);
2540        }
2541        break;
2542
2543    OP_32_64(mul):
2544        if (const_a2) {
2545            int32_t val;
2546            val = a2;
2547            if (val == (int8_t)val) {
2548                tcg_out_modrm(s, OPC_IMUL_GvEvIb + rexw, a0, a0);
2549                tcg_out8(s, val);
2550            } else {
2551                tcg_out_modrm(s, OPC_IMUL_GvEvIz + rexw, a0, a0);
2552                tcg_out32(s, val);
2553            }
2554        } else {
2555            tcg_out_modrm(s, OPC_IMUL_GvEv + rexw, a0, a2);
2556        }
2557        break;
2558
2559    OP_32_64(div2):
2560        tcg_out_modrm(s, OPC_GRP3_Ev + rexw, EXT3_IDIV, args[4]);
2561        break;
2562    OP_32_64(divu2):
2563        tcg_out_modrm(s, OPC_GRP3_Ev + rexw, EXT3_DIV, args[4]);
2564        break;
2565
2566    OP_32_64(shl):
2567        /* For small constant 3-operand shift, use LEA.  */
2568        if (const_a2 && a0 != a1 && (a2 - 1) < 3) {
2569            if (a2 - 1 == 0) {
2570                /* shl $1,a1,a0 -> lea (a1,a1),a0 */
2571                tcg_out_modrm_sib_offset(s, OPC_LEA + rexw, a0, a1, a1, 0, 0);
2572            } else {
2573                /* shl $n,a1,a0 -> lea 0(,a1,n),a0 */
2574                tcg_out_modrm_sib_offset(s, OPC_LEA + rexw, a0, -1, a1, a2, 0);
2575            }
2576            break;
2577        }
2578        c = SHIFT_SHL;
2579        vexop = OPC_SHLX;
2580        goto gen_shift_maybe_vex;
2581    OP_32_64(shr):
2582        c = SHIFT_SHR;
2583        vexop = OPC_SHRX;
2584        goto gen_shift_maybe_vex;
2585    OP_32_64(sar):
2586        c = SHIFT_SAR;
2587        vexop = OPC_SARX;
2588        goto gen_shift_maybe_vex;
2589    OP_32_64(rotl):
2590        c = SHIFT_ROL;
2591        goto gen_shift;
2592    OP_32_64(rotr):
2593        c = SHIFT_ROR;
2594        goto gen_shift;
2595    gen_shift_maybe_vex:
2596        if (have_bmi2) {
2597            if (!const_a2) {
2598                tcg_out_vex_modrm(s, vexop + rexw, a0, a2, a1);
2599                break;
2600            }
2601            tcg_out_mov(s, rexw ? TCG_TYPE_I64 : TCG_TYPE_I32, a0, a1);
2602        }
2603        /* FALLTHRU */
2604    gen_shift:
2605        if (const_a2) {
2606            tcg_out_shifti(s, c + rexw, a0, a2);
2607        } else {
2608            tcg_out_modrm(s, OPC_SHIFT_cl + rexw, c, a0);
2609        }
2610        break;
2611
2612    OP_32_64(ctz):
2613        tcg_out_ctz(s, rexw, args[0], args[1], args[2], const_args[2]);
2614        break;
2615    OP_32_64(clz):
2616        tcg_out_clz(s, rexw, args[0], args[1], args[2], const_args[2]);
2617        break;
2618    OP_32_64(ctpop):
2619        tcg_out_modrm(s, OPC_POPCNT + rexw, a0, a1);
2620        break;
2621
2622    case INDEX_op_brcond_i32:
2623        tcg_out_brcond32(s, a2, a0, a1, const_args[1], arg_label(args[3]), 0);
2624        break;
2625    case INDEX_op_setcond_i32:
2626        tcg_out_setcond32(s, args[3], a0, a1, a2, const_a2);
2627        break;
2628    case INDEX_op_movcond_i32:
2629        tcg_out_movcond32(s, args[5], a0, a1, a2, const_a2, args[3]);
2630        break;
2631
2632    OP_32_64(bswap16):
2633        if (a2 & TCG_BSWAP_OS) {
2634            /* Output must be sign-extended. */
2635            if (rexw) {
2636                tcg_out_bswap64(s, a0);
2637                tcg_out_shifti(s, SHIFT_SAR + rexw, a0, 48);
2638            } else {
2639                tcg_out_bswap32(s, a0);
2640                tcg_out_shifti(s, SHIFT_SAR, a0, 16);
2641            }
2642        } else if ((a2 & (TCG_BSWAP_IZ | TCG_BSWAP_OZ)) == TCG_BSWAP_OZ) {
2643            /* Output must be zero-extended, but input isn't. */
2644            tcg_out_bswap32(s, a0);
2645            tcg_out_shifti(s, SHIFT_SHR, a0, 16);
2646        } else {
2647            tcg_out_rolw_8(s, a0);
2648        }
2649        break;
2650    OP_32_64(bswap32):
2651        tcg_out_bswap32(s, a0);
2652        if (rexw && (a2 & TCG_BSWAP_OS)) {
2653            tcg_out_ext32s(s, a0, a0);
2654        }
2655        break;
2656
2657    OP_32_64(neg):
2658        tcg_out_modrm(s, OPC_GRP3_Ev + rexw, EXT3_NEG, a0);
2659        break;
2660    OP_32_64(not):
2661        tcg_out_modrm(s, OPC_GRP3_Ev + rexw, EXT3_NOT, a0);
2662        break;
2663
2664    case INDEX_op_qemu_ld_i32:
2665        if (TCG_TARGET_REG_BITS >= TARGET_LONG_BITS) {
2666            tcg_out_qemu_ld(s, a0, -1, a1, -1, a2, TCG_TYPE_I32);
2667        } else {
2668            tcg_out_qemu_ld(s, a0, -1, a1, a2, args[3], TCG_TYPE_I32);
2669        }
2670        break;
2671    case INDEX_op_qemu_ld_i64:
2672        if (TCG_TARGET_REG_BITS == 64) {
2673            tcg_out_qemu_ld(s, a0, -1, a1, -1, a2, TCG_TYPE_I64);
2674        } else if (TARGET_LONG_BITS == 32) {
2675            tcg_out_qemu_ld(s, a0, a1, a2, -1, args[3], TCG_TYPE_I64);
2676        } else {
2677            tcg_out_qemu_ld(s, a0, a1, a2, args[3], args[4], TCG_TYPE_I64);
2678        }
2679        break;
2680    case INDEX_op_qemu_st_i32:
2681    case INDEX_op_qemu_st8_i32:
2682        if (TCG_TARGET_REG_BITS >= TARGET_LONG_BITS) {
2683            tcg_out_qemu_st(s, a0, -1, a1, -1, a2, TCG_TYPE_I32);
2684        } else {
2685            tcg_out_qemu_st(s, a0, -1, a1, a2, args[3], TCG_TYPE_I32);
2686        }
2687        break;
2688    case INDEX_op_qemu_st_i64:
2689        if (TCG_TARGET_REG_BITS == 64) {
2690            tcg_out_qemu_st(s, a0, -1, a1, -1, a2, TCG_TYPE_I64);
2691        } else if (TARGET_LONG_BITS == 32) {
2692            tcg_out_qemu_st(s, a0, a1, a2, -1, args[3], TCG_TYPE_I64);
2693        } else {
2694            tcg_out_qemu_st(s, a0, a1, a2, args[3], args[4], TCG_TYPE_I64);
2695        }
2696        break;
2697
2698    OP_32_64(mulu2):
2699        tcg_out_modrm(s, OPC_GRP3_Ev + rexw, EXT3_MUL, args[3]);
2700        break;
2701    OP_32_64(muls2):
2702        tcg_out_modrm(s, OPC_GRP3_Ev + rexw, EXT3_IMUL, args[3]);
2703        break;
2704    OP_32_64(add2):
2705        if (const_args[4]) {
2706            tgen_arithi(s, ARITH_ADD + rexw, a0, args[4], 1);
2707        } else {
2708            tgen_arithr(s, ARITH_ADD + rexw, a0, args[4]);
2709        }
2710        if (const_args[5]) {
2711            tgen_arithi(s, ARITH_ADC + rexw, a1, args[5], 1);
2712        } else {
2713            tgen_arithr(s, ARITH_ADC + rexw, a1, args[5]);
2714        }
2715        break;
2716    OP_32_64(sub2):
2717        if (const_args[4]) {
2718            tgen_arithi(s, ARITH_SUB + rexw, a0, args[4], 1);
2719        } else {
2720            tgen_arithr(s, ARITH_SUB + rexw, a0, args[4]);
2721        }
2722        if (const_args[5]) {
2723            tgen_arithi(s, ARITH_SBB + rexw, a1, args[5], 1);
2724        } else {
2725            tgen_arithr(s, ARITH_SBB + rexw, a1, args[5]);
2726        }
2727        break;
2728
2729#if TCG_TARGET_REG_BITS == 32
2730    case INDEX_op_brcond2_i32:
2731        tcg_out_brcond2(s, args, const_args, 0);
2732        break;
2733    case INDEX_op_setcond2_i32:
2734        tcg_out_setcond2(s, args, const_args);
2735        break;
2736#else /* TCG_TARGET_REG_BITS == 64 */
2737    case INDEX_op_ld32s_i64:
2738        tcg_out_modrm_offset(s, OPC_MOVSLQ, a0, a1, a2);
2739        break;
2740    case INDEX_op_ld_i64:
2741        tcg_out_ld(s, TCG_TYPE_I64, a0, a1, a2);
2742        break;
2743    case INDEX_op_st_i64:
2744        if (const_args[0]) {
2745            tcg_out_modrm_offset(s, OPC_MOVL_EvIz | P_REXW, 0, a1, a2);
2746            tcg_out32(s, a0);
2747        } else {
2748            tcg_out_st(s, TCG_TYPE_I64, a0, a1, a2);
2749        }
2750        break;
2751
2752    case INDEX_op_brcond_i64:
2753        tcg_out_brcond64(s, a2, a0, a1, const_args[1], arg_label(args[3]), 0);
2754        break;
2755    case INDEX_op_setcond_i64:
2756        tcg_out_setcond64(s, args[3], a0, a1, a2, const_a2);
2757        break;
2758    case INDEX_op_movcond_i64:
2759        tcg_out_movcond64(s, args[5], a0, a1, a2, const_a2, args[3]);
2760        break;
2761
2762    case INDEX_op_bswap64_i64:
2763        tcg_out_bswap64(s, a0);
2764        break;
2765    case INDEX_op_extrh_i64_i32:
2766        tcg_out_shifti(s, SHIFT_SHR + P_REXW, a0, 32);
2767        break;
2768#endif
2769
2770    OP_32_64(deposit):
2771        if (args[3] == 0 && args[4] == 8) {
2772            /* load bits 0..7 */
2773            tcg_out_modrm(s, OPC_MOVB_EvGv | P_REXB_R | P_REXB_RM, a2, a0);
2774        } else if (args[3] == 8 && args[4] == 8) {
2775            /* load bits 8..15 */
2776            tcg_out_modrm(s, OPC_MOVB_EvGv, a2, a0 + 4);
2777        } else if (args[3] == 0 && args[4] == 16) {
2778            /* load bits 0..15 */
2779            tcg_out_modrm(s, OPC_MOVL_EvGv | P_DATA16, a2, a0);
2780        } else {
2781            g_assert_not_reached();
2782        }
2783        break;
2784
2785    case INDEX_op_extract_i64:
2786        if (a2 + args[3] == 32) {
2787            /* This is a 32-bit zero-extending right shift.  */
2788            tcg_out_mov(s, TCG_TYPE_I32, a0, a1);
2789            tcg_out_shifti(s, SHIFT_SHR, a0, a2);
2790            break;
2791        }
2792        /* FALLTHRU */
2793    case INDEX_op_extract_i32:
2794        /* On the off-chance that we can use the high-byte registers.
2795           Otherwise we emit the same ext16 + shift pattern that we
2796           would have gotten from the normal tcg-op.c expansion.  */
2797        tcg_debug_assert(a2 == 8 && args[3] == 8);
2798        if (a1 < 4 && a0 < 8) {
2799            tcg_out_modrm(s, OPC_MOVZBL, a0, a1 + 4);
2800        } else {
2801            tcg_out_ext16u(s, a0, a1);
2802            tcg_out_shifti(s, SHIFT_SHR, a0, 8);
2803        }
2804        break;
2805
2806    case INDEX_op_sextract_i32:
2807        /* We don't implement sextract_i64, as we cannot sign-extend to
2808           64-bits without using the REX prefix that explicitly excludes
2809           access to the high-byte registers.  */
2810        tcg_debug_assert(a2 == 8 && args[3] == 8);
2811        if (a1 < 4 && a0 < 8) {
2812            tcg_out_modrm(s, OPC_MOVSBL, a0, a1 + 4);
2813        } else {
2814            tcg_out_ext16s(s, TCG_TYPE_I32, a0, a1);
2815            tcg_out_shifti(s, SHIFT_SAR, a0, 8);
2816        }
2817        break;
2818
2819    OP_32_64(extract2):
2820        /* Note that SHRD outputs to the r/m operand.  */
2821        tcg_out_modrm(s, OPC_SHRD_Ib + rexw, a2, a0);
2822        tcg_out8(s, args[3]);
2823        break;
2824
2825    case INDEX_op_mb:
2826        tcg_out_mb(s, a0);
2827        break;
2828    case INDEX_op_mov_i32:  /* Always emitted via tcg_out_mov.  */
2829    case INDEX_op_mov_i64:
2830    case INDEX_op_call:     /* Always emitted via tcg_out_call.  */
2831    case INDEX_op_exit_tb:  /* Always emitted via tcg_out_exit_tb.  */
2832    case INDEX_op_goto_tb:  /* Always emitted via tcg_out_goto_tb.  */
2833    case INDEX_op_ext8s_i32:  /* Always emitted via tcg_reg_alloc_op.  */
2834    case INDEX_op_ext8s_i64:
2835    case INDEX_op_ext8u_i32:
2836    case INDEX_op_ext8u_i64:
2837    case INDEX_op_ext16s_i32:
2838    case INDEX_op_ext16s_i64:
2839    case INDEX_op_ext16u_i32:
2840    case INDEX_op_ext16u_i64:
2841    case INDEX_op_ext32s_i64:
2842    case INDEX_op_ext32u_i64:
2843    case INDEX_op_ext_i32_i64:
2844    case INDEX_op_extu_i32_i64:
2845    case INDEX_op_extrl_i64_i32:
2846    default:
2847        g_assert_not_reached();
2848    }
2849
2850#undef OP_32_64
2851}
2852
2853static void tcg_out_vec_op(TCGContext *s, TCGOpcode opc,
2854                           unsigned vecl, unsigned vece,
2855                           const TCGArg args[TCG_MAX_OP_ARGS],
2856                           const int const_args[TCG_MAX_OP_ARGS])
2857{
2858    static int const add_insn[4] = {
2859        OPC_PADDB, OPC_PADDW, OPC_PADDD, OPC_PADDQ
2860    };
2861    static int const ssadd_insn[4] = {
2862        OPC_PADDSB, OPC_PADDSW, OPC_UD2, OPC_UD2
2863    };
2864    static int const usadd_insn[4] = {
2865        OPC_PADDUB, OPC_PADDUW, OPC_UD2, OPC_UD2
2866    };
2867    static int const sub_insn[4] = {
2868        OPC_PSUBB, OPC_PSUBW, OPC_PSUBD, OPC_PSUBQ
2869    };
2870    static int const sssub_insn[4] = {
2871        OPC_PSUBSB, OPC_PSUBSW, OPC_UD2, OPC_UD2
2872    };
2873    static int const ussub_insn[4] = {
2874        OPC_PSUBUB, OPC_PSUBUW, OPC_UD2, OPC_UD2
2875    };
2876    static int const mul_insn[4] = {
2877        OPC_UD2, OPC_PMULLW, OPC_PMULLD, OPC_VPMULLQ
2878    };
2879    static int const shift_imm_insn[4] = {
2880        OPC_UD2, OPC_PSHIFTW_Ib, OPC_PSHIFTD_Ib, OPC_PSHIFTQ_Ib
2881    };
2882    static int const cmpeq_insn[4] = {
2883        OPC_PCMPEQB, OPC_PCMPEQW, OPC_PCMPEQD, OPC_PCMPEQQ
2884    };
2885    static int const cmpgt_insn[4] = {
2886        OPC_PCMPGTB, OPC_PCMPGTW, OPC_PCMPGTD, OPC_PCMPGTQ
2887    };
2888    static int const punpckl_insn[4] = {
2889        OPC_PUNPCKLBW, OPC_PUNPCKLWD, OPC_PUNPCKLDQ, OPC_PUNPCKLQDQ
2890    };
2891    static int const punpckh_insn[4] = {
2892        OPC_PUNPCKHBW, OPC_PUNPCKHWD, OPC_PUNPCKHDQ, OPC_PUNPCKHQDQ
2893    };
2894    static int const packss_insn[4] = {
2895        OPC_PACKSSWB, OPC_PACKSSDW, OPC_UD2, OPC_UD2
2896    };
2897    static int const packus_insn[4] = {
2898        OPC_PACKUSWB, OPC_PACKUSDW, OPC_UD2, OPC_UD2
2899    };
2900    static int const smin_insn[4] = {
2901        OPC_PMINSB, OPC_PMINSW, OPC_PMINSD, OPC_VPMINSQ
2902    };
2903    static int const smax_insn[4] = {
2904        OPC_PMAXSB, OPC_PMAXSW, OPC_PMAXSD, OPC_VPMAXSQ
2905    };
2906    static int const umin_insn[4] = {
2907        OPC_PMINUB, OPC_PMINUW, OPC_PMINUD, OPC_VPMINUQ
2908    };
2909    static int const umax_insn[4] = {
2910        OPC_PMAXUB, OPC_PMAXUW, OPC_PMAXUD, OPC_VPMAXUQ
2911    };
2912    static int const rotlv_insn[4] = {
2913        OPC_UD2, OPC_UD2, OPC_VPROLVD, OPC_VPROLVQ
2914    };
2915    static int const rotrv_insn[4] = {
2916        OPC_UD2, OPC_UD2, OPC_VPRORVD, OPC_VPRORVQ
2917    };
2918    static int const shlv_insn[4] = {
2919        OPC_UD2, OPC_VPSLLVW, OPC_VPSLLVD, OPC_VPSLLVQ
2920    };
2921    static int const shrv_insn[4] = {
2922        OPC_UD2, OPC_VPSRLVW, OPC_VPSRLVD, OPC_VPSRLVQ
2923    };
2924    static int const sarv_insn[4] = {
2925        OPC_UD2, OPC_VPSRAVW, OPC_VPSRAVD, OPC_VPSRAVQ
2926    };
2927    static int const shls_insn[4] = {
2928        OPC_UD2, OPC_PSLLW, OPC_PSLLD, OPC_PSLLQ
2929    };
2930    static int const shrs_insn[4] = {
2931        OPC_UD2, OPC_PSRLW, OPC_PSRLD, OPC_PSRLQ
2932    };
2933    static int const sars_insn[4] = {
2934        OPC_UD2, OPC_PSRAW, OPC_PSRAD, OPC_VPSRAQ
2935    };
2936    static int const vpshldi_insn[4] = {
2937        OPC_UD2, OPC_VPSHLDW, OPC_VPSHLDD, OPC_VPSHLDQ
2938    };
2939    static int const vpshldv_insn[4] = {
2940        OPC_UD2, OPC_VPSHLDVW, OPC_VPSHLDVD, OPC_VPSHLDVQ
2941    };
2942    static int const vpshrdv_insn[4] = {
2943        OPC_UD2, OPC_VPSHRDVW, OPC_VPSHRDVD, OPC_VPSHRDVQ
2944    };
2945    static int const abs_insn[4] = {
2946        OPC_PABSB, OPC_PABSW, OPC_PABSD, OPC_VPABSQ
2947    };
2948
2949    TCGType type = vecl + TCG_TYPE_V64;
2950    int insn, sub;
2951    TCGArg a0, a1, a2, a3;
2952
2953    a0 = args[0];
2954    a1 = args[1];
2955    a2 = args[2];
2956
2957    switch (opc) {
2958    case INDEX_op_add_vec:
2959        insn = add_insn[vece];
2960        goto gen_simd;
2961    case INDEX_op_ssadd_vec:
2962        insn = ssadd_insn[vece];
2963        goto gen_simd;
2964    case INDEX_op_usadd_vec:
2965        insn = usadd_insn[vece];
2966        goto gen_simd;
2967    case INDEX_op_sub_vec:
2968        insn = sub_insn[vece];
2969        goto gen_simd;
2970    case INDEX_op_sssub_vec:
2971        insn = sssub_insn[vece];
2972        goto gen_simd;
2973    case INDEX_op_ussub_vec:
2974        insn = ussub_insn[vece];
2975        goto gen_simd;
2976    case INDEX_op_mul_vec:
2977        insn = mul_insn[vece];
2978        goto gen_simd;
2979    case INDEX_op_and_vec:
2980        insn = OPC_PAND;
2981        goto gen_simd;
2982    case INDEX_op_or_vec:
2983        insn = OPC_POR;
2984        goto gen_simd;
2985    case INDEX_op_xor_vec:
2986        insn = OPC_PXOR;
2987        goto gen_simd;
2988    case INDEX_op_smin_vec:
2989        insn = smin_insn[vece];
2990        goto gen_simd;
2991    case INDEX_op_umin_vec:
2992        insn = umin_insn[vece];
2993        goto gen_simd;
2994    case INDEX_op_smax_vec:
2995        insn = smax_insn[vece];
2996        goto gen_simd;
2997    case INDEX_op_umax_vec:
2998        insn = umax_insn[vece];
2999        goto gen_simd;
3000    case INDEX_op_shlv_vec:
3001        insn = shlv_insn[vece];
3002        goto gen_simd;
3003    case INDEX_op_shrv_vec:
3004        insn = shrv_insn[vece];
3005        goto gen_simd;
3006    case INDEX_op_sarv_vec:
3007        insn = sarv_insn[vece];
3008        goto gen_simd;
3009    case INDEX_op_rotlv_vec:
3010        insn = rotlv_insn[vece];
3011        goto gen_simd;
3012    case INDEX_op_rotrv_vec:
3013        insn = rotrv_insn[vece];
3014        goto gen_simd;
3015    case INDEX_op_shls_vec:
3016        insn = shls_insn[vece];
3017        goto gen_simd;
3018    case INDEX_op_shrs_vec:
3019        insn = shrs_insn[vece];
3020        goto gen_simd;
3021    case INDEX_op_sars_vec:
3022        insn = sars_insn[vece];
3023        goto gen_simd;
3024    case INDEX_op_x86_punpckl_vec:
3025        insn = punpckl_insn[vece];
3026        goto gen_simd;
3027    case INDEX_op_x86_punpckh_vec:
3028        insn = punpckh_insn[vece];
3029        goto gen_simd;
3030    case INDEX_op_x86_packss_vec:
3031        insn = packss_insn[vece];
3032        goto gen_simd;
3033    case INDEX_op_x86_packus_vec:
3034        insn = packus_insn[vece];
3035        goto gen_simd;
3036    case INDEX_op_x86_vpshldv_vec:
3037        insn = vpshldv_insn[vece];
3038        a1 = a2;
3039        a2 = args[3];
3040        goto gen_simd;
3041    case INDEX_op_x86_vpshrdv_vec:
3042        insn = vpshrdv_insn[vece];
3043        a1 = a2;
3044        a2 = args[3];
3045        goto gen_simd;
3046#if TCG_TARGET_REG_BITS == 32
3047    case INDEX_op_dup2_vec:
3048        /* First merge the two 32-bit inputs to a single 64-bit element. */
3049        tcg_out_vex_modrm(s, OPC_PUNPCKLDQ, a0, a1, a2);
3050        /* Then replicate the 64-bit elements across the rest of the vector. */
3051        if (type != TCG_TYPE_V64) {
3052            tcg_out_dup_vec(s, type, MO_64, a0, a0);
3053        }
3054        break;
3055#endif
3056    case INDEX_op_abs_vec:
3057        insn = abs_insn[vece];
3058        a2 = a1;
3059        a1 = 0;
3060        goto gen_simd;
3061    gen_simd:
3062        tcg_debug_assert(insn != OPC_UD2);
3063        if (type == TCG_TYPE_V256) {
3064            insn |= P_VEXL;
3065        }
3066        tcg_out_vex_modrm(s, insn, a0, a1, a2);
3067        break;
3068
3069    case INDEX_op_cmp_vec:
3070        sub = args[3];
3071        if (sub == TCG_COND_EQ) {
3072            insn = cmpeq_insn[vece];
3073        } else if (sub == TCG_COND_GT) {
3074            insn = cmpgt_insn[vece];
3075        } else {
3076            g_assert_not_reached();
3077        }
3078        goto gen_simd;
3079
3080    case INDEX_op_andc_vec:
3081        insn = OPC_PANDN;
3082        if (type == TCG_TYPE_V256) {
3083            insn |= P_VEXL;
3084        }
3085        tcg_out_vex_modrm(s, insn, a0, a2, a1);
3086        break;
3087
3088    case INDEX_op_shli_vec:
3089        insn = shift_imm_insn[vece];
3090        sub = 6;
3091        goto gen_shift;
3092    case INDEX_op_shri_vec:
3093        insn = shift_imm_insn[vece];
3094        sub = 2;
3095        goto gen_shift;
3096    case INDEX_op_sari_vec:
3097        if (vece == MO_64) {
3098            insn = OPC_PSHIFTD_Ib | P_VEXW | P_EVEX;
3099        } else {
3100            insn = shift_imm_insn[vece];
3101        }
3102        sub = 4;
3103        goto gen_shift;
3104    case INDEX_op_rotli_vec:
3105        insn = OPC_PSHIFTD_Ib | P_EVEX;  /* VPROL[DQ] */
3106        if (vece == MO_64) {
3107            insn |= P_VEXW;
3108        }
3109        sub = 1;
3110        goto gen_shift;
3111    gen_shift:
3112        tcg_debug_assert(vece != MO_8);
3113        if (type == TCG_TYPE_V256) {
3114            insn |= P_VEXL;
3115        }
3116        tcg_out_vex_modrm(s, insn, sub, a0, a1);
3117        tcg_out8(s, a2);
3118        break;
3119
3120    case INDEX_op_ld_vec:
3121        tcg_out_ld(s, type, a0, a1, a2);
3122        break;
3123    case INDEX_op_st_vec:
3124        tcg_out_st(s, type, a0, a1, a2);
3125        break;
3126    case INDEX_op_dupm_vec:
3127        tcg_out_dupm_vec(s, type, vece, a0, a1, a2);
3128        break;
3129
3130    case INDEX_op_x86_shufps_vec:
3131        insn = OPC_SHUFPS;
3132        sub = args[3];
3133        goto gen_simd_imm8;
3134    case INDEX_op_x86_blend_vec:
3135        if (vece == MO_16) {
3136            insn = OPC_PBLENDW;
3137        } else if (vece == MO_32) {
3138            insn = (have_avx2 ? OPC_VPBLENDD : OPC_BLENDPS);
3139        } else {
3140            g_assert_not_reached();
3141        }
3142        sub = args[3];
3143        goto gen_simd_imm8;
3144    case INDEX_op_x86_vperm2i128_vec:
3145        insn = OPC_VPERM2I128;
3146        sub = args[3];
3147        goto gen_simd_imm8;
3148    case INDEX_op_x86_vpshldi_vec:
3149        insn = vpshldi_insn[vece];
3150        sub = args[3];
3151        goto gen_simd_imm8;
3152
3153    case INDEX_op_not_vec:
3154        insn = OPC_VPTERNLOGQ;
3155        a2 = a1;
3156        sub = 0x33; /* !B */
3157        goto gen_simd_imm8;
3158    case INDEX_op_nor_vec:
3159        insn = OPC_VPTERNLOGQ;
3160        sub = 0x11; /* norCB */
3161        goto gen_simd_imm8;
3162    case INDEX_op_nand_vec:
3163        insn = OPC_VPTERNLOGQ;
3164        sub = 0x77; /* nandCB */
3165        goto gen_simd_imm8;
3166    case INDEX_op_eqv_vec:
3167        insn = OPC_VPTERNLOGQ;
3168        sub = 0x99; /* xnorCB */
3169        goto gen_simd_imm8;
3170    case INDEX_op_orc_vec:
3171        insn = OPC_VPTERNLOGQ;
3172        sub = 0xdd; /* orB!C */
3173        goto gen_simd_imm8;
3174
3175    case INDEX_op_bitsel_vec:
3176        insn = OPC_VPTERNLOGQ;
3177        a3 = args[3];
3178        if (a0 == a1) {
3179            a1 = a2;
3180            a2 = a3;
3181            sub = 0xca; /* A?B:C */
3182        } else if (a0 == a2) {
3183            a2 = a3;
3184            sub = 0xe2; /* B?A:C */
3185        } else {
3186            tcg_out_mov(s, type, a0, a3);
3187            sub = 0xb8; /* B?C:A */
3188        }
3189        goto gen_simd_imm8;
3190
3191    gen_simd_imm8:
3192        tcg_debug_assert(insn != OPC_UD2);
3193        if (type == TCG_TYPE_V256) {
3194            insn |= P_VEXL;
3195        }
3196        tcg_out_vex_modrm(s, insn, a0, a1, a2);
3197        tcg_out8(s, sub);
3198        break;
3199
3200    case INDEX_op_x86_vpblendvb_vec:
3201        insn = OPC_VPBLENDVB;
3202        if (type == TCG_TYPE_V256) {
3203            insn |= P_VEXL;
3204        }
3205        tcg_out_vex_modrm(s, insn, a0, a1, a2);
3206        tcg_out8(s, args[3] << 4);
3207        break;
3208
3209    case INDEX_op_x86_psrldq_vec:
3210        tcg_out_vex_modrm(s, OPC_GRP14, 3, a0, a1);
3211        tcg_out8(s, a2);
3212        break;
3213
3214    case INDEX_op_mov_vec:  /* Always emitted via tcg_out_mov.  */
3215    case INDEX_op_dup_vec:  /* Always emitted via tcg_out_dup_vec.  */
3216    default:
3217        g_assert_not_reached();
3218    }
3219}
3220
3221static TCGConstraintSetIndex tcg_target_op_def(TCGOpcode op)
3222{
3223    switch (op) {
3224    case INDEX_op_goto_ptr:
3225        return C_O0_I1(r);
3226
3227    case INDEX_op_ld8u_i32:
3228    case INDEX_op_ld8u_i64:
3229    case INDEX_op_ld8s_i32:
3230    case INDEX_op_ld8s_i64:
3231    case INDEX_op_ld16u_i32:
3232    case INDEX_op_ld16u_i64:
3233    case INDEX_op_ld16s_i32:
3234    case INDEX_op_ld16s_i64:
3235    case INDEX_op_ld_i32:
3236    case INDEX_op_ld32u_i64:
3237    case INDEX_op_ld32s_i64:
3238    case INDEX_op_ld_i64:
3239        return C_O1_I1(r, r);
3240
3241    case INDEX_op_st8_i32:
3242    case INDEX_op_st8_i64:
3243        return C_O0_I2(qi, r);
3244
3245    case INDEX_op_st16_i32:
3246    case INDEX_op_st16_i64:
3247    case INDEX_op_st_i32:
3248    case INDEX_op_st32_i64:
3249        return C_O0_I2(ri, r);
3250
3251    case INDEX_op_st_i64:
3252        return C_O0_I2(re, r);
3253
3254    case INDEX_op_add_i32:
3255    case INDEX_op_add_i64:
3256        return C_O1_I2(r, r, re);
3257
3258    case INDEX_op_sub_i32:
3259    case INDEX_op_sub_i64:
3260    case INDEX_op_mul_i32:
3261    case INDEX_op_mul_i64:
3262    case INDEX_op_or_i32:
3263    case INDEX_op_or_i64:
3264    case INDEX_op_xor_i32:
3265    case INDEX_op_xor_i64:
3266        return C_O1_I2(r, 0, re);
3267
3268    case INDEX_op_and_i32:
3269    case INDEX_op_and_i64:
3270        return C_O1_I2(r, 0, reZ);
3271
3272    case INDEX_op_andc_i32:
3273    case INDEX_op_andc_i64:
3274        return C_O1_I2(r, r, rI);
3275
3276    case INDEX_op_shl_i32:
3277    case INDEX_op_shl_i64:
3278    case INDEX_op_shr_i32:
3279    case INDEX_op_shr_i64:
3280    case INDEX_op_sar_i32:
3281    case INDEX_op_sar_i64:
3282        return have_bmi2 ? C_O1_I2(r, r, ri) : C_O1_I2(r, 0, ci);
3283
3284    case INDEX_op_rotl_i32:
3285    case INDEX_op_rotl_i64:
3286    case INDEX_op_rotr_i32:
3287    case INDEX_op_rotr_i64:
3288        return C_O1_I2(r, 0, ci);
3289
3290    case INDEX_op_brcond_i32:
3291    case INDEX_op_brcond_i64:
3292        return C_O0_I2(r, re);
3293
3294    case INDEX_op_bswap16_i32:
3295    case INDEX_op_bswap16_i64:
3296    case INDEX_op_bswap32_i32:
3297    case INDEX_op_bswap32_i64:
3298    case INDEX_op_bswap64_i64:
3299    case INDEX_op_neg_i32:
3300    case INDEX_op_neg_i64:
3301    case INDEX_op_not_i32:
3302    case INDEX_op_not_i64:
3303    case INDEX_op_extrh_i64_i32:
3304        return C_O1_I1(r, 0);
3305
3306    case INDEX_op_ext8s_i32:
3307    case INDEX_op_ext8s_i64:
3308    case INDEX_op_ext8u_i32:
3309    case INDEX_op_ext8u_i64:
3310        return C_O1_I1(r, q);
3311
3312    case INDEX_op_ext16s_i32:
3313    case INDEX_op_ext16s_i64:
3314    case INDEX_op_ext16u_i32:
3315    case INDEX_op_ext16u_i64:
3316    case INDEX_op_ext32s_i64:
3317    case INDEX_op_ext32u_i64:
3318    case INDEX_op_ext_i32_i64:
3319    case INDEX_op_extu_i32_i64:
3320    case INDEX_op_extrl_i64_i32:
3321    case INDEX_op_extract_i32:
3322    case INDEX_op_extract_i64:
3323    case INDEX_op_sextract_i32:
3324    case INDEX_op_ctpop_i32:
3325    case INDEX_op_ctpop_i64:
3326        return C_O1_I1(r, r);
3327
3328    case INDEX_op_extract2_i32:
3329    case INDEX_op_extract2_i64:
3330        return C_O1_I2(r, 0, r);
3331
3332    case INDEX_op_deposit_i32:
3333    case INDEX_op_deposit_i64:
3334        return C_O1_I2(Q, 0, Q);
3335
3336    case INDEX_op_setcond_i32:
3337    case INDEX_op_setcond_i64:
3338        return C_O1_I2(q, r, re);
3339
3340    case INDEX_op_movcond_i32:
3341    case INDEX_op_movcond_i64:
3342        return C_O1_I4(r, r, re, r, 0);
3343
3344    case INDEX_op_div2_i32:
3345    case INDEX_op_div2_i64:
3346    case INDEX_op_divu2_i32:
3347    case INDEX_op_divu2_i64:
3348        return C_O2_I3(a, d, 0, 1, r);
3349
3350    case INDEX_op_mulu2_i32:
3351    case INDEX_op_mulu2_i64:
3352    case INDEX_op_muls2_i32:
3353    case INDEX_op_muls2_i64:
3354        return C_O2_I2(a, d, a, r);
3355
3356    case INDEX_op_add2_i32:
3357    case INDEX_op_add2_i64:
3358    case INDEX_op_sub2_i32:
3359    case INDEX_op_sub2_i64:
3360        return C_O2_I4(r, r, 0, 1, re, re);
3361
3362    case INDEX_op_ctz_i32:
3363    case INDEX_op_ctz_i64:
3364        return have_bmi1 ? C_N1_I2(r, r, rW) : C_N1_I2(r, r, r);
3365
3366    case INDEX_op_clz_i32:
3367    case INDEX_op_clz_i64:
3368        return have_lzcnt ? C_N1_I2(r, r, rW) : C_N1_I2(r, r, r);
3369
3370    case INDEX_op_qemu_ld_i32:
3371        return (TARGET_LONG_BITS <= TCG_TARGET_REG_BITS
3372                ? C_O1_I1(r, L) : C_O1_I2(r, L, L));
3373
3374    case INDEX_op_qemu_st_i32:
3375        return (TARGET_LONG_BITS <= TCG_TARGET_REG_BITS
3376                ? C_O0_I2(L, L) : C_O0_I3(L, L, L));
3377    case INDEX_op_qemu_st8_i32:
3378        return (TARGET_LONG_BITS <= TCG_TARGET_REG_BITS
3379                ? C_O0_I2(s, L) : C_O0_I3(s, L, L));
3380
3381    case INDEX_op_qemu_ld_i64:
3382        return (TCG_TARGET_REG_BITS == 64 ? C_O1_I1(r, L)
3383                : TARGET_LONG_BITS <= TCG_TARGET_REG_BITS ? C_O2_I1(r, r, L)
3384                : C_O2_I2(r, r, L, L));
3385
3386    case INDEX_op_qemu_st_i64:
3387        return (TCG_TARGET_REG_BITS == 64 ? C_O0_I2(L, L)
3388                : TARGET_LONG_BITS <= TCG_TARGET_REG_BITS ? C_O0_I3(L, L, L)
3389                : C_O0_I4(L, L, L, L));
3390
3391    case INDEX_op_brcond2_i32:
3392        return C_O0_I4(r, r, ri, ri);
3393
3394    case INDEX_op_setcond2_i32:
3395        return C_O1_I4(r, r, r, ri, ri);
3396
3397    case INDEX_op_ld_vec:
3398    case INDEX_op_dupm_vec:
3399        return C_O1_I1(x, r);
3400
3401    case INDEX_op_st_vec:
3402        return C_O0_I2(x, r);
3403
3404    case INDEX_op_add_vec:
3405    case INDEX_op_sub_vec:
3406    case INDEX_op_mul_vec:
3407    case INDEX_op_and_vec:
3408    case INDEX_op_or_vec:
3409    case INDEX_op_xor_vec:
3410    case INDEX_op_andc_vec:
3411    case INDEX_op_orc_vec:
3412    case INDEX_op_nand_vec:
3413    case INDEX_op_nor_vec:
3414    case INDEX_op_eqv_vec:
3415    case INDEX_op_ssadd_vec:
3416    case INDEX_op_usadd_vec:
3417    case INDEX_op_sssub_vec:
3418    case INDEX_op_ussub_vec:
3419    case INDEX_op_smin_vec:
3420    case INDEX_op_umin_vec:
3421    case INDEX_op_smax_vec:
3422    case INDEX_op_umax_vec:
3423    case INDEX_op_shlv_vec:
3424    case INDEX_op_shrv_vec:
3425    case INDEX_op_sarv_vec:
3426    case INDEX_op_rotlv_vec:
3427    case INDEX_op_rotrv_vec:
3428    case INDEX_op_shls_vec:
3429    case INDEX_op_shrs_vec:
3430    case INDEX_op_sars_vec:
3431    case INDEX_op_cmp_vec:
3432    case INDEX_op_x86_shufps_vec:
3433    case INDEX_op_x86_blend_vec:
3434    case INDEX_op_x86_packss_vec:
3435    case INDEX_op_x86_packus_vec:
3436    case INDEX_op_x86_vperm2i128_vec:
3437    case INDEX_op_x86_punpckl_vec:
3438    case INDEX_op_x86_punpckh_vec:
3439    case INDEX_op_x86_vpshldi_vec:
3440#if TCG_TARGET_REG_BITS == 32
3441    case INDEX_op_dup2_vec:
3442#endif
3443        return C_O1_I2(x, x, x);
3444
3445    case INDEX_op_abs_vec:
3446    case INDEX_op_dup_vec:
3447    case INDEX_op_not_vec:
3448    case INDEX_op_shli_vec:
3449    case INDEX_op_shri_vec:
3450    case INDEX_op_sari_vec:
3451    case INDEX_op_rotli_vec:
3452    case INDEX_op_x86_psrldq_vec:
3453        return C_O1_I1(x, x);
3454
3455    case INDEX_op_x86_vpshldv_vec:
3456    case INDEX_op_x86_vpshrdv_vec:
3457        return C_O1_I3(x, 0, x, x);
3458
3459    case INDEX_op_bitsel_vec:
3460    case INDEX_op_x86_vpblendvb_vec:
3461        return C_O1_I3(x, x, x, x);
3462
3463    default:
3464        g_assert_not_reached();
3465    }
3466}
3467
3468int tcg_can_emit_vec_op(TCGOpcode opc, TCGType type, unsigned vece)
3469{
3470    switch (opc) {
3471    case INDEX_op_add_vec:
3472    case INDEX_op_sub_vec:
3473    case INDEX_op_and_vec:
3474    case INDEX_op_or_vec:
3475    case INDEX_op_xor_vec:
3476    case INDEX_op_andc_vec:
3477    case INDEX_op_orc_vec:
3478    case INDEX_op_nand_vec:
3479    case INDEX_op_nor_vec:
3480    case INDEX_op_eqv_vec:
3481    case INDEX_op_not_vec:
3482    case INDEX_op_bitsel_vec:
3483        return 1;
3484    case INDEX_op_cmp_vec:
3485    case INDEX_op_cmpsel_vec:
3486        return -1;
3487
3488    case INDEX_op_rotli_vec:
3489        return have_avx512vl && vece >= MO_32 ? 1 : -1;
3490
3491    case INDEX_op_shli_vec:
3492    case INDEX_op_shri_vec:
3493        /* We must expand the operation for MO_8.  */
3494        return vece == MO_8 ? -1 : 1;
3495
3496    case INDEX_op_sari_vec:
3497        switch (vece) {
3498        case MO_8:
3499            return -1;
3500        case MO_16:
3501        case MO_32:
3502            return 1;
3503        case MO_64:
3504            if (have_avx512vl) {
3505                return 1;
3506            }
3507            /*
3508             * We can emulate this for MO_64, but it does not pay off
3509             * unless we're producing at least 4 values.
3510             */
3511            return type >= TCG_TYPE_V256 ? -1 : 0;
3512        }
3513        return 0;
3514
3515    case INDEX_op_shls_vec:
3516    case INDEX_op_shrs_vec:
3517        return vece >= MO_16;
3518    case INDEX_op_sars_vec:
3519        switch (vece) {
3520        case MO_16:
3521        case MO_32:
3522            return 1;
3523        case MO_64:
3524            return have_avx512vl;
3525        }
3526        return 0;
3527    case INDEX_op_rotls_vec:
3528        return vece >= MO_16 ? -1 : 0;
3529
3530    case INDEX_op_shlv_vec:
3531    case INDEX_op_shrv_vec:
3532        switch (vece) {
3533        case MO_16:
3534            return have_avx512bw;
3535        case MO_32:
3536        case MO_64:
3537            return have_avx2;
3538        }
3539        return 0;
3540    case INDEX_op_sarv_vec:
3541        switch (vece) {
3542        case MO_16:
3543            return have_avx512bw;
3544        case MO_32:
3545            return have_avx2;
3546        case MO_64:
3547            return have_avx512vl;
3548        }
3549        return 0;
3550    case INDEX_op_rotlv_vec:
3551    case INDEX_op_rotrv_vec:
3552        switch (vece) {
3553        case MO_16:
3554            return have_avx512vbmi2 ? -1 : 0;
3555        case MO_32:
3556        case MO_64:
3557            return have_avx512vl ? 1 : have_avx2 ? -1 : 0;
3558        }
3559        return 0;
3560
3561    case INDEX_op_mul_vec:
3562        switch (vece) {
3563        case MO_8:
3564            return -1;
3565        case MO_64:
3566            return have_avx512dq;
3567        }
3568        return 1;
3569
3570    case INDEX_op_ssadd_vec:
3571    case INDEX_op_usadd_vec:
3572    case INDEX_op_sssub_vec:
3573    case INDEX_op_ussub_vec:
3574        return vece <= MO_16;
3575    case INDEX_op_smin_vec:
3576    case INDEX_op_smax_vec:
3577    case INDEX_op_umin_vec:
3578    case INDEX_op_umax_vec:
3579    case INDEX_op_abs_vec:
3580        return vece <= MO_32 || have_avx512vl;
3581
3582    default:
3583        return 0;
3584    }
3585}
3586
3587static void expand_vec_shi(TCGType type, unsigned vece, TCGOpcode opc,
3588                           TCGv_vec v0, TCGv_vec v1, TCGArg imm)
3589{
3590    TCGv_vec t1, t2;
3591
3592    tcg_debug_assert(vece == MO_8);
3593
3594    t1 = tcg_temp_new_vec(type);
3595    t2 = tcg_temp_new_vec(type);
3596
3597    /*
3598     * Unpack to W, shift, and repack.  Tricky bits:
3599     * (1) Use punpck*bw x,x to produce DDCCBBAA,
3600     *     i.e. duplicate in other half of the 16-bit lane.
3601     * (2) For right-shift, add 8 so that the high half of the lane
3602     *     becomes zero.  For left-shift, and left-rotate, we must
3603     *     shift up and down again.
3604     * (3) Step 2 leaves high half zero such that PACKUSWB
3605     *     (pack with unsigned saturation) does not modify
3606     *     the quantity.
3607     */
3608    vec_gen_3(INDEX_op_x86_punpckl_vec, type, MO_8,
3609              tcgv_vec_arg(t1), tcgv_vec_arg(v1), tcgv_vec_arg(v1));
3610    vec_gen_3(INDEX_op_x86_punpckh_vec, type, MO_8,
3611              tcgv_vec_arg(t2), tcgv_vec_arg(v1), tcgv_vec_arg(v1));
3612
3613    if (opc != INDEX_op_rotli_vec) {
3614        imm += 8;
3615    }
3616    if (opc == INDEX_op_shri_vec) {
3617        tcg_gen_shri_vec(MO_16, t1, t1, imm);
3618        tcg_gen_shri_vec(MO_16, t2, t2, imm);
3619    } else {
3620        tcg_gen_shli_vec(MO_16, t1, t1, imm);
3621        tcg_gen_shli_vec(MO_16, t2, t2, imm);
3622        tcg_gen_shri_vec(MO_16, t1, t1, 8);
3623        tcg_gen_shri_vec(MO_16, t2, t2, 8);
3624    }
3625
3626    vec_gen_3(INDEX_op_x86_packus_vec, type, MO_8,
3627              tcgv_vec_arg(v0), tcgv_vec_arg(t1), tcgv_vec_arg(t2));
3628    tcg_temp_free_vec(t1);
3629    tcg_temp_free_vec(t2);
3630}
3631
3632static void expand_vec_sari(TCGType type, unsigned vece,
3633                            TCGv_vec v0, TCGv_vec v1, TCGArg imm)
3634{
3635    TCGv_vec t1, t2;
3636
3637    switch (vece) {
3638    case MO_8:
3639        /* Unpack to W, shift, and repack, as in expand_vec_shi.  */
3640        t1 = tcg_temp_new_vec(type);
3641        t2 = tcg_temp_new_vec(type);
3642        vec_gen_3(INDEX_op_x86_punpckl_vec, type, MO_8,
3643                  tcgv_vec_arg(t1), tcgv_vec_arg(v1), tcgv_vec_arg(v1));
3644        vec_gen_3(INDEX_op_x86_punpckh_vec, type, MO_8,
3645                  tcgv_vec_arg(t2), tcgv_vec_arg(v1), tcgv_vec_arg(v1));
3646        tcg_gen_sari_vec(MO_16, t1, t1, imm + 8);
3647        tcg_gen_sari_vec(MO_16, t2, t2, imm + 8);
3648        vec_gen_3(INDEX_op_x86_packss_vec, type, MO_8,
3649                  tcgv_vec_arg(v0), tcgv_vec_arg(t1), tcgv_vec_arg(t2));
3650        tcg_temp_free_vec(t1);
3651        tcg_temp_free_vec(t2);
3652        break;
3653
3654    case MO_64:
3655        t1 = tcg_temp_new_vec(type);
3656        if (imm <= 32) {
3657            /*
3658             * We can emulate a small sign extend by performing an arithmetic
3659             * 32-bit shift and overwriting the high half of a 64-bit logical
3660             * shift.  Note that the ISA says shift of 32 is valid, but TCG
3661             * does not, so we have to bound the smaller shift -- we get the
3662             * same result in the high half either way.
3663             */
3664            tcg_gen_sari_vec(MO_32, t1, v1, MIN(imm, 31));
3665            tcg_gen_shri_vec(MO_64, v0, v1, imm);
3666            vec_gen_4(INDEX_op_x86_blend_vec, type, MO_32,
3667                      tcgv_vec_arg(v0), tcgv_vec_arg(v0),
3668                      tcgv_vec_arg(t1), 0xaa);
3669        } else {
3670            /* Otherwise we will need to use a compare vs 0 to produce
3671             * the sign-extend, shift and merge.
3672             */
3673            tcg_gen_cmp_vec(TCG_COND_GT, MO_64, t1,
3674                            tcg_constant_vec(type, MO_64, 0), v1);
3675            tcg_gen_shri_vec(MO_64, v0, v1, imm);
3676            tcg_gen_shli_vec(MO_64, t1, t1, 64 - imm);
3677            tcg_gen_or_vec(MO_64, v0, v0, t1);
3678        }
3679        tcg_temp_free_vec(t1);
3680        break;
3681
3682    default:
3683        g_assert_not_reached();
3684    }
3685}
3686
3687static void expand_vec_rotli(TCGType type, unsigned vece,
3688                             TCGv_vec v0, TCGv_vec v1, TCGArg imm)
3689{
3690    TCGv_vec t;
3691
3692    if (vece == MO_8) {
3693        expand_vec_shi(type, vece, INDEX_op_rotli_vec, v0, v1, imm);
3694        return;
3695    }
3696
3697    if (have_avx512vbmi2) {
3698        vec_gen_4(INDEX_op_x86_vpshldi_vec, type, vece,
3699                  tcgv_vec_arg(v0), tcgv_vec_arg(v1), tcgv_vec_arg(v1), imm);
3700        return;
3701    }
3702
3703    t = tcg_temp_new_vec(type);
3704    tcg_gen_shli_vec(vece, t, v1, imm);
3705    tcg_gen_shri_vec(vece, v0, v1, (8 << vece) - imm);
3706    tcg_gen_or_vec(vece, v0, v0, t);
3707    tcg_temp_free_vec(t);
3708}
3709
3710static void expand_vec_rotv(TCGType type, unsigned vece, TCGv_vec v0,
3711                            TCGv_vec v1, TCGv_vec sh, bool right)
3712{
3713    TCGv_vec t;
3714
3715    if (have_avx512vbmi2) {
3716        vec_gen_4(right ? INDEX_op_x86_vpshrdv_vec : INDEX_op_x86_vpshldv_vec,
3717                  type, vece, tcgv_vec_arg(v0), tcgv_vec_arg(v1),
3718                  tcgv_vec_arg(v1), tcgv_vec_arg(sh));
3719        return;
3720    }
3721
3722    t = tcg_temp_new_vec(type);
3723    tcg_gen_dupi_vec(vece, t, 8 << vece);
3724    tcg_gen_sub_vec(vece, t, t, sh);
3725    if (right) {
3726        tcg_gen_shlv_vec(vece, t, v1, t);
3727        tcg_gen_shrv_vec(vece, v0, v1, sh);
3728    } else {
3729        tcg_gen_shrv_vec(vece, t, v1, t);
3730        tcg_gen_shlv_vec(vece, v0, v1, sh);
3731    }
3732    tcg_gen_or_vec(vece, v0, v0, t);
3733    tcg_temp_free_vec(t);
3734}
3735
3736static void expand_vec_rotls(TCGType type, unsigned vece,
3737                             TCGv_vec v0, TCGv_vec v1, TCGv_i32 lsh)
3738{
3739    TCGv_vec t = tcg_temp_new_vec(type);
3740
3741    tcg_debug_assert(vece != MO_8);
3742
3743    if (vece >= MO_32 ? have_avx512vl : have_avx512vbmi2) {
3744        tcg_gen_dup_i32_vec(vece, t, lsh);
3745        if (vece >= MO_32) {
3746            tcg_gen_rotlv_vec(vece, v0, v1, t);
3747        } else {
3748            expand_vec_rotv(type, vece, v0, v1, t, false);
3749        }
3750    } else {
3751        TCGv_i32 rsh = tcg_temp_new_i32();
3752
3753        tcg_gen_neg_i32(rsh, lsh);
3754        tcg_gen_andi_i32(rsh, rsh, (8 << vece) - 1);
3755        tcg_gen_shls_vec(vece, t, v1, lsh);
3756        tcg_gen_shrs_vec(vece, v0, v1, rsh);
3757        tcg_gen_or_vec(vece, v0, v0, t);
3758
3759        tcg_temp_free_i32(rsh);
3760    }
3761
3762    tcg_temp_free_vec(t);
3763}
3764
3765static void expand_vec_mul(TCGType type, unsigned vece,
3766                           TCGv_vec v0, TCGv_vec v1, TCGv_vec v2)
3767{
3768    TCGv_vec t1, t2, t3, t4, zero;
3769
3770    tcg_debug_assert(vece == MO_8);
3771
3772    /*
3773     * Unpack v1 bytes to words, 0 | x.
3774     * Unpack v2 bytes to words, y | 0.
3775     * This leaves the 8-bit result, x * y, with 8 bits of right padding.
3776     * Shift logical right by 8 bits to clear the high 8 bytes before
3777     * using an unsigned saturated pack.
3778     *
3779     * The difference between the V64, V128 and V256 cases is merely how
3780     * we distribute the expansion between temporaries.
3781     */
3782    switch (type) {
3783    case TCG_TYPE_V64:
3784        t1 = tcg_temp_new_vec(TCG_TYPE_V128);
3785        t2 = tcg_temp_new_vec(TCG_TYPE_V128);
3786        zero = tcg_constant_vec(TCG_TYPE_V128, MO_8, 0);
3787        vec_gen_3(INDEX_op_x86_punpckl_vec, TCG_TYPE_V128, MO_8,
3788                  tcgv_vec_arg(t1), tcgv_vec_arg(v1), tcgv_vec_arg(zero));
3789        vec_gen_3(INDEX_op_x86_punpckl_vec, TCG_TYPE_V128, MO_8,
3790                  tcgv_vec_arg(t2), tcgv_vec_arg(zero), tcgv_vec_arg(v2));
3791        tcg_gen_mul_vec(MO_16, t1, t1, t2);
3792        tcg_gen_shri_vec(MO_16, t1, t1, 8);
3793        vec_gen_3(INDEX_op_x86_packus_vec, TCG_TYPE_V128, MO_8,
3794                  tcgv_vec_arg(v0), tcgv_vec_arg(t1), tcgv_vec_arg(t1));
3795        tcg_temp_free_vec(t1);
3796        tcg_temp_free_vec(t2);
3797        break;
3798
3799    case TCG_TYPE_V128:
3800    case TCG_TYPE_V256:
3801        t1 = tcg_temp_new_vec(type);
3802        t2 = tcg_temp_new_vec(type);
3803        t3 = tcg_temp_new_vec(type);
3804        t4 = tcg_temp_new_vec(type);
3805        zero = tcg_constant_vec(TCG_TYPE_V128, MO_8, 0);
3806        vec_gen_3(INDEX_op_x86_punpckl_vec, type, MO_8,
3807                  tcgv_vec_arg(t1), tcgv_vec_arg(v1), tcgv_vec_arg(zero));
3808        vec_gen_3(INDEX_op_x86_punpckl_vec, type, MO_8,
3809                  tcgv_vec_arg(t2), tcgv_vec_arg(zero), tcgv_vec_arg(v2));
3810        vec_gen_3(INDEX_op_x86_punpckh_vec, type, MO_8,
3811                  tcgv_vec_arg(t3), tcgv_vec_arg(v1), tcgv_vec_arg(zero));
3812        vec_gen_3(INDEX_op_x86_punpckh_vec, type, MO_8,
3813                  tcgv_vec_arg(t4), tcgv_vec_arg(zero), tcgv_vec_arg(v2));
3814        tcg_gen_mul_vec(MO_16, t1, t1, t2);
3815        tcg_gen_mul_vec(MO_16, t3, t3, t4);
3816        tcg_gen_shri_vec(MO_16, t1, t1, 8);
3817        tcg_gen_shri_vec(MO_16, t3, t3, 8);
3818        vec_gen_3(INDEX_op_x86_packus_vec, type, MO_8,
3819                  tcgv_vec_arg(v0), tcgv_vec_arg(t1), tcgv_vec_arg(t3));
3820        tcg_temp_free_vec(t1);
3821        tcg_temp_free_vec(t2);
3822        tcg_temp_free_vec(t3);
3823        tcg_temp_free_vec(t4);
3824        break;
3825
3826    default:
3827        g_assert_not_reached();
3828    }
3829}
3830
3831static bool expand_vec_cmp_noinv(TCGType type, unsigned vece, TCGv_vec v0,
3832                                 TCGv_vec v1, TCGv_vec v2, TCGCond cond)
3833{
3834    enum {
3835        NEED_INV  = 1,
3836        NEED_SWAP = 2,
3837        NEED_BIAS = 4,
3838        NEED_UMIN = 8,
3839        NEED_UMAX = 16,
3840    };
3841    TCGv_vec t1, t2, t3;
3842    uint8_t fixup;
3843
3844    switch (cond) {
3845    case TCG_COND_EQ:
3846    case TCG_COND_GT:
3847        fixup = 0;
3848        break;
3849    case TCG_COND_NE:
3850    case TCG_COND_LE:
3851        fixup = NEED_INV;
3852        break;
3853    case TCG_COND_LT:
3854        fixup = NEED_SWAP;
3855        break;
3856    case TCG_COND_GE:
3857        fixup = NEED_SWAP | NEED_INV;
3858        break;
3859    case TCG_COND_LEU:
3860        if (tcg_can_emit_vec_op(INDEX_op_umin_vec, type, vece)) {
3861            fixup = NEED_UMIN;
3862        } else {
3863            fixup = NEED_BIAS | NEED_INV;
3864        }
3865        break;
3866    case TCG_COND_GTU:
3867        if (tcg_can_emit_vec_op(INDEX_op_umin_vec, type, vece)) {
3868            fixup = NEED_UMIN | NEED_INV;
3869        } else {
3870            fixup = NEED_BIAS;
3871        }
3872        break;
3873    case TCG_COND_GEU:
3874        if (tcg_can_emit_vec_op(INDEX_op_umax_vec, type, vece)) {
3875            fixup = NEED_UMAX;
3876        } else {
3877            fixup = NEED_BIAS | NEED_SWAP | NEED_INV;
3878        }
3879        break;
3880    case TCG_COND_LTU:
3881        if (tcg_can_emit_vec_op(INDEX_op_umax_vec, type, vece)) {
3882            fixup = NEED_UMAX | NEED_INV;
3883        } else {
3884            fixup = NEED_BIAS | NEED_SWAP;
3885        }
3886        break;
3887    default:
3888        g_assert_not_reached();
3889    }
3890
3891    if (fixup & NEED_INV) {
3892        cond = tcg_invert_cond(cond);
3893    }
3894    if (fixup & NEED_SWAP) {
3895        t1 = v1, v1 = v2, v2 = t1;
3896        cond = tcg_swap_cond(cond);
3897    }
3898
3899    t1 = t2 = NULL;
3900    if (fixup & (NEED_UMIN | NEED_UMAX)) {
3901        t1 = tcg_temp_new_vec(type);
3902        if (fixup & NEED_UMIN) {
3903            tcg_gen_umin_vec(vece, t1, v1, v2);
3904        } else {
3905            tcg_gen_umax_vec(vece, t1, v1, v2);
3906        }
3907        v2 = t1;
3908        cond = TCG_COND_EQ;
3909    } else if (fixup & NEED_BIAS) {
3910        t1 = tcg_temp_new_vec(type);
3911        t2 = tcg_temp_new_vec(type);
3912        t3 = tcg_constant_vec(type, vece, 1ull << ((8 << vece) - 1));
3913        tcg_gen_sub_vec(vece, t1, v1, t3);
3914        tcg_gen_sub_vec(vece, t2, v2, t3);
3915        v1 = t1;
3916        v2 = t2;
3917        cond = tcg_signed_cond(cond);
3918    }
3919
3920    tcg_debug_assert(cond == TCG_COND_EQ || cond == TCG_COND_GT);
3921    /* Expand directly; do not recurse.  */
3922    vec_gen_4(INDEX_op_cmp_vec, type, vece,
3923              tcgv_vec_arg(v0), tcgv_vec_arg(v1), tcgv_vec_arg(v2), cond);
3924
3925    if (t1) {
3926        tcg_temp_free_vec(t1);
3927        if (t2) {
3928            tcg_temp_free_vec(t2);
3929        }
3930    }
3931    return fixup & NEED_INV;
3932}
3933
3934static void expand_vec_cmp(TCGType type, unsigned vece, TCGv_vec v0,
3935                           TCGv_vec v1, TCGv_vec v2, TCGCond cond)
3936{
3937    if (expand_vec_cmp_noinv(type, vece, v0, v1, v2, cond)) {
3938        tcg_gen_not_vec(vece, v0, v0);
3939    }
3940}
3941
3942static void expand_vec_cmpsel(TCGType type, unsigned vece, TCGv_vec v0,
3943                              TCGv_vec c1, TCGv_vec c2,
3944                              TCGv_vec v3, TCGv_vec v4, TCGCond cond)
3945{
3946    TCGv_vec t = tcg_temp_new_vec(type);
3947
3948    if (expand_vec_cmp_noinv(type, vece, t, c1, c2, cond)) {
3949        /* Invert the sense of the compare by swapping arguments.  */
3950        TCGv_vec x;
3951        x = v3, v3 = v4, v4 = x;
3952    }
3953    vec_gen_4(INDEX_op_x86_vpblendvb_vec, type, vece,
3954              tcgv_vec_arg(v0), tcgv_vec_arg(v4),
3955              tcgv_vec_arg(v3), tcgv_vec_arg(t));
3956    tcg_temp_free_vec(t);
3957}
3958
3959void tcg_expand_vec_op(TCGOpcode opc, TCGType type, unsigned vece,
3960                       TCGArg a0, ...)
3961{
3962    va_list va;
3963    TCGArg a2;
3964    TCGv_vec v0, v1, v2, v3, v4;
3965
3966    va_start(va, a0);
3967    v0 = temp_tcgv_vec(arg_temp(a0));
3968    v1 = temp_tcgv_vec(arg_temp(va_arg(va, TCGArg)));
3969    a2 = va_arg(va, TCGArg);
3970
3971    switch (opc) {
3972    case INDEX_op_shli_vec:
3973    case INDEX_op_shri_vec:
3974        expand_vec_shi(type, vece, opc, v0, v1, a2);
3975        break;
3976
3977    case INDEX_op_sari_vec:
3978        expand_vec_sari(type, vece, v0, v1, a2);
3979        break;
3980
3981    case INDEX_op_rotli_vec:
3982        expand_vec_rotli(type, vece, v0, v1, a2);
3983        break;
3984
3985    case INDEX_op_rotls_vec:
3986        expand_vec_rotls(type, vece, v0, v1, temp_tcgv_i32(arg_temp(a2)));
3987        break;
3988
3989    case INDEX_op_rotlv_vec:
3990        v2 = temp_tcgv_vec(arg_temp(a2));
3991        expand_vec_rotv(type, vece, v0, v1, v2, false);
3992        break;
3993    case INDEX_op_rotrv_vec:
3994        v2 = temp_tcgv_vec(arg_temp(a2));
3995        expand_vec_rotv(type, vece, v0, v1, v2, true);
3996        break;
3997
3998    case INDEX_op_mul_vec:
3999        v2 = temp_tcgv_vec(arg_temp(a2));
4000        expand_vec_mul(type, vece, v0, v1, v2);
4001        break;
4002
4003    case INDEX_op_cmp_vec:
4004        v2 = temp_tcgv_vec(arg_temp(a2));
4005        expand_vec_cmp(type, vece, v0, v1, v2, va_arg(va, TCGArg));
4006        break;
4007
4008    case INDEX_op_cmpsel_vec:
4009        v2 = temp_tcgv_vec(arg_temp(a2));
4010        v3 = temp_tcgv_vec(arg_temp(va_arg(va, TCGArg)));
4011        v4 = temp_tcgv_vec(arg_temp(va_arg(va, TCGArg)));
4012        expand_vec_cmpsel(type, vece, v0, v1, v2, v3, v4, va_arg(va, TCGArg));
4013        break;
4014
4015    default:
4016        break;
4017    }
4018
4019    va_end(va);
4020}
4021
4022static const int tcg_target_callee_save_regs[] = {
4023#if TCG_TARGET_REG_BITS == 64
4024    TCG_REG_RBP,
4025    TCG_REG_RBX,
4026#if defined(_WIN64)
4027    TCG_REG_RDI,
4028    TCG_REG_RSI,
4029#endif
4030    TCG_REG_R12,
4031    TCG_REG_R13,
4032    TCG_REG_R14, /* Currently used for the global env. */
4033    TCG_REG_R15,
4034#else
4035    TCG_REG_EBP, /* Currently used for the global env. */
4036    TCG_REG_EBX,
4037    TCG_REG_ESI,
4038    TCG_REG_EDI,
4039#endif
4040};
4041
/*
 * Frame sizes are computed via macros so that tcg_target_qemu_prologue
 * and the unwind data used by tcg_register_jit share the same values.
 *
 * PUSH_SIZE:  bytes taken by one register-sized slot plus one slot per
 *             entry of tcg_target_callee_save_regs.
 * FRAME_SIZE: PUSH_SIZE plus the static call argument area and the TCG
 *             temp buffer, rounded up to TCG_TARGET_STACK_ALIGN.
 */

#define PUSH_SIZE \
    ((1 + ARRAY_SIZE(tcg_target_callee_save_regs)) \
     * (TCG_TARGET_REG_BITS / 8))

#define FRAME_SIZE \
    ((PUSH_SIZE + TCG_STATIC_CALL_ARGS_SIZE \
      + CPU_TEMP_BUF_NLONGS * sizeof(long) \
      + TCG_TARGET_STACK_ALIGN - 1) \
     & ~(TCG_TARGET_STACK_ALIGN - 1))
4055
4056/* Generate global QEMU prologue and epilogue code */
4057static void tcg_target_qemu_prologue(TCGContext *s)
4058{
4059    int i, stack_addend;
4060
4061    /* TB prologue */
4062
4063    /* Reserve some stack space, also for TCG temps.  */
4064    stack_addend = FRAME_SIZE - PUSH_SIZE;
4065    tcg_set_frame(s, TCG_REG_CALL_STACK, TCG_STATIC_CALL_ARGS_SIZE,
4066                  CPU_TEMP_BUF_NLONGS * sizeof(long));
4067
4068    /* Save all callee saved registers.  */
4069    for (i = 0; i < ARRAY_SIZE(tcg_target_callee_save_regs); i++) {
4070        tcg_out_push(s, tcg_target_callee_save_regs[i]);
4071    }
4072
4073#if TCG_TARGET_REG_BITS == 32
4074    tcg_out_ld(s, TCG_TYPE_PTR, TCG_AREG0, TCG_REG_ESP,
4075               (ARRAY_SIZE(tcg_target_callee_save_regs) + 1) * 4);
4076    tcg_out_addi(s, TCG_REG_ESP, -stack_addend);
4077    /* jmp *tb.  */
4078    tcg_out_modrm_offset(s, OPC_GRP5, EXT5_JMPN_Ev, TCG_REG_ESP,
4079                         (ARRAY_SIZE(tcg_target_callee_save_regs) + 2) * 4
4080                         + stack_addend);
4081#else
4082# if !defined(CONFIG_SOFTMMU)
4083    if (guest_base) {
4084        int seg = setup_guest_base_seg();
4085        if (seg != 0) {
4086            x86_guest_base.seg = seg;
4087        } else if (guest_base == (int32_t)guest_base) {
4088            x86_guest_base.ofs = guest_base;
4089        } else {
4090            /* Choose R12 because, as a base, it requires a SIB byte. */
4091            x86_guest_base.index = TCG_REG_R12;
4092            tcg_out_movi(s, TCG_TYPE_PTR, x86_guest_base.index, guest_base);
4093            tcg_regset_set_reg(s->reserved_regs, x86_guest_base.index);
4094        }
4095    }
4096# endif
4097    tcg_out_mov(s, TCG_TYPE_PTR, TCG_AREG0, tcg_target_call_iarg_regs[0]);
4098    tcg_out_addi(s, TCG_REG_ESP, -stack_addend);
4099    /* jmp *tb.  */
4100    tcg_out_modrm(s, OPC_GRP5, EXT5_JMPN_Ev, tcg_target_call_iarg_regs[1]);
4101#endif
4102
4103    /*
4104     * Return path for goto_ptr. Set return value to 0, a-la exit_tb,
4105     * and fall through to the rest of the epilogue.
4106     */
4107    tcg_code_gen_epilogue = tcg_splitwx_to_rx(s->code_ptr);
4108    tcg_out_movi(s, TCG_TYPE_REG, TCG_REG_EAX, 0);
4109
4110    /* TB epilogue */
4111    tb_ret_addr = tcg_splitwx_to_rx(s->code_ptr);
4112
4113    tcg_out_addi(s, TCG_REG_CALL_STACK, stack_addend);
4114
4115    if (have_avx2) {
4116        tcg_out_vex_opc(s, OPC_VZEROUPPER, 0, 0, 0, 0);
4117    }
4118    for (i = ARRAY_SIZE(tcg_target_callee_save_regs) - 1; i >= 0; i--) {
4119        tcg_out_pop(s, tcg_target_callee_save_regs[i]);
4120    }
4121    tcg_out_opc(s, OPC_RET, 0, 0, 0);
4122}
4123
4124static void tcg_out_nop_fill(tcg_insn_unit *p, int count)
4125{
4126    memset(p, 0x90, count);
4127}
4128
4129static void tcg_target_init(TCGContext *s)
4130{
4131#ifdef CONFIG_CPUID_H
4132    unsigned a, b, c, d, b7 = 0, c7 = 0;
4133    unsigned max = __get_cpuid_max(0, 0);
4134
4135    if (max >= 7) {
4136        /* BMI1 is available on AMD Piledriver and Intel Haswell CPUs.  */
4137        __cpuid_count(7, 0, a, b7, c7, d);
4138        have_bmi1 = (b7 & bit_BMI) != 0;
4139        have_bmi2 = (b7 & bit_BMI2) != 0;
4140    }
4141
4142    if (max >= 1) {
4143        __cpuid(1, a, b, c, d);
4144#ifndef have_cmov
4145        /* For 32-bit, 99% certainty that we're running on hardware that
4146           supports cmov, but we still need to check.  In case cmov is not
4147           available, we'll use a small forward branch.  */
4148        have_cmov = (d & bit_CMOV) != 0;
4149#endif
4150
4151        /* MOVBE is only available on Intel Atom and Haswell CPUs, so we
4152           need to probe for it.  */
4153        have_movbe = (c & bit_MOVBE) != 0;
4154        have_popcnt = (c & bit_POPCNT) != 0;
4155
4156        /* There are a number of things we must check before we can be
4157           sure of not hitting invalid opcode.  */
4158        if (c & bit_OSXSAVE) {
4159            unsigned bv = xgetbv_low(0);
4160
4161            if ((bv & 6) == 6) {
4162                have_avx1 = (c & bit_AVX) != 0;
4163                have_avx2 = (b7 & bit_AVX2) != 0;
4164
4165                /*
4166                 * There are interesting instructions in AVX512, so long
4167                 * as we have AVX512VL, which indicates support for EVEX
4168                 * on sizes smaller than 512 bits.  We are required to
4169                 * check that OPMASK and all extended ZMM state are enabled
4170                 * even if we're not using them -- the insns will fault.
4171                 */
4172                if ((bv & 0xe0) == 0xe0
4173                    && (b7 & bit_AVX512F)
4174                    && (b7 & bit_AVX512VL)) {
4175                    have_avx512vl = true;
4176                    have_avx512bw = (b7 & bit_AVX512BW) != 0;
4177                    have_avx512dq = (b7 & bit_AVX512DQ) != 0;
4178                    have_avx512vbmi2 = (c7 & bit_AVX512VBMI2) != 0;
4179                }
4180            }
4181        }
4182    }
4183
4184    max = __get_cpuid_max(0x8000000, 0);
4185    if (max >= 1) {
4186        __cpuid(0x80000001, a, b, c, d);
4187        /* LZCNT was introduced with AMD Barcelona and Intel Haswell CPUs.  */
4188        have_lzcnt = (c & bit_LZCNT) != 0;
4189    }
4190#endif /* CONFIG_CPUID_H */
4191
4192    tcg_target_available_regs[TCG_TYPE_I32] = ALL_GENERAL_REGS;
4193    if (TCG_TARGET_REG_BITS == 64) {
4194        tcg_target_available_regs[TCG_TYPE_I64] = ALL_GENERAL_REGS;
4195    }
4196    if (have_avx1) {
4197        tcg_target_available_regs[TCG_TYPE_V64] = ALL_VECTOR_REGS;
4198        tcg_target_available_regs[TCG_TYPE_V128] = ALL_VECTOR_REGS;
4199    }
4200    if (have_avx2) {
4201        tcg_target_available_regs[TCG_TYPE_V256] = ALL_VECTOR_REGS;
4202    }
4203
4204    tcg_target_call_clobber_regs = ALL_VECTOR_REGS;
4205    tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_EAX);
4206    tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_EDX);
4207    tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_ECX);
4208    if (TCG_TARGET_REG_BITS == 64) {
4209#if !defined(_WIN64)
4210        tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_RDI);
4211        tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_RSI);
4212#endif
4213        tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_R8);
4214        tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_R9);
4215        tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_R10);
4216        tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_R11);
4217    }
4218
4219    s->reserved_regs = 0;
4220    tcg_regset_set_reg(s->reserved_regs, TCG_REG_CALL_STACK);
4221#ifdef _WIN64
4222    /* These are call saved, and we don't save them, so don't use them. */
4223    tcg_regset_set_reg(s->reserved_regs, TCG_REG_XMM6);
4224    tcg_regset_set_reg(s->reserved_regs, TCG_REG_XMM7);
4225    tcg_regset_set_reg(s->reserved_regs, TCG_REG_XMM8);
4226    tcg_regset_set_reg(s->reserved_regs, TCG_REG_XMM9);
4227    tcg_regset_set_reg(s->reserved_regs, TCG_REG_XMM10);
4228    tcg_regset_set_reg(s->reserved_regs, TCG_REG_XMM11);
4229    tcg_regset_set_reg(s->reserved_regs, TCG_REG_XMM12);
4230    tcg_regset_set_reg(s->reserved_regs, TCG_REG_XMM13);
4231    tcg_regset_set_reg(s->reserved_regs, TCG_REG_XMM14);
4232    tcg_regset_set_reg(s->reserved_regs, TCG_REG_XMM15);
4233#endif
4234}
4235
4236typedef struct {
4237    DebugFrameHeader h;
4238    uint8_t fde_def_cfa[4];
4239    uint8_t fde_reg_ofs[14];
4240} DebugFrame;
4241
4242/* We're expecting a 2 byte uleb128 encoded value.  */
4243QEMU_BUILD_BUG_ON(FRAME_SIZE >= (1 << 14));
4244
4245#if !defined(__ELF__)
4246    /* Host machine without ELF. */
4247#elif TCG_TARGET_REG_BITS == 64
4248#define ELF_HOST_MACHINE EM_X86_64
4249static const DebugFrame debug_frame = {
4250    .h.cie.len = sizeof(DebugFrameCIE)-4, /* length after .len member */
4251    .h.cie.id = -1,
4252    .h.cie.version = 1,
4253    .h.cie.code_align = 1,
4254    .h.cie.data_align = 0x78,             /* sleb128 -8 */
4255    .h.cie.return_column = 16,
4256
4257    /* Total FDE size does not include the "len" member.  */
4258    .h.fde.len = sizeof(DebugFrame) - offsetof(DebugFrame, h.fde.cie_offset),
4259
4260    .fde_def_cfa = {
4261        12, 7,                          /* DW_CFA_def_cfa %rsp, ... */
4262        (FRAME_SIZE & 0x7f) | 0x80,     /* ... uleb128 FRAME_SIZE */
4263        (FRAME_SIZE >> 7)
4264    },
4265    .fde_reg_ofs = {
4266        0x90, 1,                        /* DW_CFA_offset, %rip, -8 */
4267        /* The following ordering must match tcg_target_callee_save_regs.  */
4268        0x86, 2,                        /* DW_CFA_offset, %rbp, -16 */
4269        0x83, 3,                        /* DW_CFA_offset, %rbx, -24 */
4270        0x8c, 4,                        /* DW_CFA_offset, %r12, -32 */
4271        0x8d, 5,                        /* DW_CFA_offset, %r13, -40 */
4272        0x8e, 6,                        /* DW_CFA_offset, %r14, -48 */
4273        0x8f, 7,                        /* DW_CFA_offset, %r15, -56 */
4274    }
4275};
4276#else
4277#define ELF_HOST_MACHINE EM_386
4278static const DebugFrame debug_frame = {
4279    .h.cie.len = sizeof(DebugFrameCIE)-4, /* length after .len member */
4280    .h.cie.id = -1,
4281    .h.cie.version = 1,
4282    .h.cie.code_align = 1,
4283    .h.cie.data_align = 0x7c,             /* sleb128 -4 */
4284    .h.cie.return_column = 8,
4285
4286    /* Total FDE size does not include the "len" member.  */
4287    .h.fde.len = sizeof(DebugFrame) - offsetof(DebugFrame, h.fde.cie_offset),
4288
4289    .fde_def_cfa = {
4290        12, 4,                          /* DW_CFA_def_cfa %esp, ... */
4291        (FRAME_SIZE & 0x7f) | 0x80,     /* ... uleb128 FRAME_SIZE */
4292        (FRAME_SIZE >> 7)
4293    },
4294    .fde_reg_ofs = {
4295        0x88, 1,                        /* DW_CFA_offset, %eip, -4 */
4296        /* The following ordering must match tcg_target_callee_save_regs.  */
4297        0x85, 2,                        /* DW_CFA_offset, %ebp, -8 */
4298        0x83, 3,                        /* DW_CFA_offset, %ebx, -12 */
4299        0x86, 4,                        /* DW_CFA_offset, %esi, -16 */
4300        0x87, 5,                        /* DW_CFA_offset, %edi, -20 */
4301    }
4302};
4303#endif
4304
#ifdef ELF_HOST_MACHINE
/*
 * Register the generated-code buffer BUF of BUF_SIZE bytes, passing the
 * static debug_frame as its unwind description.  Only built when an ELF
 * host machine was selected above.
 */
void tcg_register_jit(const void *buf, size_t buf_size)
{
    tcg_register_jit_int(buf, buf_size, &debug_frame, sizeof(debug_frame));
}
#endif
4311