xref: /qemu/tcg/i386/tcg-target.c.inc (revision 7c1f51bf)
1/*
2 * Tiny Code Generator for QEMU
3 *
4 * Copyright (c) 2008 Fabrice Bellard
5 *
6 * Permission is hereby granted, free of charge, to any person obtaining a copy
7 * of this software and associated documentation files (the "Software"), to deal
8 * in the Software without restriction, including without limitation the rights
9 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
10 * copies of the Software, and to permit persons to whom the Software is
11 * furnished to do so, subject to the following conditions:
12 *
13 * The above copyright notice and this permission notice shall be included in
14 * all copies or substantial portions of the Software.
15 *
16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
19 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
22 * THE SOFTWARE.
23 */
24
25#include "../tcg-ldst.c.inc"
26#include "../tcg-pool.c.inc"
27
28#ifdef CONFIG_DEBUG_TCG
29static const char * const tcg_target_reg_names[TCG_TARGET_NB_REGS] = {
30#if TCG_TARGET_REG_BITS == 64
31    "%rax", "%rcx", "%rdx", "%rbx", "%rsp", "%rbp", "%rsi", "%rdi",
32#else
33    "%eax", "%ecx", "%edx", "%ebx", "%esp", "%ebp", "%esi", "%edi",
34#endif
35    "%r8",  "%r9",  "%r10", "%r11", "%r12", "%r13", "%r14", "%r15",
36    "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7",
37#if TCG_TARGET_REG_BITS == 64
38    "%xmm8", "%xmm9", "%xmm10", "%xmm11",
39    "%xmm12", "%xmm13", "%xmm14", "%xmm15",
40#endif
41};
42#endif
43
44static const int tcg_target_reg_alloc_order[] = {
45#if TCG_TARGET_REG_BITS == 64
46    TCG_REG_RBP,
47    TCG_REG_RBX,
48    TCG_REG_R12,
49    TCG_REG_R13,
50    TCG_REG_R14,
51    TCG_REG_R15,
52    TCG_REG_R10,
53    TCG_REG_R11,
54    TCG_REG_R9,
55    TCG_REG_R8,
56    TCG_REG_RCX,
57    TCG_REG_RDX,
58    TCG_REG_RSI,
59    TCG_REG_RDI,
60    TCG_REG_RAX,
61#else
62    TCG_REG_EBX,
63    TCG_REG_ESI,
64    TCG_REG_EDI,
65    TCG_REG_EBP,
66    TCG_REG_ECX,
67    TCG_REG_EDX,
68    TCG_REG_EAX,
69#endif
70    TCG_REG_XMM0,
71    TCG_REG_XMM1,
72    TCG_REG_XMM2,
73    TCG_REG_XMM3,
74    TCG_REG_XMM4,
75    TCG_REG_XMM5,
76#ifndef _WIN64
77    /* The Win64 ABI has xmm6-xmm15 as callee-saves, and we do not save
78       any of them.  Therefore only allow xmm0-xmm5 to be allocated.  */
79    TCG_REG_XMM6,
80    TCG_REG_XMM7,
81#if TCG_TARGET_REG_BITS == 64
82    TCG_REG_XMM8,
83    TCG_REG_XMM9,
84    TCG_REG_XMM10,
85    TCG_REG_XMM11,
86    TCG_REG_XMM12,
87    TCG_REG_XMM13,
88    TCG_REG_XMM14,
89    TCG_REG_XMM15,
90#endif
91#endif
92};
93
94static const int tcg_target_call_iarg_regs[] = {
95#if TCG_TARGET_REG_BITS == 64
96#if defined(_WIN64)
97    TCG_REG_RCX,
98    TCG_REG_RDX,
99#else
100    TCG_REG_RDI,
101    TCG_REG_RSI,
102    TCG_REG_RDX,
103    TCG_REG_RCX,
104#endif
105    TCG_REG_R8,
106    TCG_REG_R9,
107#else
108    /* 32-bit mode uses a stack-based calling convention (GCC default). */
109#endif
110};
111
112static TCGReg tcg_target_call_oarg_reg(TCGCallReturnKind kind, int slot)
113{
114    switch (kind) {
115    case TCG_CALL_RET_NORMAL:
116        tcg_debug_assert(slot >= 0 && slot <= 1);
117        return slot ? TCG_REG_EDX : TCG_REG_EAX;
118#ifdef _WIN64
119    case TCG_CALL_RET_BY_VEC:
120        tcg_debug_assert(slot == 0);
121        return TCG_REG_XMM0;
122#endif
123    default:
124        g_assert_not_reached();
125    }
126}
127
128/* Constants we accept.  */
129#define TCG_CT_CONST_S32 0x100
130#define TCG_CT_CONST_U32 0x200
131#define TCG_CT_CONST_I32 0x400
132#define TCG_CT_CONST_WSZ 0x800
133
134/* Registers used with the L constraint, which are the first two argument
135   registers on x86_64, and two arbitrary call-clobbered registers on
136   i386. */
137#if TCG_TARGET_REG_BITS == 64
138# define TCG_REG_L0 tcg_target_call_iarg_regs[0]
139# define TCG_REG_L1 tcg_target_call_iarg_regs[1]
140#else
141# define TCG_REG_L0 TCG_REG_EAX
142# define TCG_REG_L1 TCG_REG_EDX
143#endif
144
145#define ALL_BYTEH_REGS         0x0000000fu
146#if TCG_TARGET_REG_BITS == 64
147# define ALL_GENERAL_REGS      0x0000ffffu
148# define ALL_VECTOR_REGS       0xffff0000u
149# define ALL_BYTEL_REGS        ALL_GENERAL_REGS
150#else
151# define ALL_GENERAL_REGS      0x000000ffu
152# define ALL_VECTOR_REGS       0x00ff0000u
153# define ALL_BYTEL_REGS        ALL_BYTEH_REGS
154#endif
155#ifdef CONFIG_SOFTMMU
156# define SOFTMMU_RESERVE_REGS  ((1 << TCG_REG_L0) | (1 << TCG_REG_L1))
157#else
158# define SOFTMMU_RESERVE_REGS  0
159#endif
160
161/* The host compiler should supply <cpuid.h> to enable runtime feature
162   detection, as we're not going to go so far as writing our own inline
163   assembly.  If it is not available, default values will be assumed.  */
164#if defined(CONFIG_CPUID_H)
165#include "qemu/cpuid.h"
166#endif
167
168/* For 64-bit, we always know that CMOV is available.  */
169#if TCG_TARGET_REG_BITS == 64
170# define have_cmov 1
171#elif defined(CONFIG_CPUID_H)
172static bool have_cmov;
173#else
174# define have_cmov 0
175#endif
176
177/* We need these symbols in tcg-target.h, and we can't properly conditionalize
178   them there.  Therefore we always define the variables.  */
179bool have_bmi1;
180bool have_popcnt;
181bool have_avx1;
182bool have_avx2;
183bool have_avx512bw;
184bool have_avx512dq;
185bool have_avx512vbmi2;
186bool have_avx512vl;
187bool have_movbe;
188bool have_atomic16;
189
190#ifdef CONFIG_CPUID_H
191static bool have_bmi2;
192static bool have_lzcnt;
193#else
194# define have_bmi2 0
195# define have_lzcnt 0
196#endif
197
198static const tcg_insn_unit *tb_ret_addr;
199
200static bool patch_reloc(tcg_insn_unit *code_ptr, int type,
201                        intptr_t value, intptr_t addend)
202{
203    value += addend;
204    switch(type) {
205    case R_386_PC32:
206        value -= (uintptr_t)tcg_splitwx_to_rx(code_ptr);
207        if (value != (int32_t)value) {
208            return false;
209        }
210        /* FALLTHRU */
211    case R_386_32:
212        tcg_patch32(code_ptr, value);
213        break;
214    case R_386_PC8:
215        value -= (uintptr_t)tcg_splitwx_to_rx(code_ptr);
216        if (value != (int8_t)value) {
217            return false;
218        }
219        tcg_patch8(code_ptr, value);
220        break;
221    default:
222        g_assert_not_reached();
223    }
224    return true;
225}
226
227/* test if a constant matches the constraint */
228static bool tcg_target_const_match(int64_t val, TCGType type, int ct)
229{
230    if (ct & TCG_CT_CONST) {
231        return 1;
232    }
233    if (type == TCG_TYPE_I32) {
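        /* Any value of a 32-bit type trivially satisfies the 32-bit
           immediate constraints.  */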
234        if (ct & (TCG_CT_CONST_S32 | TCG_CT_CONST_U32 | TCG_CT_CONST_I32)) {
235            return 1;
236        }
237    } else {
238        if ((ct & TCG_CT_CONST_S32) && val == (int32_t)val) {
239            return 1;
240        }
241        if ((ct & TCG_CT_CONST_U32) && val == (uint32_t)val) {
242            return 1;
243        }
244        if ((ct & TCG_CT_CONST_I32) && ~val == (int32_t)~val) {
245            return 1;
246        }
247    }
248    if ((ct & TCG_CT_CONST_WSZ) && val == (type == TCG_TYPE_I32 ? 32 : 64)) {
249        return 1;
250    }
251    return 0;
252}
253
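/* Low 3 bits of a register number, as encoded in the ModRM/SIB bytes;
   bit 3 is carried by the REX/VEX/EVEX prefix instead.  */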
254# define LOWREGMASK(x)	((x) & 7)
255
256#define P_EXT		0x100		/* 0x0f opcode prefix */
257#define P_EXT38         0x200           /* 0x0f 0x38 opcode prefix */
258#define P_DATA16        0x400           /* 0x66 opcode prefix */
259#define P_VEXW          0x1000          /* Set VEX.W = 1 */
260#if TCG_TARGET_REG_BITS == 64
261# define P_REXW         P_VEXW          /* Set REX.W = 1; match VEXW */
262# define P_REXB_R       0x2000          /* REG field as byte register */
263# define P_REXB_RM      0x4000          /* R/M field as byte register */
264# define P_GS           0x8000          /* gs segment override */
265#else
266# define P_REXW		0
267# define P_REXB_R	0
268# define P_REXB_RM	0
269# define P_GS           0
270#endif
271#define P_EXT3A         0x10000         /* 0x0f 0x3a opcode prefix */
272#define P_SIMDF3        0x20000         /* 0xf3 opcode prefix */
273#define P_SIMDF2        0x40000         /* 0xf2 opcode prefix */
274#define P_VEXL          0x80000         /* Set VEX.L = 1 */
275#define P_EVEX          0x100000        /* Requires EVEX encoding */
276
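/* The OPC_* values pack the primary opcode in the low byte and prefix flags
   above it; e.g. OPC_MOVZWL (0xb7 | P_EXT) is emitted as 0f b7 plus ModRM,
   while P_DATA16 would add a leading 0x66.  */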
277#define OPC_ARITH_EvIz	(0x81)
278#define OPC_ARITH_EvIb	(0x83)
279#define OPC_ARITH_GvEv	(0x03)		/* ... plus (ARITH_FOO << 3) */
280#define OPC_ANDN        (0xf2 | P_EXT38)
281#define OPC_ADD_GvEv	(OPC_ARITH_GvEv | (ARITH_ADD << 3))
282#define OPC_AND_GvEv    (OPC_ARITH_GvEv | (ARITH_AND << 3))
283#define OPC_BLENDPS     (0x0c | P_EXT3A | P_DATA16)
284#define OPC_BSF         (0xbc | P_EXT)
285#define OPC_BSR         (0xbd | P_EXT)
286#define OPC_BSWAP	(0xc8 | P_EXT)
287#define OPC_CALL_Jz	(0xe8)
288#define OPC_CMOVCC      (0x40 | P_EXT)  /* ... plus condition code */
289#define OPC_CMP_GvEv	(OPC_ARITH_GvEv | (ARITH_CMP << 3))
290#define OPC_DEC_r32	(0x48)
291#define OPC_IMUL_GvEv	(0xaf | P_EXT)
292#define OPC_IMUL_GvEvIb	(0x6b)
293#define OPC_IMUL_GvEvIz	(0x69)
294#define OPC_INC_r32	(0x40)
295#define OPC_JCC_long	(0x80 | P_EXT)	/* ... plus condition code */
296#define OPC_JCC_short	(0x70)		/* ... plus condition code */
297#define OPC_JMP_long	(0xe9)
298#define OPC_JMP_short	(0xeb)
299#define OPC_LEA         (0x8d)
300#define OPC_LZCNT       (0xbd | P_EXT | P_SIMDF3)
301#define OPC_MOVB_EvGv	(0x88)		/* stores, more or less */
302#define OPC_MOVL_EvGv	(0x89)		/* stores, more or less */
303#define OPC_MOVL_GvEv	(0x8b)		/* loads, more or less */
304#define OPC_MOVB_EvIz   (0xc6)
305#define OPC_MOVL_EvIz	(0xc7)
306#define OPC_MOVL_Iv     (0xb8)
307#define OPC_MOVBE_GyMy  (0xf0 | P_EXT38)
308#define OPC_MOVBE_MyGy  (0xf1 | P_EXT38)
309#define OPC_MOVD_VyEy   (0x6e | P_EXT | P_DATA16)
310#define OPC_MOVD_EyVy   (0x7e | P_EXT | P_DATA16)
311#define OPC_MOVDDUP     (0x12 | P_EXT | P_SIMDF2)
312#define OPC_MOVDQA_VxWx (0x6f | P_EXT | P_DATA16)
313#define OPC_MOVDQA_WxVx (0x7f | P_EXT | P_DATA16)
314#define OPC_MOVDQU_VxWx (0x6f | P_EXT | P_SIMDF3)
315#define OPC_MOVDQU_WxVx (0x7f | P_EXT | P_SIMDF3)
316#define OPC_MOVQ_VqWq   (0x7e | P_EXT | P_SIMDF3)
317#define OPC_MOVQ_WqVq   (0xd6 | P_EXT | P_DATA16)
318#define OPC_MOVSBL	(0xbe | P_EXT)
319#define OPC_MOVSWL	(0xbf | P_EXT)
320#define OPC_MOVSLQ	(0x63 | P_REXW)
321#define OPC_MOVZBL	(0xb6 | P_EXT)
322#define OPC_MOVZWL	(0xb7 | P_EXT)
323#define OPC_PABSB       (0x1c | P_EXT38 | P_DATA16)
324#define OPC_PABSW       (0x1d | P_EXT38 | P_DATA16)
325#define OPC_PABSD       (0x1e | P_EXT38 | P_DATA16)
326#define OPC_VPABSQ      (0x1f | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX)
327#define OPC_PACKSSDW    (0x6b | P_EXT | P_DATA16)
328#define OPC_PACKSSWB    (0x63 | P_EXT | P_DATA16)
329#define OPC_PACKUSDW    (0x2b | P_EXT38 | P_DATA16)
330#define OPC_PACKUSWB    (0x67 | P_EXT | P_DATA16)
331#define OPC_PADDB       (0xfc | P_EXT | P_DATA16)
332#define OPC_PADDW       (0xfd | P_EXT | P_DATA16)
333#define OPC_PADDD       (0xfe | P_EXT | P_DATA16)
334#define OPC_PADDQ       (0xd4 | P_EXT | P_DATA16)
335#define OPC_PADDSB      (0xec | P_EXT | P_DATA16)
336#define OPC_PADDSW      (0xed | P_EXT | P_DATA16)
337#define OPC_PADDUB      (0xdc | P_EXT | P_DATA16)
338#define OPC_PADDUW      (0xdd | P_EXT | P_DATA16)
339#define OPC_PAND        (0xdb | P_EXT | P_DATA16)
340#define OPC_PANDN       (0xdf | P_EXT | P_DATA16)
341#define OPC_PBLENDW     (0x0e | P_EXT3A | P_DATA16)
342#define OPC_PCMPEQB     (0x74 | P_EXT | P_DATA16)
343#define OPC_PCMPEQW     (0x75 | P_EXT | P_DATA16)
344#define OPC_PCMPEQD     (0x76 | P_EXT | P_DATA16)
345#define OPC_PCMPEQQ     (0x29 | P_EXT38 | P_DATA16)
346#define OPC_PCMPGTB     (0x64 | P_EXT | P_DATA16)
347#define OPC_PCMPGTW     (0x65 | P_EXT | P_DATA16)
348#define OPC_PCMPGTD     (0x66 | P_EXT | P_DATA16)
349#define OPC_PCMPGTQ     (0x37 | P_EXT38 | P_DATA16)
350#define OPC_PMAXSB      (0x3c | P_EXT38 | P_DATA16)
351#define OPC_PMAXSW      (0xee | P_EXT | P_DATA16)
352#define OPC_PMAXSD      (0x3d | P_EXT38 | P_DATA16)
353#define OPC_VPMAXSQ     (0x3d | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX)
354#define OPC_PMAXUB      (0xde | P_EXT | P_DATA16)
355#define OPC_PMAXUW      (0x3e | P_EXT38 | P_DATA16)
356#define OPC_PMAXUD      (0x3f | P_EXT38 | P_DATA16)
357#define OPC_VPMAXUQ     (0x3f | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX)
358#define OPC_PMINSB      (0x38 | P_EXT38 | P_DATA16)
359#define OPC_PMINSW      (0xea | P_EXT | P_DATA16)
360#define OPC_PMINSD      (0x39 | P_EXT38 | P_DATA16)
361#define OPC_VPMINSQ     (0x39 | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX)
362#define OPC_PMINUB      (0xda | P_EXT | P_DATA16)
363#define OPC_PMINUW      (0x3a | P_EXT38 | P_DATA16)
364#define OPC_PMINUD      (0x3b | P_EXT38 | P_DATA16)
365#define OPC_VPMINUQ     (0x3b | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX)
366#define OPC_PMOVSXBW    (0x20 | P_EXT38 | P_DATA16)
367#define OPC_PMOVSXWD    (0x23 | P_EXT38 | P_DATA16)
368#define OPC_PMOVSXDQ    (0x25 | P_EXT38 | P_DATA16)
369#define OPC_PMOVZXBW    (0x30 | P_EXT38 | P_DATA16)
370#define OPC_PMOVZXWD    (0x33 | P_EXT38 | P_DATA16)
371#define OPC_PMOVZXDQ    (0x35 | P_EXT38 | P_DATA16)
372#define OPC_PMULLW      (0xd5 | P_EXT | P_DATA16)
373#define OPC_PMULLD      (0x40 | P_EXT38 | P_DATA16)
374#define OPC_VPMULLQ     (0x40 | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX)
375#define OPC_POR         (0xeb | P_EXT | P_DATA16)
376#define OPC_PSHUFB      (0x00 | P_EXT38 | P_DATA16)
377#define OPC_PSHUFD      (0x70 | P_EXT | P_DATA16)
378#define OPC_PSHUFLW     (0x70 | P_EXT | P_SIMDF2)
379#define OPC_PSHUFHW     (0x70 | P_EXT | P_SIMDF3)
380#define OPC_PSHIFTW_Ib  (0x71 | P_EXT | P_DATA16) /* /2 /6 /4 */
381#define OPC_PSHIFTD_Ib  (0x72 | P_EXT | P_DATA16) /* /1 /2 /6 /4 */
382#define OPC_PSHIFTQ_Ib  (0x73 | P_EXT | P_DATA16) /* /2 /6 /4 */
383#define OPC_PSLLW       (0xf1 | P_EXT | P_DATA16)
384#define OPC_PSLLD       (0xf2 | P_EXT | P_DATA16)
385#define OPC_PSLLQ       (0xf3 | P_EXT | P_DATA16)
386#define OPC_PSRAW       (0xe1 | P_EXT | P_DATA16)
387#define OPC_PSRAD       (0xe2 | P_EXT | P_DATA16)
388#define OPC_VPSRAQ      (0xe2 | P_EXT | P_DATA16 | P_VEXW | P_EVEX)
389#define OPC_PSRLW       (0xd1 | P_EXT | P_DATA16)
390#define OPC_PSRLD       (0xd2 | P_EXT | P_DATA16)
391#define OPC_PSRLQ       (0xd3 | P_EXT | P_DATA16)
392#define OPC_PSUBB       (0xf8 | P_EXT | P_DATA16)
393#define OPC_PSUBW       (0xf9 | P_EXT | P_DATA16)
394#define OPC_PSUBD       (0xfa | P_EXT | P_DATA16)
395#define OPC_PSUBQ       (0xfb | P_EXT | P_DATA16)
396#define OPC_PSUBSB      (0xe8 | P_EXT | P_DATA16)
397#define OPC_PSUBSW      (0xe9 | P_EXT | P_DATA16)
398#define OPC_PSUBUB      (0xd8 | P_EXT | P_DATA16)
399#define OPC_PSUBUW      (0xd9 | P_EXT | P_DATA16)
400#define OPC_PUNPCKLBW   (0x60 | P_EXT | P_DATA16)
401#define OPC_PUNPCKLWD   (0x61 | P_EXT | P_DATA16)
402#define OPC_PUNPCKLDQ   (0x62 | P_EXT | P_DATA16)
403#define OPC_PUNPCKLQDQ  (0x6c | P_EXT | P_DATA16)
404#define OPC_PUNPCKHBW   (0x68 | P_EXT | P_DATA16)
405#define OPC_PUNPCKHWD   (0x69 | P_EXT | P_DATA16)
406#define OPC_PUNPCKHDQ   (0x6a | P_EXT | P_DATA16)
407#define OPC_PUNPCKHQDQ  (0x6d | P_EXT | P_DATA16)
408#define OPC_PXOR        (0xef | P_EXT | P_DATA16)
409#define OPC_POP_r32	(0x58)
410#define OPC_POPCNT      (0xb8 | P_EXT | P_SIMDF3)
411#define OPC_PUSH_r32	(0x50)
412#define OPC_PUSH_Iv	(0x68)
413#define OPC_PUSH_Ib	(0x6a)
414#define OPC_RET		(0xc3)
415#define OPC_SETCC	(0x90 | P_EXT | P_REXB_RM) /* ... plus cc */
416#define OPC_SHIFT_1	(0xd1)
417#define OPC_SHIFT_Ib	(0xc1)
418#define OPC_SHIFT_cl	(0xd3)
419#define OPC_SARX        (0xf7 | P_EXT38 | P_SIMDF3)
420#define OPC_SHUFPS      (0xc6 | P_EXT)
421#define OPC_SHLX        (0xf7 | P_EXT38 | P_DATA16)
422#define OPC_SHRX        (0xf7 | P_EXT38 | P_SIMDF2)
423#define OPC_SHRD_Ib     (0xac | P_EXT)
424#define OPC_TESTL	(0x85)
425#define OPC_TZCNT       (0xbc | P_EXT | P_SIMDF3)
426#define OPC_UD2         (0x0b | P_EXT)
427#define OPC_VPBLENDD    (0x02 | P_EXT3A | P_DATA16)
428#define OPC_VPBLENDVB   (0x4c | P_EXT3A | P_DATA16)
429#define OPC_VPINSRB     (0x20 | P_EXT3A | P_DATA16)
430#define OPC_VPINSRW     (0xc4 | P_EXT | P_DATA16)
431#define OPC_VBROADCASTSS (0x18 | P_EXT38 | P_DATA16)
432#define OPC_VBROADCASTSD (0x19 | P_EXT38 | P_DATA16)
433#define OPC_VPBROADCASTB (0x78 | P_EXT38 | P_DATA16)
434#define OPC_VPBROADCASTW (0x79 | P_EXT38 | P_DATA16)
435#define OPC_VPBROADCASTD (0x58 | P_EXT38 | P_DATA16)
436#define OPC_VPBROADCASTQ (0x59 | P_EXT38 | P_DATA16)
437#define OPC_VPERMQ      (0x00 | P_EXT3A | P_DATA16 | P_VEXW)
438#define OPC_VPERM2I128  (0x46 | P_EXT3A | P_DATA16 | P_VEXL)
439#define OPC_VPROLVD     (0x15 | P_EXT38 | P_DATA16 | P_EVEX)
440#define OPC_VPROLVQ     (0x15 | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX)
441#define OPC_VPRORVD     (0x14 | P_EXT38 | P_DATA16 | P_EVEX)
442#define OPC_VPRORVQ     (0x14 | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX)
443#define OPC_VPSHLDW     (0x70 | P_EXT3A | P_DATA16 | P_VEXW | P_EVEX)
444#define OPC_VPSHLDD     (0x71 | P_EXT3A | P_DATA16 | P_EVEX)
445#define OPC_VPSHLDQ     (0x71 | P_EXT3A | P_DATA16 | P_VEXW | P_EVEX)
446#define OPC_VPSHLDVW    (0x70 | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX)
447#define OPC_VPSHLDVD    (0x71 | P_EXT38 | P_DATA16 | P_EVEX)
448#define OPC_VPSHLDVQ    (0x71 | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX)
449#define OPC_VPSHRDVW    (0x72 | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX)
450#define OPC_VPSHRDVD    (0x73 | P_EXT38 | P_DATA16 | P_EVEX)
451#define OPC_VPSHRDVQ    (0x73 | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX)
452#define OPC_VPSLLVW     (0x12 | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX)
453#define OPC_VPSLLVD     (0x47 | P_EXT38 | P_DATA16)
454#define OPC_VPSLLVQ     (0x47 | P_EXT38 | P_DATA16 | P_VEXW)
455#define OPC_VPSRAVW     (0x11 | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX)
456#define OPC_VPSRAVD     (0x46 | P_EXT38 | P_DATA16)
457#define OPC_VPSRAVQ     (0x46 | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX)
458#define OPC_VPSRLVW     (0x10 | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX)
459#define OPC_VPSRLVD     (0x45 | P_EXT38 | P_DATA16)
460#define OPC_VPSRLVQ     (0x45 | P_EXT38 | P_DATA16 | P_VEXW)
461#define OPC_VPTERNLOGQ  (0x25 | P_EXT3A | P_DATA16 | P_VEXW | P_EVEX)
462#define OPC_VZEROUPPER  (0x77 | P_EXT)
463#define OPC_XCHG_ax_r32	(0x90)
464#define OPC_XCHG_EvGv   (0x87)
465
466#define OPC_GRP3_Eb     (0xf6)
467#define OPC_GRP3_Ev     (0xf7)
468#define OPC_GRP5        (0xff)
469#define OPC_GRP14       (0x73 | P_EXT | P_DATA16)
470
471/* Group 1 opcode extensions for 0x80-0x83.
472   These are also used as modifiers for OPC_ARITH.  */
473#define ARITH_ADD 0
474#define ARITH_OR  1
475#define ARITH_ADC 2
476#define ARITH_SBB 3
477#define ARITH_AND 4
478#define ARITH_SUB 5
479#define ARITH_XOR 6
480#define ARITH_CMP 7
481
482/* Group 2 opcode extensions for 0xc0, 0xc1, 0xd0-0xd3.  */
483#define SHIFT_ROL 0
484#define SHIFT_ROR 1
485#define SHIFT_SHL 4
486#define SHIFT_SHR 5
487#define SHIFT_SAR 7
488
489/* Group 3 opcode extensions for 0xf6, 0xf7.  To be used with OPC_GRP3.  */
490#define EXT3_TESTi 0
491#define EXT3_NOT   2
492#define EXT3_NEG   3
493#define EXT3_MUL   4
494#define EXT3_IMUL  5
495#define EXT3_DIV   6
496#define EXT3_IDIV  7
497
498/* Group 5 opcode extensions for 0xff.  To be used with OPC_GRP5.  */
499#define EXT5_INC_Ev	0
500#define EXT5_DEC_Ev	1
501#define EXT5_CALLN_Ev	2
502#define EXT5_JMPN_Ev	4
503
504/* Condition codes to be added to OPC_JCC_{long,short}.  */
505#define JCC_JMP (-1)
506#define JCC_JO  0x0
507#define JCC_JNO 0x1
508#define JCC_JB  0x2
509#define JCC_JAE 0x3
510#define JCC_JE  0x4
511#define JCC_JNE 0x5
512#define JCC_JBE 0x6
513#define JCC_JA  0x7
514#define JCC_JS  0x8
515#define JCC_JNS 0x9
516#define JCC_JP  0xa
517#define JCC_JNP 0xb
518#define JCC_JL  0xc
519#define JCC_JGE 0xd
520#define JCC_JLE 0xe
521#define JCC_JG  0xf
522
523static const uint8_t tcg_cond_to_jcc[] = {
524    [TCG_COND_EQ] = JCC_JE,
525    [TCG_COND_NE] = JCC_JNE,
526    [TCG_COND_LT] = JCC_JL,
527    [TCG_COND_GE] = JCC_JGE,
528    [TCG_COND_LE] = JCC_JLE,
529    [TCG_COND_GT] = JCC_JG,
530    [TCG_COND_LTU] = JCC_JB,
531    [TCG_COND_GEU] = JCC_JAE,
532    [TCG_COND_LEU] = JCC_JBE,
533    [TCG_COND_GTU] = JCC_JA,
534};
535
536#if TCG_TARGET_REG_BITS == 64
537static void tcg_out_opc(TCGContext *s, int opc, int r, int rm, int x)
538{
539    int rex;
540
541    if (opc & P_GS) {
542        tcg_out8(s, 0x65);
543    }
544    if (opc & P_DATA16) {
545        /* We should never be asking for both 16-bit and 64-bit operation.  */
546        tcg_debug_assert((opc & P_REXW) == 0);
547        tcg_out8(s, 0x66);
548    }
549    if (opc & P_SIMDF3) {
550        tcg_out8(s, 0xf3);
551    } else if (opc & P_SIMDF2) {
552        tcg_out8(s, 0xf2);
553    }
554
555    rex = 0;
556    rex |= (opc & P_REXW) ? 0x8 : 0x0;  /* REX.W */
557    rex |= (r & 8) >> 1;                /* REX.R */
558    rex |= (x & 8) >> 2;                /* REX.X */
559    rex |= (rm & 8) >> 3;               /* REX.B */
560
561    /* P_REXB_{R,RM} indicates that the given register is the low byte.
562       For %[abcd]l we need no REX prefix, but for %{si,di,bp,sp}l we do,
563       as otherwise the encoding indicates %[abcd]h.  Note that the values
564       that are ORed in merely indicate that the REX byte must be present;
565       those bits get discarded in output.  */
566    rex |= opc & (r >= 4 ? P_REXB_R : 0);
567    rex |= opc & (rm >= 4 ? P_REXB_RM : 0);
568
569    if (rex) {
570        tcg_out8(s, (uint8_t)(rex | 0x40));
571    }
572
573    if (opc & (P_EXT | P_EXT38 | P_EXT3A)) {
574        tcg_out8(s, 0x0f);
575        if (opc & P_EXT38) {
576            tcg_out8(s, 0x38);
577        } else if (opc & P_EXT3A) {
578            tcg_out8(s, 0x3a);
579        }
580    }
581
582    tcg_out8(s, opc);
583}
584#else
585static void tcg_out_opc(TCGContext *s, int opc)
586{
587    if (opc & P_DATA16) {
588        tcg_out8(s, 0x66);
589    }
590    if (opc & P_SIMDF3) {
591        tcg_out8(s, 0xf3);
592    } else if (opc & P_SIMDF2) {
593        tcg_out8(s, 0xf2);
594    }
595    if (opc & (P_EXT | P_EXT38 | P_EXT3A)) {
596        tcg_out8(s, 0x0f);
597        if (opc & P_EXT38) {
598            tcg_out8(s, 0x38);
599        } else if (opc & P_EXT3A) {
600            tcg_out8(s, 0x3a);
601        }
602    }
603    tcg_out8(s, opc);
604}
605/* Discard the register arguments to tcg_out_opc early, so as not to penalize
606   the 32-bit compilation paths.  This method works with all versions of gcc,
607   whereas relying on optimization to eliminate them may not.  */
608#define tcg_out_opc(s, opc, r, rm, x)  (tcg_out_opc)(s, opc)
609#endif
610
611static void tcg_out_modrm(TCGContext *s, int opc, int r, int rm)
612{
613    tcg_out_opc(s, opc, r, rm, 0);
614    tcg_out8(s, 0xc0 | (LOWREGMASK(r) << 3) | LOWREGMASK(rm));
615}
616
617static void tcg_out_vex_opc(TCGContext *s, int opc, int r, int v,
618                            int rm, int index)
619{
620    int tmp;
621
622    /* Use the two byte form if possible, which cannot encode
623       VEX.W, VEX.B, VEX.X, or an m-mmmm field other than P_EXT.  */
624    if ((opc & (P_EXT | P_EXT38 | P_EXT3A | P_VEXW)) == P_EXT
625        && ((rm | index) & 8) == 0) {
626        /* Two byte VEX prefix.  */
627        tcg_out8(s, 0xc5);
628
629        tmp = (r & 8 ? 0 : 0x80);              /* VEX.R */
630    } else {
631        /* Three byte VEX prefix.  */
632        tcg_out8(s, 0xc4);
633
634        /* VEX.m-mmmm */
635        if (opc & P_EXT3A) {
636            tmp = 3;
637        } else if (opc & P_EXT38) {
638            tmp = 2;
639        } else if (opc & P_EXT) {
640            tmp = 1;
641        } else {
642            g_assert_not_reached();
643        }
644        tmp |= (r & 8 ? 0 : 0x80);             /* VEX.R */
645        tmp |= (index & 8 ? 0 : 0x40);         /* VEX.X */
646        tmp |= (rm & 8 ? 0 : 0x20);            /* VEX.B */
647        tcg_out8(s, tmp);
648
649        tmp = (opc & P_VEXW ? 0x80 : 0);       /* VEX.W */
650    }
651
652    tmp |= (opc & P_VEXL ? 0x04 : 0);      /* VEX.L */
653    /* VEX.pp */
654    if (opc & P_DATA16) {
655        tmp |= 1;                          /* 0x66 */
656    } else if (opc & P_SIMDF3) {
657        tmp |= 2;                          /* 0xf3 */
658    } else if (opc & P_SIMDF2) {
659        tmp |= 3;                          /* 0xf2 */
660    }
661    tmp |= (~v & 15) << 3;                 /* VEX.vvvv */
662    tcg_out8(s, tmp);
663    tcg_out8(s, opc);
664}
665
666static void tcg_out_evex_opc(TCGContext *s, int opc, int r, int v,
667                             int rm, int index)
668{
669    /* The entire 4-byte evex prefix; with R' and V' set. */
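    /* Emitted little-endian: byte 0 is the 0x62 escape, bytes 1-3 are
       P0/P1/P2 pre-loaded with R', the mandatory '1' bit and V'; mm, RXB,
       pp, vvvv, W and L'L are filled in by the deposits below.  */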
670    uint32_t p = 0x08041062;
671    int mm, pp;
672
673    tcg_debug_assert(have_avx512vl);
674
675    /* EVEX.mm */
676    if (opc & P_EXT3A) {
677        mm = 3;
678    } else if (opc & P_EXT38) {
679        mm = 2;
680    } else if (opc & P_EXT) {
681        mm = 1;
682    } else {
683        g_assert_not_reached();
684    }
685
686    /* EVEX.pp */
687    if (opc & P_DATA16) {
688        pp = 1;                          /* 0x66 */
689    } else if (opc & P_SIMDF3) {
690        pp = 2;                          /* 0xf3 */
691    } else if (opc & P_SIMDF2) {
692        pp = 3;                          /* 0xf2 */
693    } else {
694        pp = 0;
695    }
696
697    p = deposit32(p, 8, 2, mm);
698    p = deposit32(p, 13, 1, (rm & 8) == 0);             /* EVEX.RXB.B */
699    p = deposit32(p, 14, 1, (index & 8) == 0);          /* EVEX.RXB.X */
700    p = deposit32(p, 15, 1, (r & 8) == 0);              /* EVEX.RXB.R */
701    p = deposit32(p, 16, 2, pp);
702    p = deposit32(p, 19, 4, ~v);
703    p = deposit32(p, 23, 1, (opc & P_VEXW) != 0);
704    p = deposit32(p, 29, 2, (opc & P_VEXL) != 0);
705
706    tcg_out32(s, p);
707    tcg_out8(s, opc);
708}
709
710static void tcg_out_vex_modrm(TCGContext *s, int opc, int r, int v, int rm)
711{
712    if (opc & P_EVEX) {
713        tcg_out_evex_opc(s, opc, r, v, rm, 0);
714    } else {
715        tcg_out_vex_opc(s, opc, r, v, rm, 0);
716    }
717    tcg_out8(s, 0xc0 | (LOWREGMASK(r) << 3) | LOWREGMASK(rm));
718}
719
720/* Output an opcode with a full "rm + (index<<shift) + offset" address mode.
721   We handle RM or INDEX being missing with a negative value.  In 64-bit
722   mode for absolute addresses, ~RM is the size of the immediate operand
723   that will follow the instruction.  */
724
725static void tcg_out_sib_offset(TCGContext *s, int r, int rm, int index,
726                               int shift, intptr_t offset)
727{
728    int mod, len;
729
730    if (index < 0 && rm < 0) {
731        if (TCG_TARGET_REG_BITS == 64) {
732            /* Try for a rip-relative addressing mode.  This has replaced
733               the 32-bit-mode absolute addressing encoding.  */
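            /* The displacement is relative to the end of the insn: the opcode
               has already been emitted, 5 covers the modrm byte plus disp32,
               and ~rm is the size of any immediate that follows (see above).  */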
734            intptr_t pc = (intptr_t)s->code_ptr + 5 + ~rm;
735            intptr_t disp = offset - pc;
736            if (disp == (int32_t)disp) {
737                tcg_out8(s, (LOWREGMASK(r) << 3) | 5);
738                tcg_out32(s, disp);
739                return;
740            }
741
742            /* Try for an absolute address encoding.  This requires the
743               use of the MODRM+SIB encoding and is therefore larger than
744               rip-relative addressing.  */
745            if (offset == (int32_t)offset) {
746                tcg_out8(s, (LOWREGMASK(r) << 3) | 4);
747                tcg_out8(s, (4 << 3) | 5);
748                tcg_out32(s, offset);
749                return;
750            }
751
752            /* ??? The memory isn't directly addressable.  */
753            g_assert_not_reached();
754        } else {
755            /* Absolute address.  */
756            tcg_out8(s, (r << 3) | 5);
757            tcg_out32(s, offset);
758            return;
759        }
760    }
761
762    /* Find the length of the immediate addend.  Note that the encoding
763       that would be used for (%ebp) indicates absolute addressing.  */
764    if (rm < 0) {
765        mod = 0, len = 4, rm = 5;
766    } else if (offset == 0 && LOWREGMASK(rm) != TCG_REG_EBP) {
767        mod = 0, len = 0;
768    } else if (offset == (int8_t)offset) {
769        mod = 0x40, len = 1;
770    } else {
771        mod = 0x80, len = 4;
772    }
773
774    /* Use a single byte MODRM format if possible.  Note that the encoding
775       that would be used for %esp is the escape to the two byte form.  */
776    if (index < 0 && LOWREGMASK(rm) != TCG_REG_ESP) {
777        /* Single byte MODRM format.  */
778        tcg_out8(s, mod | (LOWREGMASK(r) << 3) | LOWREGMASK(rm));
779    } else {
780        /* Two byte MODRM+SIB format.  */
781
782        /* Note that the encoding that would place %esp into the index
783           field indicates no index register.  In 64-bit mode, the REX.X
784           bit counts, so %r12 can be used as the index.  */
785        if (index < 0) {
786            index = 4;
787        } else {
788            tcg_debug_assert(index != TCG_REG_ESP);
789        }
790
791        tcg_out8(s, mod | (LOWREGMASK(r) << 3) | 4);
792        tcg_out8(s, (shift << 6) | (LOWREGMASK(index) << 3) | LOWREGMASK(rm));
793    }
794
795    if (len == 1) {
796        tcg_out8(s, offset);
797    } else if (len == 4) {
798        tcg_out32(s, offset);
799    }
800}
801
802static void tcg_out_modrm_sib_offset(TCGContext *s, int opc, int r, int rm,
803                                     int index, int shift, intptr_t offset)
804{
805    tcg_out_opc(s, opc, r, rm < 0 ? 0 : rm, index < 0 ? 0 : index);
806    tcg_out_sib_offset(s, r, rm, index, shift, offset);
807}
808
809static void tcg_out_vex_modrm_sib_offset(TCGContext *s, int opc, int r, int v,
810                                         int rm, int index, int shift,
811                                         intptr_t offset)
812{
813    tcg_out_vex_opc(s, opc, r, v, rm < 0 ? 0 : rm, index < 0 ? 0 : index);
814    tcg_out_sib_offset(s, r, rm, index, shift, offset);
815}
816
817/* A simplification of the above with no index or shift.  */
818static inline void tcg_out_modrm_offset(TCGContext *s, int opc, int r,
819                                        int rm, intptr_t offset)
820{
821    tcg_out_modrm_sib_offset(s, opc, r, rm, -1, 0, offset);
822}
823
824static inline void tcg_out_vex_modrm_offset(TCGContext *s, int opc, int r,
825                                            int v, int rm, intptr_t offset)
826{
827    tcg_out_vex_modrm_sib_offset(s, opc, r, v, rm, -1, 0, offset);
828}
829
830/* Output an opcode with an expected reference to the constant pool.  */
831static inline void tcg_out_modrm_pool(TCGContext *s, int opc, int r)
832{
833    tcg_out_opc(s, opc, r, 0, 0);
834    /* Absolute for 32-bit, pc-relative for 64-bit.  */
835    tcg_out8(s, LOWREGMASK(r) << 3 | 5);
836    tcg_out32(s, 0);
837}
838
839/* Output an opcode with an expected reference to the constant pool.  */
840static inline void tcg_out_vex_modrm_pool(TCGContext *s, int opc, int r)
841{
842    tcg_out_vex_opc(s, opc, r, 0, 0, 0);
843    /* Absolute for 32-bit, pc-relative for 64-bit.  */
844    tcg_out8(s, LOWREGMASK(r) << 3 | 5);
845    tcg_out32(s, 0);
846}
847
848/* Generate dest op= src.  Uses the same ARITH_* codes as tgen_arithi.  */
849static inline void tgen_arithr(TCGContext *s, int subop, int dest, int src)
850{
851    /* Propagate an opcode prefix, such as P_REXW.  */
852    int ext = subop & ~0x7;
853    subop &= 0x7;
854
855    tcg_out_modrm(s, OPC_ARITH_GvEv + (subop << 3) + ext, dest, src);
856}
857
858static bool tcg_out_mov(TCGContext *s, TCGType type, TCGReg ret, TCGReg arg)
859{
860    int rexw = 0;
861
862    if (arg == ret) {
863        return true;
864    }
865    switch (type) {
866    case TCG_TYPE_I64:
867        rexw = P_REXW;
868        /* fallthru */
869    case TCG_TYPE_I32:
870        if (ret < 16) {
871            if (arg < 16) {
872                tcg_out_modrm(s, OPC_MOVL_GvEv + rexw, ret, arg);
873            } else {
874                tcg_out_vex_modrm(s, OPC_MOVD_EyVy + rexw, arg, 0, ret);
875            }
876        } else {
877            if (arg < 16) {
878                tcg_out_vex_modrm(s, OPC_MOVD_VyEy + rexw, ret, 0, arg);
879            } else {
880                tcg_out_vex_modrm(s, OPC_MOVQ_VqWq, ret, 0, arg);
881            }
882        }
883        break;
884
885    case TCG_TYPE_V64:
886        tcg_debug_assert(ret >= 16 && arg >= 16);
887        tcg_out_vex_modrm(s, OPC_MOVQ_VqWq, ret, 0, arg);
888        break;
889    case TCG_TYPE_V128:
890        tcg_debug_assert(ret >= 16 && arg >= 16);
891        tcg_out_vex_modrm(s, OPC_MOVDQA_VxWx, ret, 0, arg);
892        break;
893    case TCG_TYPE_V256:
894        tcg_debug_assert(ret >= 16 && arg >= 16);
895        tcg_out_vex_modrm(s, OPC_MOVDQA_VxWx | P_VEXL, ret, 0, arg);
896        break;
897
898    default:
899        g_assert_not_reached();
900    }
901    return true;
902}
903
904static const int avx2_dup_insn[4] = {
905    OPC_VPBROADCASTB, OPC_VPBROADCASTW,
906    OPC_VPBROADCASTD, OPC_VPBROADCASTQ,
907};
908
909static bool tcg_out_dup_vec(TCGContext *s, TCGType type, unsigned vece,
910                            TCGReg r, TCGReg a)
911{
912    if (have_avx2) {
913        int vex_l = (type == TCG_TYPE_V256 ? P_VEXL : 0);
914        tcg_out_vex_modrm(s, avx2_dup_insn[vece] + vex_l, r, 0, a);
915    } else {
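        /* Without AVX2: widen the element by interleaving it with itself
           until it is 32 bits wide, then broadcast dword 0 with PSHUFD;
           64-bit elements use PUNPCKLQDQ directly.  */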
916        switch (vece) {
917        case MO_8:
918            /* ??? With zero in a register, use PSHUFB.  */
919            tcg_out_vex_modrm(s, OPC_PUNPCKLBW, r, a, a);
920            a = r;
921            /* FALLTHRU */
922        case MO_16:
923            tcg_out_vex_modrm(s, OPC_PUNPCKLWD, r, a, a);
924            a = r;
925            /* FALLTHRU */
926        case MO_32:
927            tcg_out_vex_modrm(s, OPC_PSHUFD, r, 0, a);
928            /* imm8 operand: all output lanes selected from input lane 0.  */
929            tcg_out8(s, 0);
930            break;
931        case MO_64:
932            tcg_out_vex_modrm(s, OPC_PUNPCKLQDQ, r, a, a);
933            break;
934        default:
935            g_assert_not_reached();
936        }
937    }
938    return true;
939}
940
941static bool tcg_out_dupm_vec(TCGContext *s, TCGType type, unsigned vece,
942                             TCGReg r, TCGReg base, intptr_t offset)
943{
944    if (have_avx2) {
945        int vex_l = (type == TCG_TYPE_V256 ? P_VEXL : 0);
946        tcg_out_vex_modrm_offset(s, avx2_dup_insn[vece] + vex_l,
947                                 r, 0, base, offset);
948    } else {
949        switch (vece) {
950        case MO_64:
951            tcg_out_vex_modrm_offset(s, OPC_MOVDDUP, r, 0, base, offset);
952            break;
953        case MO_32:
954            tcg_out_vex_modrm_offset(s, OPC_VBROADCASTSS, r, 0, base, offset);
955            break;
956        case MO_16:
957            tcg_out_vex_modrm_offset(s, OPC_VPINSRW, r, r, base, offset);
958            tcg_out8(s, 0); /* imm8 */
959            tcg_out_dup_vec(s, type, vece, r, r);
960            break;
961        case MO_8:
962            tcg_out_vex_modrm_offset(s, OPC_VPINSRB, r, r, base, offset);
963            tcg_out8(s, 0); /* imm8 */
964            tcg_out_dup_vec(s, type, vece, r, r);
965            break;
966        default:
967            g_assert_not_reached();
968        }
969    }
970    return true;
971}
972
973static void tcg_out_dupi_vec(TCGContext *s, TCGType type, unsigned vece,
974                             TCGReg ret, int64_t arg)
975{
976    int vex_l = (type == TCG_TYPE_V256 ? P_VEXL : 0);
977
978    if (arg == 0) {
979        tcg_out_vex_modrm(s, OPC_PXOR, ret, ret, ret);
980        return;
981    }
982    if (arg == -1) {
983        tcg_out_vex_modrm(s, OPC_PCMPEQB + vex_l, ret, ret, ret);
984        return;
985    }
986
987    if (TCG_TARGET_REG_BITS == 32 && vece < MO_64) {
988        if (have_avx2) {
989            tcg_out_vex_modrm_pool(s, OPC_VPBROADCASTD + vex_l, ret);
990        } else {
991            tcg_out_vex_modrm_pool(s, OPC_VBROADCASTSS, ret);
992        }
993        new_pool_label(s, arg, R_386_32, s->code_ptr - 4, 0);
994    } else {
995        if (type == TCG_TYPE_V64) {
996            tcg_out_vex_modrm_pool(s, OPC_MOVQ_VqWq, ret);
997        } else if (have_avx2) {
998            tcg_out_vex_modrm_pool(s, OPC_VPBROADCASTQ + vex_l, ret);
999        } else {
1000            tcg_out_vex_modrm_pool(s, OPC_MOVDDUP, ret);
1001        }
1002        if (TCG_TARGET_REG_BITS == 64) {
1003            new_pool_label(s, arg, R_386_PC32, s->code_ptr - 4, -4);
1004        } else {
1005            new_pool_l2(s, R_386_32, s->code_ptr - 4, 0, arg, arg >> 32);
1006        }
1007    }
1008}
1009
1010static void tcg_out_movi_vec(TCGContext *s, TCGType type,
1011                             TCGReg ret, tcg_target_long arg)
1012{
1013    if (arg == 0) {
1014        tcg_out_vex_modrm(s, OPC_PXOR, ret, ret, ret);
1015        return;
1016    }
1017    if (arg == -1) {
1018        tcg_out_vex_modrm(s, OPC_PCMPEQB, ret, ret, ret);
1019        return;
1020    }
1021
1022    int rexw = (type == TCG_TYPE_I32 ? 0 : P_REXW);
1023    tcg_out_vex_modrm_pool(s, OPC_MOVD_VyEy + rexw, ret);
1024    if (TCG_TARGET_REG_BITS == 64) {
1025        new_pool_label(s, arg, R_386_PC32, s->code_ptr - 4, -4);
1026    } else {
1027        new_pool_label(s, arg, R_386_32, s->code_ptr - 4, 0);
1028    }
1029}
1030
1031static void tcg_out_movi_int(TCGContext *s, TCGType type,
1032                             TCGReg ret, tcg_target_long arg)
1033{
1034    tcg_target_long diff;
1035
1036    if (arg == 0) {
1037        tgen_arithr(s, ARITH_XOR, ret, ret);
1038        return;
1039    }
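    /* A value that fits in 32 bits zero-extended (or any 32-bit type) can use
       the plain movl encoding, since 32-bit moves zero-extend in 64-bit mode.  */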
1040    if (arg == (uint32_t)arg || type == TCG_TYPE_I32) {
1041        tcg_out_opc(s, OPC_MOVL_Iv + LOWREGMASK(ret), 0, ret, 0);
1042        tcg_out32(s, arg);
1043        return;
1044    }
1045    if (arg == (int32_t)arg) {
1046        tcg_out_modrm(s, OPC_MOVL_EvIz + P_REXW, 0, ret);
1047        tcg_out32(s, arg);
1048        return;
1049    }
1050
1051    /* Try a 7 byte pc-relative lea before the 10 byte movq.  */
1052    diff = tcg_pcrel_diff(s, (const void *)arg) - 7;
1053    if (diff == (int32_t)diff) {
1054        tcg_out_opc(s, OPC_LEA | P_REXW, ret, 0, 0);
1055        tcg_out8(s, (LOWREGMASK(ret) << 3) | 5);
1056        tcg_out32(s, diff);
1057        return;
1058    }
1059
1060    tcg_out_opc(s, OPC_MOVL_Iv + P_REXW + LOWREGMASK(ret), 0, ret, 0);
1061    tcg_out64(s, arg);
1062}
1063
1064static void tcg_out_movi(TCGContext *s, TCGType type,
1065                         TCGReg ret, tcg_target_long arg)
1066{
1067    switch (type) {
1068    case TCG_TYPE_I32:
1069#if TCG_TARGET_REG_BITS == 64
1070    case TCG_TYPE_I64:
1071#endif
1072        if (ret < 16) {
1073            tcg_out_movi_int(s, type, ret, arg);
1074        } else {
1075            tcg_out_movi_vec(s, type, ret, arg);
1076        }
1077        break;
1078    default:
1079        g_assert_not_reached();
1080    }
1081}
1082
1083static bool tcg_out_xchg(TCGContext *s, TCGType type, TCGReg r1, TCGReg r2)
1084{
1085    int rexw = type == TCG_TYPE_I32 ? 0 : P_REXW;
1086    tcg_out_modrm(s, OPC_XCHG_EvGv + rexw, r1, r2);
1087    return true;
1088}
1089
1090static void tcg_out_addi_ptr(TCGContext *s, TCGReg rd, TCGReg rs,
1091                             tcg_target_long imm)
1092{
1093    /* This function is only used for passing structs by reference. */
1094    tcg_debug_assert(imm == (int32_t)imm);
1095    tcg_out_modrm_offset(s, OPC_LEA | P_REXW, rd, rs, imm);
1096}
1097
1098static inline void tcg_out_pushi(TCGContext *s, tcg_target_long val)
1099{
1100    if (val == (int8_t)val) {
1101        tcg_out_opc(s, OPC_PUSH_Ib, 0, 0, 0);
1102        tcg_out8(s, val);
1103    } else if (val == (int32_t)val) {
1104        tcg_out_opc(s, OPC_PUSH_Iv, 0, 0, 0);
1105        tcg_out32(s, val);
1106    } else {
1107        g_assert_not_reached();
1108    }
1109}
1110
1111static inline void tcg_out_mb(TCGContext *s, TCGArg a0)
1112{
1113    /* Given the strength of x86 memory ordering, we only need to care about
1114       store-load ordering.  Experimentally, "lock orl $0,0(%esp)" is
1115       faster than "mfence", so don't bother with the sse insn.  */
1116    if (a0 & TCG_MO_ST_LD) {
1117        tcg_out8(s, 0xf0);
1118        tcg_out_modrm_offset(s, OPC_ARITH_EvIb, ARITH_OR, TCG_REG_ESP, 0);
1119        tcg_out8(s, 0);
1120    }
1121}
1122
1123static inline void tcg_out_push(TCGContext *s, int reg)
1124{
1125    tcg_out_opc(s, OPC_PUSH_r32 + LOWREGMASK(reg), 0, reg, 0);
1126}
1127
1128static inline void tcg_out_pop(TCGContext *s, int reg)
1129{
1130    tcg_out_opc(s, OPC_POP_r32 + LOWREGMASK(reg), 0, reg, 0);
1131}
1132
1133static void tcg_out_ld(TCGContext *s, TCGType type, TCGReg ret,
1134                       TCGReg arg1, intptr_t arg2)
1135{
1136    switch (type) {
1137    case TCG_TYPE_I32:
1138        if (ret < 16) {
1139            tcg_out_modrm_offset(s, OPC_MOVL_GvEv, ret, arg1, arg2);
1140        } else {
1141            tcg_out_vex_modrm_offset(s, OPC_MOVD_VyEy, ret, 0, arg1, arg2);
1142        }
1143        break;
1144    case TCG_TYPE_I64:
1145        if (ret < 16) {
1146            tcg_out_modrm_offset(s, OPC_MOVL_GvEv | P_REXW, ret, arg1, arg2);
1147            break;
1148        }
1149        /* FALLTHRU */
1150    case TCG_TYPE_V64:
1151        /* There is no instruction that can validate 8-byte alignment.  */
1152        tcg_debug_assert(ret >= 16);
1153        tcg_out_vex_modrm_offset(s, OPC_MOVQ_VqWq, ret, 0, arg1, arg2);
1154        break;
1155    case TCG_TYPE_V128:
1156        /*
1157         * The gvec infrastructure asserts that v128 vector loads
1158         * and stores use a 16-byte aligned offset.  Validate that the
1159         * final pointer is aligned by using an insn that will SIGSEGV.
1160         */
1161        tcg_debug_assert(ret >= 16);
1162        tcg_out_vex_modrm_offset(s, OPC_MOVDQA_VxWx, ret, 0, arg1, arg2);
1163        break;
1164    case TCG_TYPE_V256:
1165        /*
1166         * The gvec infrastructure only requires 16-byte alignment,
1167         * so here we must use an unaligned load.
1168         */
1169        tcg_debug_assert(ret >= 16);
1170        tcg_out_vex_modrm_offset(s, OPC_MOVDQU_VxWx | P_VEXL,
1171                                 ret, 0, arg1, arg2);
1172        break;
1173    default:
1174        g_assert_not_reached();
1175    }
1176}
1177
1178static void tcg_out_st(TCGContext *s, TCGType type, TCGReg arg,
1179                       TCGReg arg1, intptr_t arg2)
1180{
1181    switch (type) {
1182    case TCG_TYPE_I32:
1183        if (arg < 16) {
1184            tcg_out_modrm_offset(s, OPC_MOVL_EvGv, arg, arg1, arg2);
1185        } else {
1186            tcg_out_vex_modrm_offset(s, OPC_MOVD_EyVy, arg, 0, arg1, arg2);
1187        }
1188        break;
1189    case TCG_TYPE_I64:
1190        if (arg < 16) {
1191            tcg_out_modrm_offset(s, OPC_MOVL_EvGv | P_REXW, arg, arg1, arg2);
1192            break;
1193        }
1194        /* FALLTHRU */
1195    case TCG_TYPE_V64:
1196        /* There is no instruction that can validate 8-byte alignment.  */
1197        tcg_debug_assert(arg >= 16);
1198        tcg_out_vex_modrm_offset(s, OPC_MOVQ_WqVq, arg, 0, arg1, arg2);
1199        break;
1200    case TCG_TYPE_V128:
1201        /*
1202         * The gvec infrastructure asserts that v128 vector loads
1203         * and stores use a 16-byte aligned offset.  Validate that the
1204         * final pointer is aligned by using an insn that will SIGSEGV.
1205         *
1206         * This specific instance is also used by TCG_CALL_RET_BY_VEC,
1207         * for _WIN64, which must have SSE2 but may not have AVX.
1208         */
1209        tcg_debug_assert(arg >= 16);
1210        if (have_avx1) {
1211            tcg_out_vex_modrm_offset(s, OPC_MOVDQA_WxVx, arg, 0, arg1, arg2);
1212        } else {
1213            tcg_out_modrm_offset(s, OPC_MOVDQA_WxVx, arg, arg1, arg2);
1214        }
1215        break;
1216    case TCG_TYPE_V256:
1217        /*
1218         * The gvec infrastructure only requires 16-byte alignment,
1219         * so here we must use an unaligned store.
1220         */
1221        tcg_debug_assert(arg >= 16);
1222        tcg_out_vex_modrm_offset(s, OPC_MOVDQU_WxVx | P_VEXL,
1223                                 arg, 0, arg1, arg2);
1224        break;
1225    default:
1226        g_assert_not_reached();
1227    }
1228}
1229
1230static bool tcg_out_sti(TCGContext *s, TCGType type, TCGArg val,
1231                        TCGReg base, intptr_t ofs)
1232{
1233    int rexw = 0;
1234    if (TCG_TARGET_REG_BITS == 64 && type == TCG_TYPE_I64) {
1235        if (val != (int32_t)val) {
1236            return false;
1237        }
1238        rexw = P_REXW;
1239    } else if (type != TCG_TYPE_I32) {
1240        return false;
1241    }
1242    tcg_out_modrm_offset(s, OPC_MOVL_EvIz | rexw, 0, base, ofs);
1243    tcg_out32(s, val);
1244    return true;
1245}
1246
1247static void tcg_out_shifti(TCGContext *s, int subopc, int reg, int count)
1248{
1249    /* Propagate an opcode prefix, such as P_DATA16.  */
1250    int ext = subopc & ~0x7;
1251    subopc &= 0x7;
1252
1253    if (count == 1) {
1254        tcg_out_modrm(s, OPC_SHIFT_1 + ext, subopc, reg);
1255    } else {
1256        tcg_out_modrm(s, OPC_SHIFT_Ib + ext, subopc, reg);
1257        tcg_out8(s, count);
1258    }
1259}
1260
1261static inline void tcg_out_bswap32(TCGContext *s, int reg)
1262{
1263    tcg_out_opc(s, OPC_BSWAP + LOWREGMASK(reg), 0, reg, 0);
1264}
1265
1266static inline void tcg_out_rolw_8(TCGContext *s, int reg)
1267{
1268    tcg_out_shifti(s, SHIFT_ROL + P_DATA16, reg, 8);
1269}
1270
1271static void tcg_out_ext8u(TCGContext *s, TCGReg dest, TCGReg src)
1272{
1273    /* movzbl */
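    /* Without REX, byte-register encodings 4-7 mean %ah/%ch/%dh/%bh, so only
       %eax..%ebx have addressable low bytes on i386.  */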
1274    tcg_debug_assert(src < 4 || TCG_TARGET_REG_BITS == 64);
1275    tcg_out_modrm(s, OPC_MOVZBL + P_REXB_RM, dest, src);
1276}
1277
1278static void tcg_out_ext8s(TCGContext *s, TCGType type, TCGReg dest, TCGReg src)
1279{
1280    int rexw = type == TCG_TYPE_I32 ? 0 : P_REXW;
1281    /* movsbl */
1282    tcg_debug_assert(src < 4 || TCG_TARGET_REG_BITS == 64);
1283    tcg_out_modrm(s, OPC_MOVSBL + P_REXB_RM + rexw, dest, src);
1284}
1285
1286static void tcg_out_ext16u(TCGContext *s, TCGReg dest, TCGReg src)
1287{
1288    /* movzwl */
1289    tcg_out_modrm(s, OPC_MOVZWL, dest, src);
1290}
1291
1292static void tcg_out_ext16s(TCGContext *s, TCGType type, TCGReg dest, TCGReg src)
1293{
1294    int rexw = type == TCG_TYPE_I32 ? 0 : P_REXW;
1295    /* movsw[lq] */
1296    tcg_out_modrm(s, OPC_MOVSWL + rexw, dest, src);
1297}
1298
1299static void tcg_out_ext32u(TCGContext *s, TCGReg dest, TCGReg src)
1300{
1301    /* 32-bit mov zero extends.  */
1302    tcg_out_modrm(s, OPC_MOVL_GvEv, dest, src);
1303}
1304
1305static void tcg_out_ext32s(TCGContext *s, TCGReg dest, TCGReg src)
1306{
1307    tcg_debug_assert(TCG_TARGET_REG_BITS == 64);
1308    tcg_out_modrm(s, OPC_MOVSLQ, dest, src);
1309}
1310
1311static void tcg_out_exts_i32_i64(TCGContext *s, TCGReg dest, TCGReg src)
1312{
1313    tcg_out_ext32s(s, dest, src);
1314}
1315
1316static void tcg_out_extu_i32_i64(TCGContext *s, TCGReg dest, TCGReg src)
1317{
1318    if (dest != src) {
1319        tcg_out_ext32u(s, dest, src);
1320    }
1321}
1322
1323static void tcg_out_extrl_i64_i32(TCGContext *s, TCGReg dest, TCGReg src)
1324{
1325    tcg_out_ext32u(s, dest, src);
1326}
1327
1328static inline void tcg_out_bswap64(TCGContext *s, int reg)
1329{
1330    tcg_out_opc(s, OPC_BSWAP + P_REXW + LOWREGMASK(reg), 0, reg, 0);
1331}
1332
1333static void tgen_arithi(TCGContext *s, int c, int r0,
1334                        tcg_target_long val, int cf)
1335{
1336    int rexw = 0;
1337
1338    if (TCG_TARGET_REG_BITS == 64) {
1339        rexw = c & -8;
1340        c &= 7;
1341    }
1342
1343    /* ??? While INC is 2 bytes shorter than ADDL $1, it also induces
1344       partial flags update stalls on Pentium4 and is not recommended
1345       by current Intel optimization manuals.  */
1346    if (!cf && (c == ARITH_ADD || c == ARITH_SUB) && (val == 1 || val == -1)) {
1347        int is_inc = (c == ARITH_ADD) ^ (val < 0);
1348        if (TCG_TARGET_REG_BITS == 64) {
1349            /* The single-byte increment encodings are re-tasked as the
1350               REX prefixes.  Use the MODRM encoding.  */
1351            tcg_out_modrm(s, OPC_GRP5 + rexw,
1352                          (is_inc ? EXT5_INC_Ev : EXT5_DEC_Ev), r0);
1353        } else {
1354            tcg_out8(s, (is_inc ? OPC_INC_r32 : OPC_DEC_r32) + r0);
1355        }
1356        return;
1357    }
1358
1359    if (c == ARITH_AND) {
1360        if (TCG_TARGET_REG_BITS == 64) {
1361            if (val == 0xffffffffu) {
1362                tcg_out_ext32u(s, r0, r0);
1363                return;
1364            }
1365            if (val == (uint32_t)val) {
1366                /* AND with no high bits set can use a 32-bit operation.  */
1367                rexw = 0;
1368            }
1369        }
1370        if (val == 0xffu && (r0 < 4 || TCG_TARGET_REG_BITS == 64)) {
1371            tcg_out_ext8u(s, r0, r0);
1372            return;
1373        }
1374        if (val == 0xffffu) {
1375            tcg_out_ext16u(s, r0, r0);
1376            return;
1377        }
1378    }
1379
1380    if (val == (int8_t)val) {
1381        tcg_out_modrm(s, OPC_ARITH_EvIb + rexw, c, r0);
1382        tcg_out8(s, val);
1383        return;
1384    }
1385    if (rexw == 0 || val == (int32_t)val) {
1386        tcg_out_modrm(s, OPC_ARITH_EvIz + rexw, c, r0);
1387        tcg_out32(s, val);
1388        return;
1389    }
1390
1391    g_assert_not_reached();
1392}
1393
1394static void tcg_out_addi(TCGContext *s, int reg, tcg_target_long val)
1395{
1396    if (val != 0) {
1397        tgen_arithi(s, ARITH_ADD + P_REXW, reg, val, 0);
1398    }
1399}
1400
1401/* Set SMALL to force a short forward branch.  */
1402static void tcg_out_jxx(TCGContext *s, int opc, TCGLabel *l, bool small)
1403{
1404    int32_t val, val1;
1405
1406    if (l->has_value) {
1407        val = tcg_pcrel_diff(s, l->u.value_ptr);
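        /* Branch displacements are relative to the end of the insn:
           2 bytes for the short forms, 5 for jmp rel32, 6 for jcc rel32.  */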
1408        val1 = val - 2;
1409        if ((int8_t)val1 == val1) {
1410            if (opc == -1) {
1411                tcg_out8(s, OPC_JMP_short);
1412            } else {
1413                tcg_out8(s, OPC_JCC_short + opc);
1414            }
1415            tcg_out8(s, val1);
1416        } else {
1417            tcg_debug_assert(!small);
1418            if (opc == -1) {
1419                tcg_out8(s, OPC_JMP_long);
1420                tcg_out32(s, val - 5);
1421            } else {
1422                tcg_out_opc(s, OPC_JCC_long + opc, 0, 0, 0);
1423                tcg_out32(s, val - 6);
1424            }
1425        }
1426    } else if (small) {
1427        if (opc == -1) {
1428            tcg_out8(s, OPC_JMP_short);
1429        } else {
1430            tcg_out8(s, OPC_JCC_short + opc);
1431        }
1432        tcg_out_reloc(s, s->code_ptr, R_386_PC8, l, -1);
1433        s->code_ptr += 1;
1434    } else {
1435        if (opc == -1) {
1436            tcg_out8(s, OPC_JMP_long);
1437        } else {
1438            tcg_out_opc(s, OPC_JCC_long + opc, 0, 0, 0);
1439        }
1440        tcg_out_reloc(s, s->code_ptr, R_386_PC32, l, -4);
1441        s->code_ptr += 4;
1442    }
1443}
1444
1445static void tcg_out_cmp(TCGContext *s, TCGArg arg1, TCGArg arg2,
1446                        int const_arg2, int rexw)
1447{
1448    if (const_arg2) {
1449        if (arg2 == 0) {
1450            /* test r, r */
1451            tcg_out_modrm(s, OPC_TESTL + rexw, arg1, arg1);
1452        } else {
1453            tgen_arithi(s, ARITH_CMP + rexw, arg1, arg2, 0);
1454        }
1455    } else {
1456        tgen_arithr(s, ARITH_CMP + rexw, arg1, arg2);
1457    }
1458}
1459
1460static void tcg_out_brcond32(TCGContext *s, TCGCond cond,
1461                             TCGArg arg1, TCGArg arg2, int const_arg2,
1462                             TCGLabel *label, int small)
1463{
1464    tcg_out_cmp(s, arg1, arg2, const_arg2, 0);
1465    tcg_out_jxx(s, tcg_cond_to_jcc[cond], label, small);
1466}
1467
1468#if TCG_TARGET_REG_BITS == 64
1469static void tcg_out_brcond64(TCGContext *s, TCGCond cond,
1470                             TCGArg arg1, TCGArg arg2, int const_arg2,
1471                             TCGLabel *label, int small)
1472{
1473    tcg_out_cmp(s, arg1, arg2, const_arg2, P_REXW);
1474    tcg_out_jxx(s, tcg_cond_to_jcc[cond], label, small);
1475}
1476#else
1477/* XXX: we implement it at the target level to avoid having to
1478   handle temporaries that live across basic blocks */
1479static void tcg_out_brcond2(TCGContext *s, const TCGArg *args,
1480                            const int *const_args, int small)
1481{
1482    TCGLabel *label_next = gen_new_label();
1483    TCGLabel *label_this = arg_label(args[5]);
1484
1485    switch(args[4]) {
1486    case TCG_COND_EQ:
1487        tcg_out_brcond32(s, TCG_COND_NE, args[0], args[2], const_args[2],
1488                         label_next, 1);
1489        tcg_out_brcond32(s, TCG_COND_EQ, args[1], args[3], const_args[3],
1490                         label_this, small);
1491        break;
1492    case TCG_COND_NE:
1493        tcg_out_brcond32(s, TCG_COND_NE, args[0], args[2], const_args[2],
1494                         label_this, small);
1495        tcg_out_brcond32(s, TCG_COND_NE, args[1], args[3], const_args[3],
1496                         label_this, small);
1497        break;
1498    case TCG_COND_LT:
1499        tcg_out_brcond32(s, TCG_COND_LT, args[1], args[3], const_args[3],
1500                         label_this, small);
1501        tcg_out_jxx(s, JCC_JNE, label_next, 1);
1502        tcg_out_brcond32(s, TCG_COND_LTU, args[0], args[2], const_args[2],
1503                         label_this, small);
1504        break;
1505    case TCG_COND_LE:
1506        tcg_out_brcond32(s, TCG_COND_LT, args[1], args[3], const_args[3],
1507                         label_this, small);
1508        tcg_out_jxx(s, JCC_JNE, label_next, 1);
1509        tcg_out_brcond32(s, TCG_COND_LEU, args[0], args[2], const_args[2],
1510                         label_this, small);
1511        break;
1512    case TCG_COND_GT:
1513        tcg_out_brcond32(s, TCG_COND_GT, args[1], args[3], const_args[3],
1514                         label_this, small);
1515        tcg_out_jxx(s, JCC_JNE, label_next, 1);
1516        tcg_out_brcond32(s, TCG_COND_GTU, args[0], args[2], const_args[2],
1517                         label_this, small);
1518        break;
1519    case TCG_COND_GE:
1520        tcg_out_brcond32(s, TCG_COND_GT, args[1], args[3], const_args[3],
1521                         label_this, small);
1522        tcg_out_jxx(s, JCC_JNE, label_next, 1);
1523        tcg_out_brcond32(s, TCG_COND_GEU, args[0], args[2], const_args[2],
1524                         label_this, small);
1525        break;
1526    case TCG_COND_LTU:
1527        tcg_out_brcond32(s, TCG_COND_LTU, args[1], args[3], const_args[3],
1528                         label_this, small);
1529        tcg_out_jxx(s, JCC_JNE, label_next, 1);
1530        tcg_out_brcond32(s, TCG_COND_LTU, args[0], args[2], const_args[2],
1531                         label_this, small);
1532        break;
1533    case TCG_COND_LEU:
1534        tcg_out_brcond32(s, TCG_COND_LTU, args[1], args[3], const_args[3],
1535                         label_this, small);
1536        tcg_out_jxx(s, JCC_JNE, label_next, 1);
1537        tcg_out_brcond32(s, TCG_COND_LEU, args[0], args[2], const_args[2],
1538                         label_this, small);
1539        break;
1540    case TCG_COND_GTU:
1541        tcg_out_brcond32(s, TCG_COND_GTU, args[1], args[3], const_args[3],
1542                         label_this, small);
1543        tcg_out_jxx(s, JCC_JNE, label_next, 1);
1544        tcg_out_brcond32(s, TCG_COND_GTU, args[0], args[2], const_args[2],
1545                         label_this, small);
1546        break;
1547    case TCG_COND_GEU:
1548        tcg_out_brcond32(s, TCG_COND_GTU, args[1], args[3], const_args[3],
1549                         label_this, small);
1550        tcg_out_jxx(s, JCC_JNE, label_next, 1);
1551        tcg_out_brcond32(s, TCG_COND_GEU, args[0], args[2], const_args[2],
1552                         label_this, small);
1553        break;
1554    default:
1555        g_assert_not_reached();
1556    }
1557    tcg_out_label(s, label_next);
1558}
1559#endif
1560
1561static void tcg_out_setcond32(TCGContext *s, TCGCond cond, TCGArg dest,
1562                              TCGArg arg1, TCGArg arg2, int const_arg2)
1563{
1564    tcg_out_cmp(s, arg1, arg2, const_arg2, 0);
1565    tcg_out_modrm(s, OPC_SETCC | tcg_cond_to_jcc[cond], 0, dest);
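    /* SETcc writes only the low byte; zero-extend to get a clean 0/1.  */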
1566    tcg_out_ext8u(s, dest, dest);
1567}
1568
1569#if TCG_TARGET_REG_BITS == 64
1570static void tcg_out_setcond64(TCGContext *s, TCGCond cond, TCGArg dest,
1571                              TCGArg arg1, TCGArg arg2, int const_arg2)
1572{
1573    tcg_out_cmp(s, arg1, arg2, const_arg2, P_REXW);
1574    tcg_out_modrm(s, OPC_SETCC | tcg_cond_to_jcc[cond], 0, dest);
1575    tcg_out_ext8u(s, dest, dest);
1576}
1577#else
1578static void tcg_out_setcond2(TCGContext *s, const TCGArg *args,
1579                             const int *const_args)
1580{
1581    TCGArg new_args[6];
1582    TCGLabel *label_true, *label_over;
1583
1584    memcpy(new_args, args+1, 5*sizeof(TCGArg));
1585
1586    if (args[0] == args[1] || args[0] == args[2]
1587        || (!const_args[3] && args[0] == args[3])
1588        || (!const_args[4] && args[0] == args[4])) {
1589        /* When the destination overlaps with one of the argument
1590           registers, don't do anything tricky.  */
1591        label_true = gen_new_label();
1592        label_over = gen_new_label();
1593
1594        new_args[5] = label_arg(label_true);
1595        tcg_out_brcond2(s, new_args, const_args+1, 1);
1596
1597        tcg_out_movi(s, TCG_TYPE_I32, args[0], 0);
1598        tcg_out_jxx(s, JCC_JMP, label_over, 1);
1599        tcg_out_label(s, label_true);
1600
1601        tcg_out_movi(s, TCG_TYPE_I32, args[0], 1);
1602        tcg_out_label(s, label_over);
1603    } else {
1604        /* When the destination does not overlap one of the arguments,
1605           clear the destination first, jump if cond false, and emit an
1606           increment in the true case.  This results in smaller code.  */
1607
1608        tcg_out_movi(s, TCG_TYPE_I32, args[0], 0);
1609
1610        label_over = gen_new_label();
1611        new_args[4] = tcg_invert_cond(new_args[4]);
1612        new_args[5] = label_arg(label_over);
1613        tcg_out_brcond2(s, new_args, const_args+1, 1);
1614
1615        tgen_arithi(s, ARITH_ADD, args[0], 1, 0);
1616        tcg_out_label(s, label_over);
1617    }
1618}
1619#endif
1620
1621static void tcg_out_cmov(TCGContext *s, TCGCond cond, int rexw,
1622                         TCGReg dest, TCGReg v1)
1623{
1624    if (have_cmov) {
1625        tcg_out_modrm(s, OPC_CMOVCC | tcg_cond_to_jcc[cond] | rexw, dest, v1);
1626    } else {
1627        TCGLabel *over = gen_new_label();
1628        tcg_out_jxx(s, tcg_cond_to_jcc[tcg_invert_cond(cond)], over, 1);
1629        tcg_out_mov(s, TCG_TYPE_I32, dest, v1);
1630        tcg_out_label(s, over);
1631    }
1632}
1633
1634static void tcg_out_movcond32(TCGContext *s, TCGCond cond, TCGReg dest,
1635                              TCGReg c1, TCGArg c2, int const_c2,
1636                              TCGReg v1)
1637{
1638    tcg_out_cmp(s, c1, c2, const_c2, 0);
1639    tcg_out_cmov(s, cond, 0, dest, v1);
1640}
1641
1642#if TCG_TARGET_REG_BITS == 64
1643static void tcg_out_movcond64(TCGContext *s, TCGCond cond, TCGReg dest,
1644                              TCGReg c1, TCGArg c2, int const_c2,
1645                              TCGReg v1)
1646{
1647    tcg_out_cmp(s, c1, c2, const_c2, P_REXW);
1648    tcg_out_cmov(s, cond, P_REXW, dest, v1);
1649}
1650#endif
1651
1652static void tcg_out_ctz(TCGContext *s, int rexw, TCGReg dest, TCGReg arg1,
1653                        TCGArg arg2, bool const_a2)
1654{
1655    if (have_bmi1) {
1656        tcg_out_modrm(s, OPC_TZCNT + rexw, dest, arg1);
1657        if (const_a2) {
1658            tcg_debug_assert(arg2 == (rexw ? 64 : 32));
1659        } else {
1660            tcg_debug_assert(dest != arg2);
1661            tcg_out_cmov(s, TCG_COND_LTU, rexw, dest, arg2);
1662        }
1663    } else {
1664        tcg_debug_assert(dest != arg2);
1665        tcg_out_modrm(s, OPC_BSF + rexw, dest, arg1);
1666        tcg_out_cmov(s, TCG_COND_EQ, rexw, dest, arg2);
1667    }
1668}
1669
1670static void tcg_out_clz(TCGContext *s, int rexw, TCGReg dest, TCGReg arg1,
1671                        TCGArg arg2, bool const_a2)
1672{
1673    if (have_lzcnt) {
1674        tcg_out_modrm(s, OPC_LZCNT + rexw, dest, arg1);
1675        if (const_a2) {
1676            tcg_debug_assert(arg2 == (rexw ? 64 : 32));
1677        } else {
1678            tcg_debug_assert(dest != arg2);
1679            tcg_out_cmov(s, TCG_COND_LTU, rexw, dest, arg2);
1680        }
1681    } else {
1682        tcg_debug_assert(!const_a2);
1683        tcg_debug_assert(dest != arg1);
1684        tcg_debug_assert(dest != arg2);
1685
1686        /* Recall that the output of BSR is the index not the count.  */
1687        tcg_out_modrm(s, OPC_BSR + rexw, dest, arg1);
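        /* For nonzero input, clz(x) = (bits - 1) - bsr(x); since bsr(x)
           is at most bits - 1, the subtraction reduces to the xor below.  */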
1688        tgen_arithi(s, ARITH_XOR + rexw, dest, rexw ? 63 : 31, 0);
1689
1690        /* Since we have destroyed the flags from BSR, we have to re-test.  */
1691        tcg_out_cmp(s, arg1, 0, 1, rexw);
1692        tcg_out_cmov(s, TCG_COND_EQ, rexw, dest, arg2);
1693    }
1694}
1695
1696static void tcg_out_branch(TCGContext *s, int call, const tcg_insn_unit *dest)
1697{
1698    intptr_t disp = tcg_pcrel_diff(s, dest) - 5;
1699
1700    if (disp == (int32_t)disp) {
1701        tcg_out_opc(s, call ? OPC_CALL_Jz : OPC_JMP_long, 0, 0, 0);
1702        tcg_out32(s, disp);
1703    } else {
1704        /* rip-relative addressing into the constant pool.
1705           This is 6 + 8 = 14 bytes, as compared to using an
1706           immediate load 10 + 6 = 16 bytes, plus we may
1707           be able to re-use the pool constant for more calls.  */
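        /* On x86_64 this assembles to roughly "call/jmp *disp32(%rip)",
           with new_pool_label arranging for the rip-relative displacement
           to resolve to an 8-byte pool entry holding @dest.  */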
1708        tcg_out_opc(s, OPC_GRP5, 0, 0, 0);
1709        tcg_out8(s, (call ? EXT5_CALLN_Ev : EXT5_JMPN_Ev) << 3 | 5);
1710        new_pool_label(s, (uintptr_t)dest, R_386_PC32, s->code_ptr, -4);
1711        tcg_out32(s, 0);
1712    }
1713}
1714
1715static void tcg_out_call(TCGContext *s, const tcg_insn_unit *dest,
1716                         const TCGHelperInfo *info)
1717{
1718    tcg_out_branch(s, 1, dest);
1719
1720#ifndef _WIN32
1721    if (TCG_TARGET_REG_BITS == 32 && info->out_kind == TCG_CALL_RET_BY_REF) {
1722        /*
1723         * The sysv i386 abi for struct return places a reference as the
1724         * first argument on the stack, and the callee pops that argument
1725         * as part of its return.  Since we want to retain the aligned stack
1726         * pointer for the callee, we do not want to actually push that
1727         * argument before the call but rely on the normal store to the
1728         * stack slot.  But we do need to compensate for the pop in order
1729         * to reset our correct stack pointer value.
1730         * Pushing a garbage value back onto the stack is quickest.
1731         */
1732        tcg_out_push(s, TCG_REG_EAX);
1733    }
1734#endif
1735}
1736
1737static void tcg_out_jmp(TCGContext *s, const tcg_insn_unit *dest)
1738{
1739    tcg_out_branch(s, 0, dest);
1740}
1741
1742static void tcg_out_nopn(TCGContext *s, int n)
1743{
1744    int i;
1745    /* Emit 1 or 2 operand size prefixes for the standard one byte nop,
1746     * "xchg %eax,%eax", forming "xchg %ax,%ax". All cores accept the
1747     * duplicate prefix, and all of the interesting recent cores can
1748     * decode and discard the duplicates in a single cycle.
1749     */
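    /* e.g. n == 3 emits the bytes 66 66 90, a single 3-byte nop.  */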
1750    tcg_debug_assert(n >= 1);
1751    for (i = 1; i < n; ++i) {
1752        tcg_out8(s, 0x66);
1753    }
1754    tcg_out8(s, 0x90);
1755}
1756
1757/* Test register R vs immediate bits I, setting Z flag for EQ/NE. */
1758static void __attribute__((unused))
1759tcg_out_testi(TCGContext *s, TCGReg r, uint32_t i)
1760{
1761    /*
1762     * This is used for testing alignment, so we can usually use testb.
1763     * For i686, we have to use testl for %esi/%edi.
1764     */
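    /* Without a REX prefix only %al/%cl/%dl/%bl are encodable as byte
       operands, hence the "r < 4" restriction on 32-bit hosts.  */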
1765    if (i <= 0xff && (TCG_TARGET_REG_BITS == 64 || r < 4)) {
1766        tcg_out_modrm(s, OPC_GRP3_Eb | P_REXB_RM, EXT3_TESTi, r);
1767        tcg_out8(s, i);
1768    } else {
1769        tcg_out_modrm(s, OPC_GRP3_Ev, EXT3_TESTi, r);
1770        tcg_out32(s, i);
1771    }
1772}
1773
1774typedef struct {
1775    TCGReg base;
1776    int index;
1777    int ofs;
1778    int seg;
1779    TCGAtomAlign aa;
1780} HostAddress;
1781
1782bool tcg_target_has_memory_bswap(MemOp memop)
1783{
1784    return have_movbe;
1785}
1786
1787/*
1788 * Because i686 has no register parameters and because x86_64 has xchg
1789 * to handle addr/data register overlap, we have placed all input arguments
1790 * before we might need a scratch reg.
1791 *
1792 * Even then, a scratch is only needed for l->raddr.  Rather than expose
1793 * a general-purpose scratch when we don't actually know it's available,
1794 * use the ra_gen hook to load into RAX if needed.
1795 */
1796#if TCG_TARGET_REG_BITS == 64
1797static TCGReg ldst_ra_gen(TCGContext *s, const TCGLabelQemuLdst *l, int arg)
1798{
1799    if (arg < 0) {
1800        arg = TCG_REG_RAX;
1801    }
1802    tcg_out_movi(s, TCG_TYPE_PTR, arg, (uintptr_t)l->raddr);
1803    return arg;
1804}
1805static const TCGLdstHelperParam ldst_helper_param = {
1806    .ra_gen = ldst_ra_gen
1807};
1808#else
1809static const TCGLdstHelperParam ldst_helper_param = { };
1810#endif
1811
1812/*
1813 * Generate code for the slow path for a load at the end of block
1814 */
1815static bool tcg_out_qemu_ld_slow_path(TCGContext *s, TCGLabelQemuLdst *l)
1816{
1817    MemOp opc = get_memop(l->oi);
1818    tcg_insn_unit **label_ptr = &l->label_ptr[0];
1819
1820    /* resolve label address */
1821    tcg_patch32(label_ptr[0], s->code_ptr - label_ptr[0] - 4);
1822    if (label_ptr[1]) {
1823        tcg_patch32(label_ptr[1], s->code_ptr - label_ptr[1] - 4);
1824    }
1825
1826    tcg_out_ld_helper_args(s, l, &ldst_helper_param);
1827    tcg_out_branch(s, 1, qemu_ld_helpers[opc & MO_SIZE]);
1828    tcg_out_ld_helper_ret(s, l, false, &ldst_helper_param);
1829
1830    tcg_out_jmp(s, l->raddr);
1831    return true;
1832}
1833
1834/*
1835 * Generate code for the slow path for a store at the end of block
1836 */
1837static bool tcg_out_qemu_st_slow_path(TCGContext *s, TCGLabelQemuLdst *l)
1838{
1839    MemOp opc = get_memop(l->oi);
1840    tcg_insn_unit **label_ptr = &l->label_ptr[0];
1841
1842    /* resolve label address */
1843    tcg_patch32(label_ptr[0], s->code_ptr - label_ptr[0] - 4);
1844    if (label_ptr[1]) {
1845        tcg_patch32(label_ptr[1], s->code_ptr - label_ptr[1] - 4);
1846    }
1847
1848    tcg_out_st_helper_args(s, l, &ldst_helper_param);
1849    tcg_out_branch(s, 1, qemu_st_helpers[opc & MO_SIZE]);
1850
1851    tcg_out_jmp(s, l->raddr);
1852    return true;
1853}
1854
1855#ifndef CONFIG_SOFTMMU
1856static HostAddress x86_guest_base = {
1857    .index = -1
1858};
1859
1860#if defined(__x86_64__) && defined(__linux__)
1861# include <asm/prctl.h>
1862# include <sys/prctl.h>
1863int arch_prctl(int code, unsigned long addr);
1864static inline int setup_guest_base_seg(void)
1865{
1866    if (arch_prctl(ARCH_SET_GS, guest_base) == 0) {
1867        return P_GS;
1868    }
1869    return 0;
1870}
1871#elif defined(__x86_64__) && \
1872      (defined (__FreeBSD__) || defined (__FreeBSD_kernel__))
1873# include <machine/sysarch.h>
1874static inline int setup_guest_base_seg(void)
1875{
1876    if (sysarch(AMD64_SET_GSBASE, &guest_base) == 0) {
1877        return P_GS;
1878    }
1879    return 0;
1880}
1881#else
1882static inline int setup_guest_base_seg(void)
1883{
1884    return 0;
1885}
1886#endif /* setup_guest_base_seg */
1887#endif /* !SOFTMMU */
1888
1889/*
1890 * For softmmu, perform the TLB load and compare.
1891 * For useronly, perform any required alignment tests.
1892 * In both cases, return a TCGLabelQemuLdst structure if the slow path
1893 * is required and fill in @h with the host address for the fast path.
1894 */
1895static TCGLabelQemuLdst *prepare_host_addr(TCGContext *s, HostAddress *h,
1896                                           TCGReg addrlo, TCGReg addrhi,
1897                                           MemOpIdx oi, bool is_ld)
1898{
1899    TCGLabelQemuLdst *ldst = NULL;
1900    MemOp opc = get_memop(oi);
1901    unsigned a_mask;
1902
1903#ifdef CONFIG_SOFTMMU
1904    h->index = TCG_REG_L0;
1905    h->ofs = 0;
1906    h->seg = 0;
1907#else
1908    *h = x86_guest_base;
1909#endif
1910    h->base = addrlo;
1911    h->aa = atom_and_align_for_opc(s, opc, MO_ATOM_IFALIGN, false);
1912    a_mask = (1 << h->aa.align) - 1;
1913
1914#ifdef CONFIG_SOFTMMU
1915    int cmp_ofs = is_ld ? offsetof(CPUTLBEntry, addr_read)
1916                        : offsetof(CPUTLBEntry, addr_write);
1917    TCGType ttype = TCG_TYPE_I32;
1918    TCGType tlbtype = TCG_TYPE_I32;
1919    int trexw = 0, hrexw = 0, tlbrexw = 0;
1920    unsigned mem_index = get_mmuidx(oi);
1921    unsigned s_bits = opc & MO_SIZE;
1922    unsigned s_mask = (1 << s_bits) - 1;
1923    int tlb_mask;
1924
1925    ldst = new_ldst_label(s);
1926    ldst->is_ld = is_ld;
1927    ldst->oi = oi;
1928    ldst->addrlo_reg = addrlo;
1929    ldst->addrhi_reg = addrhi;
1930
1931    if (TCG_TARGET_REG_BITS == 64) {
1932        ttype = s->addr_type;
1933        trexw = (ttype == TCG_TYPE_I32 ? 0 : P_REXW);
1934        if (TCG_TYPE_PTR == TCG_TYPE_I64) {
1935            hrexw = P_REXW;
1936            if (s->page_bits + s->tlb_dyn_max_bits > 32) {
1937                tlbtype = TCG_TYPE_I64;
1938                tlbrexw = P_REXW;
1939            }
1940        }
1941    }
1942
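    /*
     * The fast path emitted below is, schematically (64-bit shown, with
     * %env = TCG_AREG0 and %L0/%L1 the two softmmu scratch registers):
     *     mov    addrlo, %L0
     *     shr    $(page_bits - CPU_TLB_ENTRY_BITS), %L0
     *     and    mask(%env), %L0               # index into the TLB
     *     add    table(%env), %L0              # %L0 = &CPUTLBEntry
     *     lea    (s_mask - a_mask)(addrlo), %L1    # plain mov if aligned enough
     *     and    $(page_mask | a_mask), %L1
     *     cmp    addr_read/addr_write(%L0), %L1
     *     jne    slow_path
     *     mov    addend(%L0), %L0              # host address = addrlo + %L0
     */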
1943    tcg_out_mov(s, tlbtype, TCG_REG_L0, addrlo);
1944    tcg_out_shifti(s, SHIFT_SHR + tlbrexw, TCG_REG_L0,
1945                   s->page_bits - CPU_TLB_ENTRY_BITS);
1946
1947    tcg_out_modrm_offset(s, OPC_AND_GvEv + trexw, TCG_REG_L0, TCG_AREG0,
1948                         TLB_MASK_TABLE_OFS(mem_index) +
1949                         offsetof(CPUTLBDescFast, mask));
1950
1951    tcg_out_modrm_offset(s, OPC_ADD_GvEv + hrexw, TCG_REG_L0, TCG_AREG0,
1952                         TLB_MASK_TABLE_OFS(mem_index) +
1953                         offsetof(CPUTLBDescFast, table));
1954
1955    /*
1956     * If the required alignment is at least as large as the access, simply
1957     * copy the address and mask.  For lesser alignments, check that we don't
1958     * cross pages for the complete access.
1959     */
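    /* e.g. for an unaligned 4-byte access (a_mask = 0, s_mask = 3), the lea
       adds 3 so that the masked compare also fails when the access would
       span two pages.  */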
1960    if (a_mask >= s_mask) {
1961        tcg_out_mov(s, ttype, TCG_REG_L1, addrlo);
1962    } else {
1963        tcg_out_modrm_offset(s, OPC_LEA + trexw, TCG_REG_L1,
1964                             addrlo, s_mask - a_mask);
1965    }
1966    tlb_mask = s->page_mask | a_mask;
1967    tgen_arithi(s, ARITH_AND + trexw, TCG_REG_L1, tlb_mask, 0);
1968
1969    /* cmp 0(TCG_REG_L0), TCG_REG_L1 */
1970    tcg_out_modrm_offset(s, OPC_CMP_GvEv + trexw,
1971                         TCG_REG_L1, TCG_REG_L0, cmp_ofs);
1972
1973    /* jne slow_path */
1974    tcg_out_opc(s, OPC_JCC_long + JCC_JNE, 0, 0, 0);
1975    ldst->label_ptr[0] = s->code_ptr;
1976    s->code_ptr += 4;
1977
1978    if (TCG_TARGET_REG_BITS == 32 && s->addr_type == TCG_TYPE_I64) {
1979        /* cmp 4(TCG_REG_L0), addrhi */
1980        tcg_out_modrm_offset(s, OPC_CMP_GvEv, addrhi, TCG_REG_L0, cmp_ofs + 4);
1981
1982        /* jne slow_path */
1983        tcg_out_opc(s, OPC_JCC_long + JCC_JNE, 0, 0, 0);
1984        ldst->label_ptr[1] = s->code_ptr;
1985        s->code_ptr += 4;
1986    }
1987
1988    /* TLB Hit.  */
1989    tcg_out_ld(s, TCG_TYPE_PTR, TCG_REG_L0, TCG_REG_L0,
1990               offsetof(CPUTLBEntry, addend));
1991#else
1992    if (a_mask) {
1993        ldst = new_ldst_label(s);
1994
1995        ldst->is_ld = is_ld;
1996        ldst->oi = oi;
1997        ldst->addrlo_reg = addrlo;
1998        ldst->addrhi_reg = addrhi;
1999
2000        tcg_out_testi(s, addrlo, a_mask);
2001        /* jne slow_path */
2002        tcg_out_opc(s, OPC_JCC_long + JCC_JNE, 0, 0, 0);
2003        ldst->label_ptr[0] = s->code_ptr;
2004        s->code_ptr += 4;
2005    }
2006#endif
2007
2008    return ldst;
2009}
2010
2011static void tcg_out_qemu_ld_direct(TCGContext *s, TCGReg datalo, TCGReg datahi,
2012                                   HostAddress h, TCGType type, MemOp memop)
2013{
2014    bool use_movbe = false;
2015    int rexw = (type == TCG_TYPE_I32 ? 0 : P_REXW);
2016    int movop = OPC_MOVL_GvEv;
2017
2018    /* Do big-endian loads with movbe.  */
2019    if (memop & MO_BSWAP) {
2020        tcg_debug_assert(have_movbe);
2021        use_movbe = true;
2022        movop = OPC_MOVBE_GyMy;
2023    }
2024
2025    switch (memop & MO_SSIZE) {
2026    case MO_UB:
2027        tcg_out_modrm_sib_offset(s, OPC_MOVZBL + h.seg, datalo,
2028                                 h.base, h.index, 0, h.ofs);
2029        break;
2030    case MO_SB:
2031        tcg_out_modrm_sib_offset(s, OPC_MOVSBL + rexw + h.seg, datalo,
2032                                 h.base, h.index, 0, h.ofs);
2033        break;
2034    case MO_UW:
2035        if (use_movbe) {
2036            /* There is no extending movbe; only low 16-bits are modified.  */
2037            if (datalo != h.base && datalo != h.index) {
2038                /* XOR breaks dependency chains.  */
2039                tgen_arithr(s, ARITH_XOR, datalo, datalo);
2040                tcg_out_modrm_sib_offset(s, OPC_MOVBE_GyMy + P_DATA16 + h.seg,
2041                                         datalo, h.base, h.index, 0, h.ofs);
2042            } else {
2043                tcg_out_modrm_sib_offset(s, OPC_MOVBE_GyMy + P_DATA16 + h.seg,
2044                                         datalo, h.base, h.index, 0, h.ofs);
2045                tcg_out_ext16u(s, datalo, datalo);
2046            }
2047        } else {
2048            tcg_out_modrm_sib_offset(s, OPC_MOVZWL + h.seg, datalo,
2049                                     h.base, h.index, 0, h.ofs);
2050        }
2051        break;
2052    case MO_SW:
2053        if (use_movbe) {
2054            tcg_out_modrm_sib_offset(s, OPC_MOVBE_GyMy + P_DATA16 + h.seg,
2055                                     datalo, h.base, h.index, 0, h.ofs);
2056            tcg_out_ext16s(s, type, datalo, datalo);
2057        } else {
2058            tcg_out_modrm_sib_offset(s, OPC_MOVSWL + rexw + h.seg,
2059                                     datalo, h.base, h.index, 0, h.ofs);
2060        }
2061        break;
2062    case MO_UL:
2063        tcg_out_modrm_sib_offset(s, movop + h.seg, datalo,
2064                                 h.base, h.index, 0, h.ofs);
2065        break;
2066#if TCG_TARGET_REG_BITS == 64
2067    case MO_SL:
2068        if (use_movbe) {
2069            tcg_out_modrm_sib_offset(s, OPC_MOVBE_GyMy + h.seg, datalo,
2070                                     h.base, h.index, 0, h.ofs);
2071            tcg_out_ext32s(s, datalo, datalo);
2072        } else {
2073            tcg_out_modrm_sib_offset(s, OPC_MOVSLQ + h.seg, datalo,
2074                                     h.base, h.index, 0, h.ofs);
2075        }
2076        break;
2077#endif
2078    case MO_UQ:
2079        if (TCG_TARGET_REG_BITS == 64) {
2080            tcg_out_modrm_sib_offset(s, movop + P_REXW + h.seg, datalo,
2081                                     h.base, h.index, 0, h.ofs);
2082            break;
2083        }
2084        if (use_movbe) {
2085            TCGReg t = datalo;
2086            datalo = datahi;
2087            datahi = t;
2088        }
2089        if (h.base == datalo || h.index == datalo) {
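            /* The address register would be clobbered by the low-part load;
               compute the full host address into datahi first.  */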
2090            tcg_out_modrm_sib_offset(s, OPC_LEA, datahi,
2091                                     h.base, h.index, 0, h.ofs);
2092            tcg_out_modrm_offset(s, movop + h.seg, datalo, datahi, 0);
2093            tcg_out_modrm_offset(s, movop + h.seg, datahi, datahi, 4);
2094        } else {
2095            tcg_out_modrm_sib_offset(s, movop + h.seg, datalo,
2096                                     h.base, h.index, 0, h.ofs);
2097            tcg_out_modrm_sib_offset(s, movop + h.seg, datahi,
2098                                     h.base, h.index, 0, h.ofs + 4);
2099        }
2100        break;
2101    default:
2102        g_assert_not_reached();
2103    }
2104}
2105
2106static void tcg_out_qemu_ld(TCGContext *s, TCGReg datalo, TCGReg datahi,
2107                            TCGReg addrlo, TCGReg addrhi,
2108                            MemOpIdx oi, TCGType data_type)
2109{
2110    TCGLabelQemuLdst *ldst;
2111    HostAddress h;
2112
2113    ldst = prepare_host_addr(s, &h, addrlo, addrhi, oi, true);
2114    tcg_out_qemu_ld_direct(s, datalo, datahi, h, data_type, get_memop(oi));
2115
2116    if (ldst) {
2117        ldst->type = data_type;
2118        ldst->datalo_reg = datalo;
2119        ldst->datahi_reg = datahi;
2120        ldst->raddr = tcg_splitwx_to_rx(s->code_ptr);
2121    }
2122}
2123
2124static void tcg_out_qemu_st_direct(TCGContext *s, TCGReg datalo, TCGReg datahi,
2125                                   HostAddress h, MemOp memop)
2126{
2127    bool use_movbe = false;
2128    int movop = OPC_MOVL_EvGv;
2129
2130    /*
2131     * Do big-endian stores with movbe or softmmu.
2132     * User-only without movbe will have its swapping done generically.
2133     */
2134    if (memop & MO_BSWAP) {
2135        tcg_debug_assert(have_movbe);
2136        use_movbe = true;
2137        movop = OPC_MOVBE_MyGy;
2138    }
2139
2140    switch (memop & MO_SIZE) {
2141    case MO_8:
2142        /* This is handled with constraints on INDEX_op_qemu_st8_i32. */
2143        tcg_debug_assert(TCG_TARGET_REG_BITS == 64 || datalo < 4);
2144        tcg_out_modrm_sib_offset(s, OPC_MOVB_EvGv + P_REXB_R + h.seg,
2145                                 datalo, h.base, h.index, 0, h.ofs);
2146        break;
2147    case MO_16:
2148        tcg_out_modrm_sib_offset(s, movop + P_DATA16 + h.seg, datalo,
2149                                 h.base, h.index, 0, h.ofs);
2150        break;
2151    case MO_32:
2152        tcg_out_modrm_sib_offset(s, movop + h.seg, datalo,
2153                                 h.base, h.index, 0, h.ofs);
2154        break;
2155    case MO_64:
2156        if (TCG_TARGET_REG_BITS == 64) {
2157            tcg_out_modrm_sib_offset(s, movop + P_REXW + h.seg, datalo,
2158                                     h.base, h.index, 0, h.ofs);
2159        } else {
2160            if (use_movbe) {
2161                TCGReg t = datalo;
2162                datalo = datahi;
2163                datahi = t;
2164            }
2165            tcg_out_modrm_sib_offset(s, movop + h.seg, datalo,
2166                                     h.base, h.index, 0, h.ofs);
2167            tcg_out_modrm_sib_offset(s, movop + h.seg, datahi,
2168                                     h.base, h.index, 0, h.ofs + 4);
2169        }
2170        break;
2171    default:
2172        g_assert_not_reached();
2173    }
2174}
2175
2176static void tcg_out_qemu_st(TCGContext *s, TCGReg datalo, TCGReg datahi,
2177                            TCGReg addrlo, TCGReg addrhi,
2178                            MemOpIdx oi, TCGType data_type)
2179{
2180    TCGLabelQemuLdst *ldst;
2181    HostAddress h;
2182
2183    ldst = prepare_host_addr(s, &h, addrlo, addrhi, oi, false);
2184    tcg_out_qemu_st_direct(s, datalo, datahi, h, get_memop(oi));
2185
2186    if (ldst) {
2187        ldst->type = data_type;
2188        ldst->datalo_reg = datalo;
2189        ldst->datahi_reg = datahi;
2190        ldst->raddr = tcg_splitwx_to_rx(s->code_ptr);
2191    }
2192}
2193
2194static void tcg_out_exit_tb(TCGContext *s, uintptr_t a0)
2195{
2196    /* Reuse the zeroing that exists for goto_ptr.  */
2197    if (a0 == 0) {
2198        tcg_out_jmp(s, tcg_code_gen_epilogue);
2199    } else {
2200        tcg_out_movi(s, TCG_TYPE_PTR, TCG_REG_EAX, a0);
2201        tcg_out_jmp(s, tb_ret_addr);
2202    }
2203}
2204
2205static void tcg_out_goto_tb(TCGContext *s, int which)
2206{
2207    /*
2208     * Jump displacement must be aligned for atomic patching;
2209     * see if we need to add extra nops before jump
2210     */
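    /* After (gap - 1) nop bytes and the 1-byte JMP opcode, the 4-byte
       displacement starts at a 4-byte aligned address and can be updated
       with a single atomic store.  */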
2211    int gap = QEMU_ALIGN_PTR_UP(s->code_ptr + 1, 4) - s->code_ptr;
2212    if (gap != 1) {
2213        tcg_out_nopn(s, gap - 1);
2214    }
2215    tcg_out8(s, OPC_JMP_long); /* jmp im */
2216    set_jmp_insn_offset(s, which);
2217    tcg_out32(s, 0);
2218    set_jmp_reset_offset(s, which);
2219}
2220
2221void tb_target_set_jmp_target(const TranslationBlock *tb, int n,
2222                              uintptr_t jmp_rx, uintptr_t jmp_rw)
2223{
2224    /* patch the branch destination */
2225    uintptr_t addr = tb->jmp_target_addr[n];
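    /* jmp_rw/jmp_rx point at the rel32 operand of the JMP; the branch is
       relative to the end of the instruction, hence the +4.  */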
2226    qatomic_set((int32_t *)jmp_rw, addr - (jmp_rx + 4));
2227    /* no need to flush icache explicitly */
2228}
2229
2230static inline void tcg_out_op(TCGContext *s, TCGOpcode opc,
2231                              const TCGArg args[TCG_MAX_OP_ARGS],
2232                              const int const_args[TCG_MAX_OP_ARGS])
2233{
2234    TCGArg a0, a1, a2;
2235    int c, const_a2, vexop, rexw = 0;
2236
2237#if TCG_TARGET_REG_BITS == 64
2238# define OP_32_64(x) \
2239        case glue(glue(INDEX_op_, x), _i64): \
2240            rexw = P_REXW; /* FALLTHRU */    \
2241        case glue(glue(INDEX_op_, x), _i32)
2242#else
2243# define OP_32_64(x) \
2244        case glue(glue(INDEX_op_, x), _i32)
2245#endif
2246
2247    /* Hoist the loads of the most common arguments.  */
2248    a0 = args[0];
2249    a1 = args[1];
2250    a2 = args[2];
2251    const_a2 = const_args[2];
2252
2253    switch (opc) {
2254    case INDEX_op_goto_ptr:
2255        /* jmp to the given host address (could be epilogue) */
2256        tcg_out_modrm(s, OPC_GRP5, EXT5_JMPN_Ev, a0);
2257        break;
2258    case INDEX_op_br:
2259        tcg_out_jxx(s, JCC_JMP, arg_label(a0), 0);
2260        break;
2261    OP_32_64(ld8u):
2262        /* Note that we can ignore REXW for the zero-extend to 64-bit.  */
2263        tcg_out_modrm_offset(s, OPC_MOVZBL, a0, a1, a2);
2264        break;
2265    OP_32_64(ld8s):
2266        tcg_out_modrm_offset(s, OPC_MOVSBL + rexw, a0, a1, a2);
2267        break;
2268    OP_32_64(ld16u):
2269        /* Note that we can ignore REXW for the zero-extend to 64-bit.  */
2270        tcg_out_modrm_offset(s, OPC_MOVZWL, a0, a1, a2);
2271        break;
2272    OP_32_64(ld16s):
2273        tcg_out_modrm_offset(s, OPC_MOVSWL + rexw, a0, a1, a2);
2274        break;
2275#if TCG_TARGET_REG_BITS == 64
2276    case INDEX_op_ld32u_i64:
2277#endif
2278    case INDEX_op_ld_i32:
2279        tcg_out_ld(s, TCG_TYPE_I32, a0, a1, a2);
2280        break;
2281
2282    OP_32_64(st8):
2283        if (const_args[0]) {
2284            tcg_out_modrm_offset(s, OPC_MOVB_EvIz, 0, a1, a2);
2285            tcg_out8(s, a0);
2286        } else {
2287            tcg_out_modrm_offset(s, OPC_MOVB_EvGv | P_REXB_R, a0, a1, a2);
2288        }
2289        break;
2290    OP_32_64(st16):
2291        if (const_args[0]) {
2292            tcg_out_modrm_offset(s, OPC_MOVL_EvIz | P_DATA16, 0, a1, a2);
2293            tcg_out16(s, a0);
2294        } else {
2295            tcg_out_modrm_offset(s, OPC_MOVL_EvGv | P_DATA16, a0, a1, a2);
2296        }
2297        break;
2298#if TCG_TARGET_REG_BITS == 64
2299    case INDEX_op_st32_i64:
2300#endif
2301    case INDEX_op_st_i32:
2302        if (const_args[0]) {
2303            tcg_out_modrm_offset(s, OPC_MOVL_EvIz, 0, a1, a2);
2304            tcg_out32(s, a0);
2305        } else {
2306            tcg_out_st(s, TCG_TYPE_I32, a0, a1, a2);
2307        }
2308        break;
2309
2310    OP_32_64(add):
2311        /* For 3-operand addition, use LEA.  */
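        /* i.e. add d,a,b -> lea (a,b),d  and  add d,a,imm -> lea imm(a),d */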
2312        if (a0 != a1) {
2313            TCGArg c3 = 0;
2314            if (const_a2) {
2315                c3 = a2, a2 = -1;
2316            } else if (a0 == a2) {
2317                /* Watch out for dest = src + dest, since we've removed
2318                   the matching constraint on the add.  */
2319                tgen_arithr(s, ARITH_ADD + rexw, a0, a1);
2320                break;
2321            }
2322
2323            tcg_out_modrm_sib_offset(s, OPC_LEA + rexw, a0, a1, a2, 0, c3);
2324            break;
2325        }
2326        c = ARITH_ADD;
2327        goto gen_arith;
2328    OP_32_64(sub):
2329        c = ARITH_SUB;
2330        goto gen_arith;
2331    OP_32_64(and):
2332        c = ARITH_AND;
2333        goto gen_arith;
2334    OP_32_64(or):
2335        c = ARITH_OR;
2336        goto gen_arith;
2337    OP_32_64(xor):
2338        c = ARITH_XOR;
2339        goto gen_arith;
2340    gen_arith:
2341        if (const_a2) {
2342            tgen_arithi(s, c + rexw, a0, a2, 0);
2343        } else {
2344            tgen_arithr(s, c + rexw, a0, a2);
2345        }
2346        break;
2347
2348    OP_32_64(andc):
2349        if (const_a2) {
2350            tcg_out_mov(s, rexw ? TCG_TYPE_I64 : TCG_TYPE_I32, a0, a1);
2351            tgen_arithi(s, ARITH_AND + rexw, a0, ~a2, 0);
2352        } else {
2353            tcg_out_vex_modrm(s, OPC_ANDN + rexw, a0, a2, a1);
2354        }
2355        break;
2356
2357    OP_32_64(mul):
2358        if (const_a2) {
2359            int32_t val;
2360            val = a2;
2361            if (val == (int8_t)val) {
2362                tcg_out_modrm(s, OPC_IMUL_GvEvIb + rexw, a0, a0);
2363                tcg_out8(s, val);
2364            } else {
2365                tcg_out_modrm(s, OPC_IMUL_GvEvIz + rexw, a0, a0);
2366                tcg_out32(s, val);
2367            }
2368        } else {
2369            tcg_out_modrm(s, OPC_IMUL_GvEv + rexw, a0, a2);
2370        }
2371        break;
2372
2373    OP_32_64(div2):
2374        tcg_out_modrm(s, OPC_GRP3_Ev + rexw, EXT3_IDIV, args[4]);
2375        break;
2376    OP_32_64(divu2):
2377        tcg_out_modrm(s, OPC_GRP3_Ev + rexw, EXT3_DIV, args[4]);
2378        break;
2379
2380    OP_32_64(shl):
2381        /* For small constant 3-operand shift, use LEA.  */
2382        if (const_a2 && a0 != a1 && (a2 - 1) < 3) {
2383            if (a2 - 1 == 0) {
2384                /* shl $1,a1,a0 -> lea (a1,a1),a0 */
2385                tcg_out_modrm_sib_offset(s, OPC_LEA + rexw, a0, a1, a1, 0, 0);
2386            } else {
2387                /* shl $n,a1,a0 -> lea 0(,a1,n),a0 */
2388                tcg_out_modrm_sib_offset(s, OPC_LEA + rexw, a0, -1, a1, a2, 0);
2389            }
2390            break;
2391        }
2392        c = SHIFT_SHL;
2393        vexop = OPC_SHLX;
2394        goto gen_shift_maybe_vex;
2395    OP_32_64(shr):
2396        c = SHIFT_SHR;
2397        vexop = OPC_SHRX;
2398        goto gen_shift_maybe_vex;
2399    OP_32_64(sar):
2400        c = SHIFT_SAR;
2401        vexop = OPC_SARX;
2402        goto gen_shift_maybe_vex;
2403    OP_32_64(rotl):
2404        c = SHIFT_ROL;
2405        goto gen_shift;
2406    OP_32_64(rotr):
2407        c = SHIFT_ROR;
2408        goto gen_shift;
2409    gen_shift_maybe_vex:
2410        if (have_bmi2) {
2411            if (!const_a2) {
2412                tcg_out_vex_modrm(s, vexop + rexw, a0, a2, a1);
2413                break;
2414            }
2415            tcg_out_mov(s, rexw ? TCG_TYPE_I64 : TCG_TYPE_I32, a0, a1);
2416        }
2417        /* FALLTHRU */
2418    gen_shift:
2419        if (const_a2) {
2420            tcg_out_shifti(s, c + rexw, a0, a2);
2421        } else {
2422            tcg_out_modrm(s, OPC_SHIFT_cl + rexw, c, a0);
2423        }
2424        break;
2425
2426    OP_32_64(ctz):
2427        tcg_out_ctz(s, rexw, args[0], args[1], args[2], const_args[2]);
2428        break;
2429    OP_32_64(clz):
2430        tcg_out_clz(s, rexw, args[0], args[1], args[2], const_args[2]);
2431        break;
2432    OP_32_64(ctpop):
2433        tcg_out_modrm(s, OPC_POPCNT + rexw, a0, a1);
2434        break;
2435
2436    case INDEX_op_brcond_i32:
2437        tcg_out_brcond32(s, a2, a0, a1, const_args[1], arg_label(args[3]), 0);
2438        break;
2439    case INDEX_op_setcond_i32:
2440        tcg_out_setcond32(s, args[3], a0, a1, a2, const_a2);
2441        break;
2442    case INDEX_op_movcond_i32:
2443        tcg_out_movcond32(s, args[5], a0, a1, a2, const_a2, args[3]);
2444        break;
2445
2446    OP_32_64(bswap16):
2447        if (a2 & TCG_BSWAP_OS) {
2448            /* Output must be sign-extended. */
2449            if (rexw) {
2450                tcg_out_bswap64(s, a0);
2451                tcg_out_shifti(s, SHIFT_SAR + rexw, a0, 48);
2452            } else {
2453                tcg_out_bswap32(s, a0);
2454                tcg_out_shifti(s, SHIFT_SAR, a0, 16);
2455            }
2456        } else if ((a2 & (TCG_BSWAP_IZ | TCG_BSWAP_OZ)) == TCG_BSWAP_OZ) {
2457            /* Output must be zero-extended, but input isn't. */
2458            tcg_out_bswap32(s, a0);
2459            tcg_out_shifti(s, SHIFT_SHR, a0, 16);
2460        } else {
2461            tcg_out_rolw_8(s, a0);
2462        }
2463        break;
2464    OP_32_64(bswap32):
2465        tcg_out_bswap32(s, a0);
2466        if (rexw && (a2 & TCG_BSWAP_OS)) {
2467            tcg_out_ext32s(s, a0, a0);
2468        }
2469        break;
2470
2471    OP_32_64(neg):
2472        tcg_out_modrm(s, OPC_GRP3_Ev + rexw, EXT3_NEG, a0);
2473        break;
2474    OP_32_64(not):
2475        tcg_out_modrm(s, OPC_GRP3_Ev + rexw, EXT3_NOT, a0);
2476        break;
2477
2478    case INDEX_op_qemu_ld_a64_i32:
2479        if (TCG_TARGET_REG_BITS == 32) {
2480            tcg_out_qemu_ld(s, a0, -1, a1, a2, args[3], TCG_TYPE_I32);
2481            break;
2482        }
2483        /* fall through */
2484    case INDEX_op_qemu_ld_a32_i32:
2485        tcg_out_qemu_ld(s, a0, -1, a1, -1, a2, TCG_TYPE_I32);
2486        break;
2487    case INDEX_op_qemu_ld_a32_i64:
2488        if (TCG_TARGET_REG_BITS == 64) {
2489            tcg_out_qemu_ld(s, a0, -1, a1, -1, a2, TCG_TYPE_I64);
2490        } else {
2491            tcg_out_qemu_ld(s, a0, a1, a2, -1, args[3], TCG_TYPE_I64);
2492        }
2493        break;
2494    case INDEX_op_qemu_ld_a64_i64:
2495        if (TCG_TARGET_REG_BITS == 64) {
2496            tcg_out_qemu_ld(s, a0, -1, a1, -1, a2, TCG_TYPE_I64);
2497        } else {
2498            tcg_out_qemu_ld(s, a0, a1, a2, args[3], args[4], TCG_TYPE_I64);
2499        }
2500        break;
2501
2502    case INDEX_op_qemu_st_a64_i32:
2503    case INDEX_op_qemu_st8_a64_i32:
2504        if (TCG_TARGET_REG_BITS == 32) {
2505            tcg_out_qemu_st(s, a0, -1, a1, a2, args[3], TCG_TYPE_I32);
2506            break;
2507        }
2508        /* fall through */
2509    case INDEX_op_qemu_st_a32_i32:
2510    case INDEX_op_qemu_st8_a32_i32:
2511        tcg_out_qemu_st(s, a0, -1, a1, -1, a2, TCG_TYPE_I32);
2512        break;
2513    case INDEX_op_qemu_st_a32_i64:
2514        if (TCG_TARGET_REG_BITS == 64) {
2515            tcg_out_qemu_st(s, a0, -1, a1, -1, a2, TCG_TYPE_I64);
2516        } else {
2517            tcg_out_qemu_st(s, a0, a1, a2, -1, args[3], TCG_TYPE_I64);
2518        }
2519        break;
2520    case INDEX_op_qemu_st_a64_i64:
2521        if (TCG_TARGET_REG_BITS == 64) {
2522            tcg_out_qemu_st(s, a0, -1, a1, -1, a2, TCG_TYPE_I64);
2523        } else {
2524            tcg_out_qemu_st(s, a0, a1, a2, args[3], args[4], TCG_TYPE_I64);
2525        }
2526        break;
2527
2528    OP_32_64(mulu2):
2529        tcg_out_modrm(s, OPC_GRP3_Ev + rexw, EXT3_MUL, args[3]);
2530        break;
2531    OP_32_64(muls2):
2532        tcg_out_modrm(s, OPC_GRP3_Ev + rexw, EXT3_IMUL, args[3]);
2533        break;
2534    OP_32_64(add2):
2535        if (const_args[4]) {
2536            tgen_arithi(s, ARITH_ADD + rexw, a0, args[4], 1);
2537        } else {
2538            tgen_arithr(s, ARITH_ADD + rexw, a0, args[4]);
2539        }
2540        if (const_args[5]) {
2541            tgen_arithi(s, ARITH_ADC + rexw, a1, args[5], 1);
2542        } else {
2543            tgen_arithr(s, ARITH_ADC + rexw, a1, args[5]);
2544        }
2545        break;
2546    OP_32_64(sub2):
2547        if (const_args[4]) {
2548            tgen_arithi(s, ARITH_SUB + rexw, a0, args[4], 1);
2549        } else {
2550            tgen_arithr(s, ARITH_SUB + rexw, a0, args[4]);
2551        }
2552        if (const_args[5]) {
2553            tgen_arithi(s, ARITH_SBB + rexw, a1, args[5], 1);
2554        } else {
2555            tgen_arithr(s, ARITH_SBB + rexw, a1, args[5]);
2556        }
2557        break;
2558
2559#if TCG_TARGET_REG_BITS == 32
2560    case INDEX_op_brcond2_i32:
2561        tcg_out_brcond2(s, args, const_args, 0);
2562        break;
2563    case INDEX_op_setcond2_i32:
2564        tcg_out_setcond2(s, args, const_args);
2565        break;
2566#else /* TCG_TARGET_REG_BITS == 64 */
2567    case INDEX_op_ld32s_i64:
2568        tcg_out_modrm_offset(s, OPC_MOVSLQ, a0, a1, a2);
2569        break;
2570    case INDEX_op_ld_i64:
2571        tcg_out_ld(s, TCG_TYPE_I64, a0, a1, a2);
2572        break;
2573    case INDEX_op_st_i64:
2574        if (const_args[0]) {
2575            tcg_out_modrm_offset(s, OPC_MOVL_EvIz | P_REXW, 0, a1, a2);
2576            tcg_out32(s, a0);
2577        } else {
2578            tcg_out_st(s, TCG_TYPE_I64, a0, a1, a2);
2579        }
2580        break;
2581
2582    case INDEX_op_brcond_i64:
2583        tcg_out_brcond64(s, a2, a0, a1, const_args[1], arg_label(args[3]), 0);
2584        break;
2585    case INDEX_op_setcond_i64:
2586        tcg_out_setcond64(s, args[3], a0, a1, a2, const_a2);
2587        break;
2588    case INDEX_op_movcond_i64:
2589        tcg_out_movcond64(s, args[5], a0, a1, a2, const_a2, args[3]);
2590        break;
2591
2592    case INDEX_op_bswap64_i64:
2593        tcg_out_bswap64(s, a0);
2594        break;
2595    case INDEX_op_extrh_i64_i32:
2596        tcg_out_shifti(s, SHIFT_SHR + P_REXW, a0, 32);
2597        break;
2598#endif
2599
2600    OP_32_64(deposit):
2601        if (args[3] == 0 && args[4] == 8) {
2602            /* load bits 0..7 */
2603            tcg_out_modrm(s, OPC_MOVB_EvGv | P_REXB_R | P_REXB_RM, a2, a0);
2604        } else if (args[3] == 8 && args[4] == 8) {
2605            /* load bits 8..15 */
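            /* a0 + 4 encodes the corresponding high-byte register
               (%ah, %ch, %dh, %bh); cf. the Q constraint for deposit
               in tcg_target_op_def.  */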
2606            tcg_out_modrm(s, OPC_MOVB_EvGv, a2, a0 + 4);
2607        } else if (args[3] == 0 && args[4] == 16) {
2608            /* load bits 0..15 */
2609            tcg_out_modrm(s, OPC_MOVL_EvGv | P_DATA16, a2, a0);
2610        } else {
2611            g_assert_not_reached();
2612        }
2613        break;
2614
2615    case INDEX_op_extract_i64:
2616        if (a2 + args[3] == 32) {
2617            /* This is a 32-bit zero-extending right shift.  */
2618            tcg_out_mov(s, TCG_TYPE_I32, a0, a1);
2619            tcg_out_shifti(s, SHIFT_SHR, a0, a2);
2620            break;
2621        }
2622        /* FALLTHRU */
2623    case INDEX_op_extract_i32:
2624        /* On the off-chance that we can use the high-byte registers.
2625           Otherwise we emit the same ext16 + shift pattern that we
2626           would have gotten from the normal tcg-op.c expansion.  */
2627        tcg_debug_assert(a2 == 8 && args[3] == 8);
2628        if (a1 < 4 && a0 < 8) {
2629            tcg_out_modrm(s, OPC_MOVZBL, a0, a1 + 4);
2630        } else {
2631            tcg_out_ext16u(s, a0, a1);
2632            tcg_out_shifti(s, SHIFT_SHR, a0, 8);
2633        }
2634        break;
2635
2636    case INDEX_op_sextract_i32:
2637        /* We don't implement sextract_i64, as we cannot sign-extend to
2638           64-bits without using the REX prefix that explicitly excludes
2639           access to the high-byte registers.  */
2640        tcg_debug_assert(a2 == 8 && args[3] == 8);
2641        if (a1 < 4 && a0 < 8) {
2642            tcg_out_modrm(s, OPC_MOVSBL, a0, a1 + 4);
2643        } else {
2644            tcg_out_ext16s(s, TCG_TYPE_I32, a0, a1);
2645            tcg_out_shifti(s, SHIFT_SAR, a0, 8);
2646        }
2647        break;
2648
2649    OP_32_64(extract2):
2650        /* Note that SHRD outputs to the r/m operand.  */
2651        tcg_out_modrm(s, OPC_SHRD_Ib + rexw, a2, a0);
2652        tcg_out8(s, args[3]);
2653        break;
2654
2655    case INDEX_op_mb:
2656        tcg_out_mb(s, a0);
2657        break;
2658    case INDEX_op_mov_i32:  /* Always emitted via tcg_out_mov.  */
2659    case INDEX_op_mov_i64:
2660    case INDEX_op_call:     /* Always emitted via tcg_out_call.  */
2661    case INDEX_op_exit_tb:  /* Always emitted via tcg_out_exit_tb.  */
2662    case INDEX_op_goto_tb:  /* Always emitted via tcg_out_goto_tb.  */
2663    case INDEX_op_ext8s_i32:  /* Always emitted via tcg_reg_alloc_op.  */
2664    case INDEX_op_ext8s_i64:
2665    case INDEX_op_ext8u_i32:
2666    case INDEX_op_ext8u_i64:
2667    case INDEX_op_ext16s_i32:
2668    case INDEX_op_ext16s_i64:
2669    case INDEX_op_ext16u_i32:
2670    case INDEX_op_ext16u_i64:
2671    case INDEX_op_ext32s_i64:
2672    case INDEX_op_ext32u_i64:
2673    case INDEX_op_ext_i32_i64:
2674    case INDEX_op_extu_i32_i64:
2675    case INDEX_op_extrl_i64_i32:
2676    default:
2677        g_assert_not_reached();
2678    }
2679
2680#undef OP_32_64
2681}
2682
2683static void tcg_out_vec_op(TCGContext *s, TCGOpcode opc,
2684                           unsigned vecl, unsigned vece,
2685                           const TCGArg args[TCG_MAX_OP_ARGS],
2686                           const int const_args[TCG_MAX_OP_ARGS])
2687{
2688    static int const add_insn[4] = {
2689        OPC_PADDB, OPC_PADDW, OPC_PADDD, OPC_PADDQ
2690    };
2691    static int const ssadd_insn[4] = {
2692        OPC_PADDSB, OPC_PADDSW, OPC_UD2, OPC_UD2
2693    };
2694    static int const usadd_insn[4] = {
2695        OPC_PADDUB, OPC_PADDUW, OPC_UD2, OPC_UD2
2696    };
2697    static int const sub_insn[4] = {
2698        OPC_PSUBB, OPC_PSUBW, OPC_PSUBD, OPC_PSUBQ
2699    };
2700    static int const sssub_insn[4] = {
2701        OPC_PSUBSB, OPC_PSUBSW, OPC_UD2, OPC_UD2
2702    };
2703    static int const ussub_insn[4] = {
2704        OPC_PSUBUB, OPC_PSUBUW, OPC_UD2, OPC_UD2
2705    };
2706    static int const mul_insn[4] = {
2707        OPC_UD2, OPC_PMULLW, OPC_PMULLD, OPC_VPMULLQ
2708    };
2709    static int const shift_imm_insn[4] = {
2710        OPC_UD2, OPC_PSHIFTW_Ib, OPC_PSHIFTD_Ib, OPC_PSHIFTQ_Ib
2711    };
2712    static int const cmpeq_insn[4] = {
2713        OPC_PCMPEQB, OPC_PCMPEQW, OPC_PCMPEQD, OPC_PCMPEQQ
2714    };
2715    static int const cmpgt_insn[4] = {
2716        OPC_PCMPGTB, OPC_PCMPGTW, OPC_PCMPGTD, OPC_PCMPGTQ
2717    };
2718    static int const punpckl_insn[4] = {
2719        OPC_PUNPCKLBW, OPC_PUNPCKLWD, OPC_PUNPCKLDQ, OPC_PUNPCKLQDQ
2720    };
2721    static int const punpckh_insn[4] = {
2722        OPC_PUNPCKHBW, OPC_PUNPCKHWD, OPC_PUNPCKHDQ, OPC_PUNPCKHQDQ
2723    };
2724    static int const packss_insn[4] = {
2725        OPC_PACKSSWB, OPC_PACKSSDW, OPC_UD2, OPC_UD2
2726    };
2727    static int const packus_insn[4] = {
2728        OPC_PACKUSWB, OPC_PACKUSDW, OPC_UD2, OPC_UD2
2729    };
2730    static int const smin_insn[4] = {
2731        OPC_PMINSB, OPC_PMINSW, OPC_PMINSD, OPC_VPMINSQ
2732    };
2733    static int const smax_insn[4] = {
2734        OPC_PMAXSB, OPC_PMAXSW, OPC_PMAXSD, OPC_VPMAXSQ
2735    };
2736    static int const umin_insn[4] = {
2737        OPC_PMINUB, OPC_PMINUW, OPC_PMINUD, OPC_VPMINUQ
2738    };
2739    static int const umax_insn[4] = {
2740        OPC_PMAXUB, OPC_PMAXUW, OPC_PMAXUD, OPC_VPMAXUQ
2741    };
2742    static int const rotlv_insn[4] = {
2743        OPC_UD2, OPC_UD2, OPC_VPROLVD, OPC_VPROLVQ
2744    };
2745    static int const rotrv_insn[4] = {
2746        OPC_UD2, OPC_UD2, OPC_VPRORVD, OPC_VPRORVQ
2747    };
2748    static int const shlv_insn[4] = {
2749        OPC_UD2, OPC_VPSLLVW, OPC_VPSLLVD, OPC_VPSLLVQ
2750    };
2751    static int const shrv_insn[4] = {
2752        OPC_UD2, OPC_VPSRLVW, OPC_VPSRLVD, OPC_VPSRLVQ
2753    };
2754    static int const sarv_insn[4] = {
2755        OPC_UD2, OPC_VPSRAVW, OPC_VPSRAVD, OPC_VPSRAVQ
2756    };
2757    static int const shls_insn[4] = {
2758        OPC_UD2, OPC_PSLLW, OPC_PSLLD, OPC_PSLLQ
2759    };
2760    static int const shrs_insn[4] = {
2761        OPC_UD2, OPC_PSRLW, OPC_PSRLD, OPC_PSRLQ
2762    };
2763    static int const sars_insn[4] = {
2764        OPC_UD2, OPC_PSRAW, OPC_PSRAD, OPC_VPSRAQ
2765    };
2766    static int const vpshldi_insn[4] = {
2767        OPC_UD2, OPC_VPSHLDW, OPC_VPSHLDD, OPC_VPSHLDQ
2768    };
2769    static int const vpshldv_insn[4] = {
2770        OPC_UD2, OPC_VPSHLDVW, OPC_VPSHLDVD, OPC_VPSHLDVQ
2771    };
2772    static int const vpshrdv_insn[4] = {
2773        OPC_UD2, OPC_VPSHRDVW, OPC_VPSHRDVD, OPC_VPSHRDVQ
2774    };
2775    static int const abs_insn[4] = {
2776        OPC_PABSB, OPC_PABSW, OPC_PABSD, OPC_VPABSQ
2777    };
2778
2779    TCGType type = vecl + TCG_TYPE_V64;
2780    int insn, sub;
2781    TCGArg a0, a1, a2, a3;
2782
2783    a0 = args[0];
2784    a1 = args[1];
2785    a2 = args[2];
2786
2787    switch (opc) {
2788    case INDEX_op_add_vec:
2789        insn = add_insn[vece];
2790        goto gen_simd;
2791    case INDEX_op_ssadd_vec:
2792        insn = ssadd_insn[vece];
2793        goto gen_simd;
2794    case INDEX_op_usadd_vec:
2795        insn = usadd_insn[vece];
2796        goto gen_simd;
2797    case INDEX_op_sub_vec:
2798        insn = sub_insn[vece];
2799        goto gen_simd;
2800    case INDEX_op_sssub_vec:
2801        insn = sssub_insn[vece];
2802        goto gen_simd;
2803    case INDEX_op_ussub_vec:
2804        insn = ussub_insn[vece];
2805        goto gen_simd;
2806    case INDEX_op_mul_vec:
2807        insn = mul_insn[vece];
2808        goto gen_simd;
2809    case INDEX_op_and_vec:
2810        insn = OPC_PAND;
2811        goto gen_simd;
2812    case INDEX_op_or_vec:
2813        insn = OPC_POR;
2814        goto gen_simd;
2815    case INDEX_op_xor_vec:
2816        insn = OPC_PXOR;
2817        goto gen_simd;
2818    case INDEX_op_smin_vec:
2819        insn = smin_insn[vece];
2820        goto gen_simd;
2821    case INDEX_op_umin_vec:
2822        insn = umin_insn[vece];
2823        goto gen_simd;
2824    case INDEX_op_smax_vec:
2825        insn = smax_insn[vece];
2826        goto gen_simd;
2827    case INDEX_op_umax_vec:
2828        insn = umax_insn[vece];
2829        goto gen_simd;
2830    case INDEX_op_shlv_vec:
2831        insn = shlv_insn[vece];
2832        goto gen_simd;
2833    case INDEX_op_shrv_vec:
2834        insn = shrv_insn[vece];
2835        goto gen_simd;
2836    case INDEX_op_sarv_vec:
2837        insn = sarv_insn[vece];
2838        goto gen_simd;
2839    case INDEX_op_rotlv_vec:
2840        insn = rotlv_insn[vece];
2841        goto gen_simd;
2842    case INDEX_op_rotrv_vec:
2843        insn = rotrv_insn[vece];
2844        goto gen_simd;
2845    case INDEX_op_shls_vec:
2846        insn = shls_insn[vece];
2847        goto gen_simd;
2848    case INDEX_op_shrs_vec:
2849        insn = shrs_insn[vece];
2850        goto gen_simd;
2851    case INDEX_op_sars_vec:
2852        insn = sars_insn[vece];
2853        goto gen_simd;
2854    case INDEX_op_x86_punpckl_vec:
2855        insn = punpckl_insn[vece];
2856        goto gen_simd;
2857    case INDEX_op_x86_punpckh_vec:
2858        insn = punpckh_insn[vece];
2859        goto gen_simd;
2860    case INDEX_op_x86_packss_vec:
2861        insn = packss_insn[vece];
2862        goto gen_simd;
2863    case INDEX_op_x86_packus_vec:
2864        insn = packus_insn[vece];
2865        goto gen_simd;
2866    case INDEX_op_x86_vpshldv_vec:
2867        insn = vpshldv_insn[vece];
2868        a1 = a2;
2869        a2 = args[3];
2870        goto gen_simd;
2871    case INDEX_op_x86_vpshrdv_vec:
2872        insn = vpshrdv_insn[vece];
2873        a1 = a2;
2874        a2 = args[3];
2875        goto gen_simd;
2876#if TCG_TARGET_REG_BITS == 32
2877    case INDEX_op_dup2_vec:
2878        /* First merge the two 32-bit inputs to a single 64-bit element. */
2879        tcg_out_vex_modrm(s, OPC_PUNPCKLDQ, a0, a1, a2);
2880        /* Then replicate the 64-bit elements across the rest of the vector. */
2881        if (type != TCG_TYPE_V64) {
2882            tcg_out_dup_vec(s, type, MO_64, a0, a0);
2883        }
2884        break;
2885#endif
2886    case INDEX_op_abs_vec:
2887        insn = abs_insn[vece];
2888        a2 = a1;
2889        a1 = 0;
2890        goto gen_simd;
2891    gen_simd:
2892        tcg_debug_assert(insn != OPC_UD2);
2893        if (type == TCG_TYPE_V256) {
2894            insn |= P_VEXL;
2895        }
2896        tcg_out_vex_modrm(s, insn, a0, a1, a2);
2897        break;
2898
2899    case INDEX_op_cmp_vec:
2900        sub = args[3];
2901        if (sub == TCG_COND_EQ) {
2902            insn = cmpeq_insn[vece];
2903        } else if (sub == TCG_COND_GT) {
2904            insn = cmpgt_insn[vece];
2905        } else {
2906            g_assert_not_reached();
2907        }
2908        goto gen_simd;
2909
2910    case INDEX_op_andc_vec:
2911        insn = OPC_PANDN;
2912        if (type == TCG_TYPE_V256) {
2913            insn |= P_VEXL;
2914        }
2915        tcg_out_vex_modrm(s, insn, a0, a2, a1);
2916        break;
2917
2918    case INDEX_op_shli_vec:
2919        insn = shift_imm_insn[vece];
2920        sub = 6;
2921        goto gen_shift;
2922    case INDEX_op_shri_vec:
2923        insn = shift_imm_insn[vece];
2924        sub = 2;
2925        goto gen_shift;
2926    case INDEX_op_sari_vec:
2927        if (vece == MO_64) {
2928            insn = OPC_PSHIFTD_Ib | P_VEXW | P_EVEX;
2929        } else {
2930            insn = shift_imm_insn[vece];
2931        }
2932        sub = 4;
2933        goto gen_shift;
2934    case INDEX_op_rotli_vec:
2935        insn = OPC_PSHIFTD_Ib | P_EVEX;  /* VPROL[DQ] */
2936        if (vece == MO_64) {
2937            insn |= P_VEXW;
2938        }
2939        sub = 1;
2940        goto gen_shift;
2941    gen_shift:
2942        tcg_debug_assert(vece != MO_8);
2943        if (type == TCG_TYPE_V256) {
2944            insn |= P_VEXL;
2945        }
2946        tcg_out_vex_modrm(s, insn, sub, a0, a1);
2947        tcg_out8(s, a2);
2948        break;
2949
2950    case INDEX_op_ld_vec:
2951        tcg_out_ld(s, type, a0, a1, a2);
2952        break;
2953    case INDEX_op_st_vec:
2954        tcg_out_st(s, type, a0, a1, a2);
2955        break;
2956    case INDEX_op_dupm_vec:
2957        tcg_out_dupm_vec(s, type, vece, a0, a1, a2);
2958        break;
2959
2960    case INDEX_op_x86_shufps_vec:
2961        insn = OPC_SHUFPS;
2962        sub = args[3];
2963        goto gen_simd_imm8;
2964    case INDEX_op_x86_blend_vec:
2965        if (vece == MO_16) {
2966            insn = OPC_PBLENDW;
2967        } else if (vece == MO_32) {
2968            insn = (have_avx2 ? OPC_VPBLENDD : OPC_BLENDPS);
2969        } else {
2970            g_assert_not_reached();
2971        }
2972        sub = args[3];
2973        goto gen_simd_imm8;
2974    case INDEX_op_x86_vperm2i128_vec:
2975        insn = OPC_VPERM2I128;
2976        sub = args[3];
2977        goto gen_simd_imm8;
2978    case INDEX_op_x86_vpshldi_vec:
2979        insn = vpshldi_insn[vece];
2980        sub = args[3];
2981        goto gen_simd_imm8;
2982
2983    case INDEX_op_not_vec:
2984        insn = OPC_VPTERNLOGQ;
2985        a2 = a1;
2986        sub = 0x33; /* !B */
2987        goto gen_simd_imm8;
2988    case INDEX_op_nor_vec:
2989        insn = OPC_VPTERNLOGQ;
2990        sub = 0x11; /* norCB */
2991        goto gen_simd_imm8;
2992    case INDEX_op_nand_vec:
2993        insn = OPC_VPTERNLOGQ;
2994        sub = 0x77; /* nandCB */
2995        goto gen_simd_imm8;
2996    case INDEX_op_eqv_vec:
2997        insn = OPC_VPTERNLOGQ;
2998        sub = 0x99; /* xnorCB */
2999        goto gen_simd_imm8;
3000    case INDEX_op_orc_vec:
3001        insn = OPC_VPTERNLOGQ;
3002        sub = 0xdd; /* orB!C */
3003        goto gen_simd_imm8;
3004
3005    case INDEX_op_bitsel_vec:
3006        insn = OPC_VPTERNLOGQ;
3007        a3 = args[3];
3008        if (a0 == a1) {
3009            a1 = a2;
3010            a2 = a3;
3011            sub = 0xca; /* A?B:C */
3012        } else if (a0 == a2) {
3013            a2 = a3;
3014            sub = 0xe2; /* B?A:C */
3015        } else {
3016            tcg_out_mov(s, type, a0, a3);
3017            sub = 0xb8; /* B?C:A */
3018        }
3019        goto gen_simd_imm8;
3020
3021    gen_simd_imm8:
3022        tcg_debug_assert(insn != OPC_UD2);
3023        if (type == TCG_TYPE_V256) {
3024            insn |= P_VEXL;
3025        }
3026        tcg_out_vex_modrm(s, insn, a0, a1, a2);
3027        tcg_out8(s, sub);
3028        break;
3029
3030    case INDEX_op_x86_vpblendvb_vec:
3031        insn = OPC_VPBLENDVB;
3032        if (type == TCG_TYPE_V256) {
3033            insn |= P_VEXL;
3034        }
3035        tcg_out_vex_modrm(s, insn, a0, a1, a2);
3036        tcg_out8(s, args[3] << 4);
3037        break;
3038
3039    case INDEX_op_x86_psrldq_vec:
3040        tcg_out_vex_modrm(s, OPC_GRP14, 3, a0, a1);
3041        tcg_out8(s, a2);
3042        break;
3043
3044    case INDEX_op_mov_vec:  /* Always emitted via tcg_out_mov.  */
3045    case INDEX_op_dup_vec:  /* Always emitted via tcg_out_dup_vec.  */
3046    default:
3047        g_assert_not_reached();
3048    }
3049}
3050
3051static TCGConstraintSetIndex tcg_target_op_def(TCGOpcode op)
3052{
3053    switch (op) {
3054    case INDEX_op_goto_ptr:
3055        return C_O0_I1(r);
3056
3057    case INDEX_op_ld8u_i32:
3058    case INDEX_op_ld8u_i64:
3059    case INDEX_op_ld8s_i32:
3060    case INDEX_op_ld8s_i64:
3061    case INDEX_op_ld16u_i32:
3062    case INDEX_op_ld16u_i64:
3063    case INDEX_op_ld16s_i32:
3064    case INDEX_op_ld16s_i64:
3065    case INDEX_op_ld_i32:
3066    case INDEX_op_ld32u_i64:
3067    case INDEX_op_ld32s_i64:
3068    case INDEX_op_ld_i64:
3069        return C_O1_I1(r, r);
3070
3071    case INDEX_op_st8_i32:
3072    case INDEX_op_st8_i64:
3073        return C_O0_I2(qi, r);
3074
3075    case INDEX_op_st16_i32:
3076    case INDEX_op_st16_i64:
3077    case INDEX_op_st_i32:
3078    case INDEX_op_st32_i64:
3079        return C_O0_I2(ri, r);
3080
3081    case INDEX_op_st_i64:
3082        return C_O0_I2(re, r);
3083
3084    case INDEX_op_add_i32:
3085    case INDEX_op_add_i64:
3086        return C_O1_I2(r, r, re);
3087
3088    case INDEX_op_sub_i32:
3089    case INDEX_op_sub_i64:
3090    case INDEX_op_mul_i32:
3091    case INDEX_op_mul_i64:
3092    case INDEX_op_or_i32:
3093    case INDEX_op_or_i64:
3094    case INDEX_op_xor_i32:
3095    case INDEX_op_xor_i64:
3096        return C_O1_I2(r, 0, re);
3097
3098    case INDEX_op_and_i32:
3099    case INDEX_op_and_i64:
3100        return C_O1_I2(r, 0, reZ);
3101
3102    case INDEX_op_andc_i32:
3103    case INDEX_op_andc_i64:
3104        return C_O1_I2(r, r, rI);
3105
3106    case INDEX_op_shl_i32:
3107    case INDEX_op_shl_i64:
3108    case INDEX_op_shr_i32:
3109    case INDEX_op_shr_i64:
3110    case INDEX_op_sar_i32:
3111    case INDEX_op_sar_i64:
3112        return have_bmi2 ? C_O1_I2(r, r, ri) : C_O1_I2(r, 0, ci);
3113
3114    case INDEX_op_rotl_i32:
3115    case INDEX_op_rotl_i64:
3116    case INDEX_op_rotr_i32:
3117    case INDEX_op_rotr_i64:
3118        return C_O1_I2(r, 0, ci);
3119
3120    case INDEX_op_brcond_i32:
3121    case INDEX_op_brcond_i64:
3122        return C_O0_I2(r, re);
3123
3124    case INDEX_op_bswap16_i32:
3125    case INDEX_op_bswap16_i64:
3126    case INDEX_op_bswap32_i32:
3127    case INDEX_op_bswap32_i64:
3128    case INDEX_op_bswap64_i64:
3129    case INDEX_op_neg_i32:
3130    case INDEX_op_neg_i64:
3131    case INDEX_op_not_i32:
3132    case INDEX_op_not_i64:
3133    case INDEX_op_extrh_i64_i32:
3134        return C_O1_I1(r, 0);
3135
3136    case INDEX_op_ext8s_i32:
3137    case INDEX_op_ext8s_i64:
3138    case INDEX_op_ext8u_i32:
3139    case INDEX_op_ext8u_i64:
3140        return C_O1_I1(r, q);
3141
3142    case INDEX_op_ext16s_i32:
3143    case INDEX_op_ext16s_i64:
3144    case INDEX_op_ext16u_i32:
3145    case INDEX_op_ext16u_i64:
3146    case INDEX_op_ext32s_i64:
3147    case INDEX_op_ext32u_i64:
3148    case INDEX_op_ext_i32_i64:
3149    case INDEX_op_extu_i32_i64:
3150    case INDEX_op_extrl_i64_i32:
3151    case INDEX_op_extract_i32:
3152    case INDEX_op_extract_i64:
3153    case INDEX_op_sextract_i32:
3154    case INDEX_op_ctpop_i32:
3155    case INDEX_op_ctpop_i64:
3156        return C_O1_I1(r, r);
3157
3158    case INDEX_op_extract2_i32:
3159    case INDEX_op_extract2_i64:
3160        return C_O1_I2(r, 0, r);
3161
3162    case INDEX_op_deposit_i32:
3163    case INDEX_op_deposit_i64:
3164        return C_O1_I2(Q, 0, Q);
3165
3166    case INDEX_op_setcond_i32:
3167    case INDEX_op_setcond_i64:
3168        return C_O1_I2(q, r, re);
3169
3170    case INDEX_op_movcond_i32:
3171    case INDEX_op_movcond_i64:
3172        return C_O1_I4(r, r, re, r, 0);
3173
3174    case INDEX_op_div2_i32:
3175    case INDEX_op_div2_i64:
3176    case INDEX_op_divu2_i32:
3177    case INDEX_op_divu2_i64:
3178        return C_O2_I3(a, d, 0, 1, r);
3179
3180    case INDEX_op_mulu2_i32:
3181    case INDEX_op_mulu2_i64:
3182    case INDEX_op_muls2_i32:
3183    case INDEX_op_muls2_i64:
3184        return C_O2_I2(a, d, a, r);
3185
3186    case INDEX_op_add2_i32:
3187    case INDEX_op_add2_i64:
3188    case INDEX_op_sub2_i32:
3189    case INDEX_op_sub2_i64:
3190        return C_O2_I4(r, r, 0, 1, re, re);
3191
3192    case INDEX_op_ctz_i32:
3193    case INDEX_op_ctz_i64:
3194        return have_bmi1 ? C_N1_I2(r, r, rW) : C_N1_I2(r, r, r);
3195
3196    case INDEX_op_clz_i32:
3197    case INDEX_op_clz_i64:
3198        return have_lzcnt ? C_N1_I2(r, r, rW) : C_N1_I2(r, r, r);
3199
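    /*
     * The _a32/_a64 variants reflect the guest address width: on a 32-bit
     * host a 64-bit address or data value occupies two registers, hence
     * the extra operands.  'L' and 's' match 'r' and 'q' but exclude the
     * registers used for the softmmu helper call; qemu_st8 needs a
     * byte-addressable data register.
     */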
3200    case INDEX_op_qemu_ld_a32_i32:
3201        return C_O1_I1(r, L);
3202    case INDEX_op_qemu_ld_a64_i32:
3203        return TCG_TARGET_REG_BITS == 64 ? C_O1_I1(r, L) : C_O1_I2(r, L, L);
3204
3205    case INDEX_op_qemu_st_a32_i32:
3206        return C_O0_I2(L, L);
3207    case INDEX_op_qemu_st_a64_i32:
3208        return TCG_TARGET_REG_BITS == 64 ? C_O0_I2(L, L) : C_O0_I3(L, L, L);
3209    case INDEX_op_qemu_st8_a32_i32:
3210        return C_O0_I2(s, L);
3211    case INDEX_op_qemu_st8_a64_i32:
3212        return TCG_TARGET_REG_BITS == 64 ? C_O0_I2(s, L) : C_O0_I3(s, L, L);
3213
3214    case INDEX_op_qemu_ld_a32_i64:
3215        return TCG_TARGET_REG_BITS == 64 ? C_O1_I1(r, L) : C_O2_I1(r, r, L);
3216    case INDEX_op_qemu_ld_a64_i64:
3217        return TCG_TARGET_REG_BITS == 64 ? C_O1_I1(r, L) : C_O2_I2(r, r, L, L);
3218
3219    case INDEX_op_qemu_st_a32_i64:
3220        return TCG_TARGET_REG_BITS == 64 ? C_O0_I2(L, L) : C_O0_I3(L, L, L);
3221    case INDEX_op_qemu_st_a64_i64:
3222        return TCG_TARGET_REG_BITS == 64 ? C_O0_I2(L, L) : C_O0_I4(L, L, L, L);
3223
3224    case INDEX_op_brcond2_i32:
3225        return C_O0_I4(r, r, ri, ri);
3226
3227    case INDEX_op_setcond2_i32:
3228        return C_O1_I4(r, r, r, ri, ri);
3229
3230    case INDEX_op_ld_vec:
3231    case INDEX_op_dupm_vec:
3232        return C_O1_I1(x, r);
3233
3234    case INDEX_op_st_vec:
3235        return C_O0_I2(x, r);
3236
3237    case INDEX_op_add_vec:
3238    case INDEX_op_sub_vec:
3239    case INDEX_op_mul_vec:
3240    case INDEX_op_and_vec:
3241    case INDEX_op_or_vec:
3242    case INDEX_op_xor_vec:
3243    case INDEX_op_andc_vec:
3244    case INDEX_op_orc_vec:
3245    case INDEX_op_nand_vec:
3246    case INDEX_op_nor_vec:
3247    case INDEX_op_eqv_vec:
3248    case INDEX_op_ssadd_vec:
3249    case INDEX_op_usadd_vec:
3250    case INDEX_op_sssub_vec:
3251    case INDEX_op_ussub_vec:
3252    case INDEX_op_smin_vec:
3253    case INDEX_op_umin_vec:
3254    case INDEX_op_smax_vec:
3255    case INDEX_op_umax_vec:
3256    case INDEX_op_shlv_vec:
3257    case INDEX_op_shrv_vec:
3258    case INDEX_op_sarv_vec:
3259    case INDEX_op_rotlv_vec:
3260    case INDEX_op_rotrv_vec:
3261    case INDEX_op_shls_vec:
3262    case INDEX_op_shrs_vec:
3263    case INDEX_op_sars_vec:
3264    case INDEX_op_cmp_vec:
3265    case INDEX_op_x86_shufps_vec:
3266    case INDEX_op_x86_blend_vec:
3267    case INDEX_op_x86_packss_vec:
3268    case INDEX_op_x86_packus_vec:
3269    case INDEX_op_x86_vperm2i128_vec:
3270    case INDEX_op_x86_punpckl_vec:
3271    case INDEX_op_x86_punpckh_vec:
3272    case INDEX_op_x86_vpshldi_vec:
3273#if TCG_TARGET_REG_BITS == 32
3274    case INDEX_op_dup2_vec:
3275#endif
3276        return C_O1_I2(x, x, x);
3277
3278    case INDEX_op_abs_vec:
3279    case INDEX_op_dup_vec:
3280    case INDEX_op_not_vec:
3281    case INDEX_op_shli_vec:
3282    case INDEX_op_shri_vec:
3283    case INDEX_op_sari_vec:
3284    case INDEX_op_rotli_vec:
3285    case INDEX_op_x86_psrldq_vec:
3286        return C_O1_I1(x, x);
3287
3288    case INDEX_op_x86_vpshldv_vec:
3289    case INDEX_op_x86_vpshrdv_vec:
3290        return C_O1_I3(x, 0, x, x);
3291
3292    case INDEX_op_bitsel_vec:
3293    case INDEX_op_x86_vpblendvb_vec:
3294        return C_O1_I3(x, x, x, x);
3295
3296    default:
3297        g_assert_not_reached();
3298    }
3299}
3300
3301int tcg_can_emit_vec_op(TCGOpcode opc, TCGType type, unsigned vece)
3302{
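    /*
     * Return 1 if the opcode is supported directly for this type and
     * element size, 0 if it is not supported at all, and -1 if it can be
     * implemented by expansion via tcg_expand_vec_op() below.
     */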
3303    switch (opc) {
3304    case INDEX_op_add_vec:
3305    case INDEX_op_sub_vec:
3306    case INDEX_op_and_vec:
3307    case INDEX_op_or_vec:
3308    case INDEX_op_xor_vec:
3309    case INDEX_op_andc_vec:
3310    case INDEX_op_orc_vec:
3311    case INDEX_op_nand_vec:
3312    case INDEX_op_nor_vec:
3313    case INDEX_op_eqv_vec:
3314    case INDEX_op_not_vec:
3315    case INDEX_op_bitsel_vec:
3316        return 1;
3317    case INDEX_op_cmp_vec:
3318    case INDEX_op_cmpsel_vec:
3319        return -1;
3320
3321    case INDEX_op_rotli_vec:
3322        return have_avx512vl && vece >= MO_32 ? 1 : -1;
3323
3324    case INDEX_op_shli_vec:
3325    case INDEX_op_shri_vec:
3326        /* We must expand the operation for MO_8.  */
3327        return vece == MO_8 ? -1 : 1;
3328
3329    case INDEX_op_sari_vec:
3330        switch (vece) {
3331        case MO_8:
3332            return -1;
3333        case MO_16:
3334        case MO_32:
3335            return 1;
3336        case MO_64:
3337            if (have_avx512vl) {
3338                return 1;
3339            }
3340            /*
3341             * We can emulate this for MO_64, but it does not pay off
3342             * unless we're producing at least 4 values.
3343             */
3344            return type >= TCG_TYPE_V256 ? -1 : 0;
3345        }
3346        return 0;
3347
3348    case INDEX_op_shls_vec:
3349    case INDEX_op_shrs_vec:
3350        return vece >= MO_16;
3351    case INDEX_op_sars_vec:
3352        switch (vece) {
3353        case MO_16:
3354        case MO_32:
3355            return 1;
3356        case MO_64:
3357            return have_avx512vl;
3358        }
3359        return 0;
3360    case INDEX_op_rotls_vec:
3361        return vece >= MO_16 ? -1 : 0;
3362
3363    case INDEX_op_shlv_vec:
3364    case INDEX_op_shrv_vec:
3365        switch (vece) {
3366        case MO_16:
3367            return have_avx512bw;
3368        case MO_32:
3369        case MO_64:
3370            return have_avx2;
3371        }
3372        return 0;
3373    case INDEX_op_sarv_vec:
3374        switch (vece) {
3375        case MO_16:
3376            return have_avx512bw;
3377        case MO_32:
3378            return have_avx2;
3379        case MO_64:
3380            return have_avx512vl;
3381        }
3382        return 0;
3383    case INDEX_op_rotlv_vec:
3384    case INDEX_op_rotrv_vec:
3385        switch (vece) {
3386        case MO_16:
3387            return have_avx512vbmi2 ? -1 : 0;
3388        case MO_32:
3389        case MO_64:
3390            return have_avx512vl ? 1 : have_avx2 ? -1 : 0;
3391        }
3392        return 0;
3393
3394    case INDEX_op_mul_vec:
3395        switch (vece) {
3396        case MO_8:
3397            return -1;
3398        case MO_64:
3399            return have_avx512dq;
3400        }
3401        return 1;
3402
3403    case INDEX_op_ssadd_vec:
3404    case INDEX_op_usadd_vec:
3405    case INDEX_op_sssub_vec:
3406    case INDEX_op_ussub_vec:
3407        return vece <= MO_16;
3408    case INDEX_op_smin_vec:
3409    case INDEX_op_smax_vec:
3410    case INDEX_op_umin_vec:
3411    case INDEX_op_umax_vec:
3412    case INDEX_op_abs_vec:
3413        return vece <= MO_32 || have_avx512vl;
3414
3415    default:
3416        return 0;
3417    }
3418}
3419
3420static void expand_vec_shi(TCGType type, unsigned vece, TCGOpcode opc,
3421                           TCGv_vec v0, TCGv_vec v1, TCGArg imm)
3422{
3423    TCGv_vec t1, t2;
3424
3425    tcg_debug_assert(vece == MO_8);
3426
3427    t1 = tcg_temp_new_vec(type);
3428    t2 = tcg_temp_new_vec(type);
3429
3430    /*
3431     * Unpack to W, shift, and repack.  Tricky bits:
3432     * (1) Use punpck*bw x,x to produce DDCCBBAA,
3433     *     i.e. duplicate the byte into the other half of the 16-bit lane.
3434     * (2) For right-shift, add 8 so that the high half of the lane
3435     *     becomes zero.  For left-shift and left-rotate, we must
3436     *     shift up and down again.
3437     * (3) Step 2 leaves high half zero such that PACKUSWB
3438     *     (pack with unsigned saturation) does not modify
3439     *     the quantity.
3440     */
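    /*
     * Example: a logical right shift by 3.  Each 16-bit lane holds
     * (A << 8) | A, so shifting right by 3 + 8 = 11 leaves A >> 3 in the
     * low byte with a zero high byte, which PACKUSWB repacks unchanged.
     */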
3441    vec_gen_3(INDEX_op_x86_punpckl_vec, type, MO_8,
3442              tcgv_vec_arg(t1), tcgv_vec_arg(v1), tcgv_vec_arg(v1));
3443    vec_gen_3(INDEX_op_x86_punpckh_vec, type, MO_8,
3444              tcgv_vec_arg(t2), tcgv_vec_arg(v1), tcgv_vec_arg(v1));
3445
3446    if (opc != INDEX_op_rotli_vec) {
3447        imm += 8;
3448    }
3449    if (opc == INDEX_op_shri_vec) {
3450        tcg_gen_shri_vec(MO_16, t1, t1, imm);
3451        tcg_gen_shri_vec(MO_16, t2, t2, imm);
3452    } else {
3453        tcg_gen_shli_vec(MO_16, t1, t1, imm);
3454        tcg_gen_shli_vec(MO_16, t2, t2, imm);
3455        tcg_gen_shri_vec(MO_16, t1, t1, 8);
3456        tcg_gen_shri_vec(MO_16, t2, t2, 8);
3457    }
3458
3459    vec_gen_3(INDEX_op_x86_packus_vec, type, MO_8,
3460              tcgv_vec_arg(v0), tcgv_vec_arg(t1), tcgv_vec_arg(t2));
3461    tcg_temp_free_vec(t1);
3462    tcg_temp_free_vec(t2);
3463}
3464
3465static void expand_vec_sari(TCGType type, unsigned vece,
3466                            TCGv_vec v0, TCGv_vec v1, TCGArg imm)
3467{
3468    TCGv_vec t1, t2;
3469
3470    switch (vece) {
3471    case MO_8:
3472        /* Unpack to W, shift, and repack, as in expand_vec_shi.  */
3473        t1 = tcg_temp_new_vec(type);
3474        t2 = tcg_temp_new_vec(type);
3475        vec_gen_3(INDEX_op_x86_punpckl_vec, type, MO_8,
3476                  tcgv_vec_arg(t1), tcgv_vec_arg(v1), tcgv_vec_arg(v1));
3477        vec_gen_3(INDEX_op_x86_punpckh_vec, type, MO_8,
3478                  tcgv_vec_arg(t2), tcgv_vec_arg(v1), tcgv_vec_arg(v1));
3479        tcg_gen_sari_vec(MO_16, t1, t1, imm + 8);
3480        tcg_gen_sari_vec(MO_16, t2, t2, imm + 8);
3481        vec_gen_3(INDEX_op_x86_packss_vec, type, MO_8,
3482                  tcgv_vec_arg(v0), tcgv_vec_arg(t1), tcgv_vec_arg(t2));
3483        tcg_temp_free_vec(t1);
3484        tcg_temp_free_vec(t2);
3485        break;
3486
3487    case MO_64:
3488        t1 = tcg_temp_new_vec(type);
3489        if (imm <= 32) {
3490            /*
3491             * We can emulate a small sign extend by performing an arithmetic
3492             * 32-bit shift and overwriting the high half of a 64-bit logical
3493             * shift.  Note that the ISA says shift of 32 is valid, but TCG
3494             * does not, so we have to bound the smaller shift -- we get the
3495             * same result in the high half either way.
3496             */
3497            tcg_gen_sari_vec(MO_32, t1, v1, MIN(imm, 31));
3498            tcg_gen_shri_vec(MO_64, v0, v1, imm);
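            /*
             * Blend immediate 0xaa (binary 10101010) selects the odd
             * 32-bit elements, i.e. the sign-filled high half of each
             * 64-bit lane, from t1; the even elements keep the logical
             * shift result already in v0.
             */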
3499            vec_gen_4(INDEX_op_x86_blend_vec, type, MO_32,
3500                      tcgv_vec_arg(v0), tcgv_vec_arg(v0),
3501                      tcgv_vec_arg(t1), 0xaa);
3502        } else {
3503            /* Otherwise we will need to use a compare vs 0 to produce
3504             * the sign-extend, shift and merge.
3505             */
3506            tcg_gen_cmp_vec(TCG_COND_GT, MO_64, t1,
3507                            tcg_constant_vec(type, MO_64, 0), v1);
3508            tcg_gen_shri_vec(MO_64, v0, v1, imm);
3509            tcg_gen_shli_vec(MO_64, t1, t1, 64 - imm);
3510            tcg_gen_or_vec(MO_64, v0, v0, t1);
3511        }
3512        tcg_temp_free_vec(t1);
3513        break;
3514
3515    default:
3516        g_assert_not_reached();
3517    }
3518}
3519
3520static void expand_vec_rotli(TCGType type, unsigned vece,
3521                             TCGv_vec v0, TCGv_vec v1, TCGArg imm)
3522{
3523    TCGv_vec t;
3524
3525    if (vece == MO_8) {
3526        expand_vec_shi(type, vece, INDEX_op_rotli_vec, v0, v1, imm);
3527        return;
3528    }
3529
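    /*
     * VPSHLD concatenates the two sources and shifts the double-width
     * value left, keeping the high half; with both sources equal to v1
     * this is exactly a left rotate by imm.
     */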
3530    if (have_avx512vbmi2) {
3531        vec_gen_4(INDEX_op_x86_vpshldi_vec, type, vece,
3532                  tcgv_vec_arg(v0), tcgv_vec_arg(v1), tcgv_vec_arg(v1), imm);
3533        return;
3534    }
3535
3536    t = tcg_temp_new_vec(type);
3537    tcg_gen_shli_vec(vece, t, v1, imm);
3538    tcg_gen_shri_vec(vece, v0, v1, (8 << vece) - imm);
3539    tcg_gen_or_vec(vece, v0, v0, t);
3540    tcg_temp_free_vec(t);
3541}
3542
3543static void expand_vec_rotv(TCGType type, unsigned vece, TCGv_vec v0,
3544                            TCGv_vec v1, TCGv_vec sh, bool right)
3545{
3546    TCGv_vec t;
3547
3548    if (have_avx512vbmi2) {
3549        vec_gen_4(right ? INDEX_op_x86_vpshrdv_vec : INDEX_op_x86_vpshldv_vec,
3550                  type, vece, tcgv_vec_arg(v0), tcgv_vec_arg(v1),
3551                  tcgv_vec_arg(v1), tcgv_vec_arg(sh));
3552        return;
3553    }
3554
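    /*
     * Rotate as a pair of variable shifts: one by sh and one by
     * (element width) - sh in the opposite direction.  The x86 variable
     * shifts yield zero for counts >= the element width, so the sh == 0
     * case (counter-shift equal to the width) still gives the right
     * result.
     */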
3555    t = tcg_temp_new_vec(type);
3556    tcg_gen_dupi_vec(vece, t, 8 << vece);
3557    tcg_gen_sub_vec(vece, t, t, sh);
3558    if (right) {
3559        tcg_gen_shlv_vec(vece, t, v1, t);
3560        tcg_gen_shrv_vec(vece, v0, v1, sh);
3561    } else {
3562        tcg_gen_shrv_vec(vece, t, v1, t);
3563        tcg_gen_shlv_vec(vece, v0, v1, sh);
3564    }
3565    tcg_gen_or_vec(vece, v0, v0, t);
3566    tcg_temp_free_vec(t);
3567}
3568
3569static void expand_vec_rotls(TCGType type, unsigned vece,
3570                             TCGv_vec v0, TCGv_vec v1, TCGv_i32 lsh)
3571{
3572    TCGv_vec t = tcg_temp_new_vec(type);
3573
3574    tcg_debug_assert(vece != MO_8);
3575
3576    if (vece >= MO_32 ? have_avx512vl : have_avx512vbmi2) {
3577        tcg_gen_dup_i32_vec(vece, t, lsh);
3578        if (vece >= MO_32) {
3579            tcg_gen_rotlv_vec(vece, v0, v1, t);
3580        } else {
3581            expand_vec_rotv(type, vece, v0, v1, t, false);
3582        }
3583    } else {
3584        TCGv_i32 rsh = tcg_temp_new_i32();
3585
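        /*
         * Negate and mask the count to get (width - lsh) mod width, then
         * combine a left shift by lsh with a right shift by the remainder;
         * lsh == 0 degenerates to both shifts copying v1.
         */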
3586        tcg_gen_neg_i32(rsh, lsh);
3587        tcg_gen_andi_i32(rsh, rsh, (8 << vece) - 1);
3588        tcg_gen_shls_vec(vece, t, v1, lsh);
3589        tcg_gen_shrs_vec(vece, v0, v1, rsh);
3590        tcg_gen_or_vec(vece, v0, v0, t);
3591
3592        tcg_temp_free_i32(rsh);
3593    }
3594
3595    tcg_temp_free_vec(t);
3596}
3597
3598static void expand_vec_mul(TCGType type, unsigned vece,
3599                           TCGv_vec v0, TCGv_vec v1, TCGv_vec v2)
3600{
3601    TCGv_vec t1, t2, t3, t4, zero;
3602
3603    tcg_debug_assert(vece == MO_8);
3604
3605    /*
3606     * Unpack v1 bytes to words, 0 | x.
3607     * Unpack v2 bytes to words, y | 0.
3608     * This leaves the 8-bit result, x * y, with 8 bits of right padding.
3609     * Shift logical right by 8 bits to clear the high 8 bits before
3610     * using an unsigned saturated pack.
3611     *
3612     * The difference between the V64, V128 and V256 cases is merely how
3613     * we distribute the expansion between temporaries.
3614     */
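    /*
     * For example, with bytes x and y: the low-half unpacks give 16-bit
     * lanes holding x and y << 8; their product is (x * y) << 8 modulo
     * 2^16, and the logical right shift by 8 leaves (x * y) & 0xff with
     * a zero high byte, ready for PACKUSWB.
     */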
3615    switch (type) {
3616    case TCG_TYPE_V64:
3617        t1 = tcg_temp_new_vec(TCG_TYPE_V128);
3618        t2 = tcg_temp_new_vec(TCG_TYPE_V128);
3619        zero = tcg_constant_vec(TCG_TYPE_V128, MO_8, 0);
3620        vec_gen_3(INDEX_op_x86_punpckl_vec, TCG_TYPE_V128, MO_8,
3621                  tcgv_vec_arg(t1), tcgv_vec_arg(v1), tcgv_vec_arg(zero));
3622        vec_gen_3(INDEX_op_x86_punpckl_vec, TCG_TYPE_V128, MO_8,
3623                  tcgv_vec_arg(t2), tcgv_vec_arg(zero), tcgv_vec_arg(v2));
3624        tcg_gen_mul_vec(MO_16, t1, t1, t2);
3625        tcg_gen_shri_vec(MO_16, t1, t1, 8);
3626        vec_gen_3(INDEX_op_x86_packus_vec, TCG_TYPE_V128, MO_8,
3627                  tcgv_vec_arg(v0), tcgv_vec_arg(t1), tcgv_vec_arg(t1));
3628        tcg_temp_free_vec(t1);
3629        tcg_temp_free_vec(t2);
3630        break;
3631
3632    case TCG_TYPE_V128:
3633    case TCG_TYPE_V256:
3634        t1 = tcg_temp_new_vec(type);
3635        t2 = tcg_temp_new_vec(type);
3636        t3 = tcg_temp_new_vec(type);
3637        t4 = tcg_temp_new_vec(type);
3638        zero = tcg_constant_vec(TCG_TYPE_V128, MO_8, 0);
3639        vec_gen_3(INDEX_op_x86_punpckl_vec, type, MO_8,
3640                  tcgv_vec_arg(t1), tcgv_vec_arg(v1), tcgv_vec_arg(zero));
3641        vec_gen_3(INDEX_op_x86_punpckl_vec, type, MO_8,
3642                  tcgv_vec_arg(t2), tcgv_vec_arg(zero), tcgv_vec_arg(v2));
3643        vec_gen_3(INDEX_op_x86_punpckh_vec, type, MO_8,
3644                  tcgv_vec_arg(t3), tcgv_vec_arg(v1), tcgv_vec_arg(zero));
3645        vec_gen_3(INDEX_op_x86_punpckh_vec, type, MO_8,
3646                  tcgv_vec_arg(t4), tcgv_vec_arg(zero), tcgv_vec_arg(v2));
3647        tcg_gen_mul_vec(MO_16, t1, t1, t2);
3648        tcg_gen_mul_vec(MO_16, t3, t3, t4);
3649        tcg_gen_shri_vec(MO_16, t1, t1, 8);
3650        tcg_gen_shri_vec(MO_16, t3, t3, 8);
3651        vec_gen_3(INDEX_op_x86_packus_vec, type, MO_8,
3652                  tcgv_vec_arg(v0), tcgv_vec_arg(t1), tcgv_vec_arg(t3));
3653        tcg_temp_free_vec(t1);
3654        tcg_temp_free_vec(t2);
3655        tcg_temp_free_vec(t3);
3656        tcg_temp_free_vec(t4);
3657        break;
3658
3659    default:
3660        g_assert_not_reached();
3661    }
3662}
3663
3664static bool expand_vec_cmp_noinv(TCGType type, unsigned vece, TCGv_vec v0,
3665                                 TCGv_vec v1, TCGv_vec v2, TCGCond cond)
3666{
3667    enum {
3668        NEED_INV  = 1,
3669        NEED_SWAP = 2,
3670        NEED_BIAS = 4,
3671        NEED_UMIN = 8,
3672        NEED_UMAX = 16,
3673    };
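    /*
     * SSE/AVX integer compares provide only equality (PCMPEQ) and signed
     * greater-than (PCMPGT).  Every other condition is reduced to those:
     * invert the result (NE, LE), swap the operands (LT, GE), rewrite an
     * unsigned compare via umin/umax (e.g. x <=u y iff umin(x, y) == x),
     * or bias both operands by the signed minimum so that an unsigned
     * compare becomes a signed one.
     */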
3674    TCGv_vec t1, t2, t3;
3675    uint8_t fixup;
3676
3677    switch (cond) {
3678    case TCG_COND_EQ:
3679    case TCG_COND_GT:
3680        fixup = 0;
3681        break;
3682    case TCG_COND_NE:
3683    case TCG_COND_LE:
3684        fixup = NEED_INV;
3685        break;
3686    case TCG_COND_LT:
3687        fixup = NEED_SWAP;
3688        break;
3689    case TCG_COND_GE:
3690        fixup = NEED_SWAP | NEED_INV;
3691        break;
3692    case TCG_COND_LEU:
3693        if (tcg_can_emit_vec_op(INDEX_op_umin_vec, type, vece)) {
3694            fixup = NEED_UMIN;
3695        } else {
3696            fixup = NEED_BIAS | NEED_INV;
3697        }
3698        break;
3699    case TCG_COND_GTU:
3700        if (tcg_can_emit_vec_op(INDEX_op_umin_vec, type, vece)) {
3701            fixup = NEED_UMIN | NEED_INV;
3702        } else {
3703            fixup = NEED_BIAS;
3704        }
3705        break;
3706    case TCG_COND_GEU:
3707        if (tcg_can_emit_vec_op(INDEX_op_umax_vec, type, vece)) {
3708            fixup = NEED_UMAX;
3709        } else {
3710            fixup = NEED_BIAS | NEED_SWAP | NEED_INV;
3711        }
3712        break;
3713    case TCG_COND_LTU:
3714        if (tcg_can_emit_vec_op(INDEX_op_umax_vec, type, vece)) {
3715            fixup = NEED_UMAX | NEED_INV;
3716        } else {
3717            fixup = NEED_BIAS | NEED_SWAP;
3718        }
3719        break;
3720    default:
3721        g_assert_not_reached();
3722    }
3723
3724    if (fixup & NEED_INV) {
3725        cond = tcg_invert_cond(cond);
3726    }
3727    if (fixup & NEED_SWAP) {
3728        t1 = v1, v1 = v2, v2 = t1;
3729        cond = tcg_swap_cond(cond);
3730    }
3731
3732    t1 = t2 = NULL;
3733    if (fixup & (NEED_UMIN | NEED_UMAX)) {
3734        t1 = tcg_temp_new_vec(type);
3735        if (fixup & NEED_UMIN) {
3736            tcg_gen_umin_vec(vece, t1, v1, v2);
3737        } else {
3738            tcg_gen_umax_vec(vece, t1, v1, v2);
3739        }
3740        v2 = t1;
3741        cond = TCG_COND_EQ;
3742    } else if (fixup & NEED_BIAS) {
3743        t1 = tcg_temp_new_vec(type);
3744        t2 = tcg_temp_new_vec(type);
3745        t3 = tcg_constant_vec(type, vece, 1ull << ((8 << vece) - 1));
3746        tcg_gen_sub_vec(vece, t1, v1, t3);
3747        tcg_gen_sub_vec(vece, t2, v2, t3);
3748        v1 = t1;
3749        v2 = t2;
3750        cond = tcg_signed_cond(cond);
3751    }
3752
3753    tcg_debug_assert(cond == TCG_COND_EQ || cond == TCG_COND_GT);
3754    /* Expand directly; do not recurse.  */
3755    vec_gen_4(INDEX_op_cmp_vec, type, vece,
3756              tcgv_vec_arg(v0), tcgv_vec_arg(v1), tcgv_vec_arg(v2), cond);
3757
3758    if (t1) {
3759        tcg_temp_free_vec(t1);
3760        if (t2) {
3761            tcg_temp_free_vec(t2);
3762        }
3763    }
3764    return fixup & NEED_INV;
3765}
3766
3767static void expand_vec_cmp(TCGType type, unsigned vece, TCGv_vec v0,
3768                           TCGv_vec v1, TCGv_vec v2, TCGCond cond)
3769{
3770    if (expand_vec_cmp_noinv(type, vece, v0, v1, v2, cond)) {
3771        tcg_gen_not_vec(vece, v0, v0);
3772    }
3773}
3774
3775static void expand_vec_cmpsel(TCGType type, unsigned vece, TCGv_vec v0,
3776                              TCGv_vec c1, TCGv_vec c2,
3777                              TCGv_vec v3, TCGv_vec v4, TCGCond cond)
3778{
3779    TCGv_vec t = tcg_temp_new_vec(type);
3780
3781    if (expand_vec_cmp_noinv(type, vece, t, c1, c2, cond)) {
3782        /* Invert the sense of the compare by swapping arguments.  */
3783        TCGv_vec x;
3784        x = v3, v3 = v4, v4 = x;
3785    }
3786    vec_gen_4(INDEX_op_x86_vpblendvb_vec, type, vece,
3787              tcgv_vec_arg(v0), tcgv_vec_arg(v4),
3788              tcgv_vec_arg(v3), tcgv_vec_arg(t));
3789    tcg_temp_free_vec(t);
3790}
3791
3792void tcg_expand_vec_op(TCGOpcode opc, TCGType type, unsigned vece,
3793                       TCGArg a0, ...)
3794{
3795    va_list va;
3796    TCGArg a2;
3797    TCGv_vec v0, v1, v2, v3, v4;
3798
3799    va_start(va, a0);
3800    v0 = temp_tcgv_vec(arg_temp(a0));
3801    v1 = temp_tcgv_vec(arg_temp(va_arg(va, TCGArg)));
3802    a2 = va_arg(va, TCGArg);
3803
3804    switch (opc) {
3805    case INDEX_op_shli_vec:
3806    case INDEX_op_shri_vec:
3807        expand_vec_shi(type, vece, opc, v0, v1, a2);
3808        break;
3809
3810    case INDEX_op_sari_vec:
3811        expand_vec_sari(type, vece, v0, v1, a2);
3812        break;
3813
3814    case INDEX_op_rotli_vec:
3815        expand_vec_rotli(type, vece, v0, v1, a2);
3816        break;
3817
3818    case INDEX_op_rotls_vec:
3819        expand_vec_rotls(type, vece, v0, v1, temp_tcgv_i32(arg_temp(a2)));
3820        break;
3821
3822    case INDEX_op_rotlv_vec:
3823        v2 = temp_tcgv_vec(arg_temp(a2));
3824        expand_vec_rotv(type, vece, v0, v1, v2, false);
3825        break;
3826    case INDEX_op_rotrv_vec:
3827        v2 = temp_tcgv_vec(arg_temp(a2));
3828        expand_vec_rotv(type, vece, v0, v1, v2, true);
3829        break;
3830
3831    case INDEX_op_mul_vec:
3832        v2 = temp_tcgv_vec(arg_temp(a2));
3833        expand_vec_mul(type, vece, v0, v1, v2);
3834        break;
3835
3836    case INDEX_op_cmp_vec:
3837        v2 = temp_tcgv_vec(arg_temp(a2));
3838        expand_vec_cmp(type, vece, v0, v1, v2, va_arg(va, TCGArg));
3839        break;
3840
3841    case INDEX_op_cmpsel_vec:
3842        v2 = temp_tcgv_vec(arg_temp(a2));
3843        v3 = temp_tcgv_vec(arg_temp(va_arg(va, TCGArg)));
3844        v4 = temp_tcgv_vec(arg_temp(va_arg(va, TCGArg)));
3845        expand_vec_cmpsel(type, vece, v0, v1, v2, v3, v4, va_arg(va, TCGArg));
3846        break;
3847
3848    default:
3849        break;
3850    }
3851
3852    va_end(va);
3853}
3854
3855static const int tcg_target_callee_save_regs[] = {
3856#if TCG_TARGET_REG_BITS == 64
3857    TCG_REG_RBP,
3858    TCG_REG_RBX,
3859#if defined(_WIN64)
3860    TCG_REG_RDI,
3861    TCG_REG_RSI,
3862#endif
3863    TCG_REG_R12,
3864    TCG_REG_R13,
3865    TCG_REG_R14, /* Currently used for the global env. */
3866    TCG_REG_R15,
3867#else
3868    TCG_REG_EBP, /* Currently used for the global env. */
3869    TCG_REG_EBX,
3870    TCG_REG_ESI,
3871    TCG_REG_EDI,
3872#endif
3873};
3874
3875/* Compute frame size via macros, to share between tcg_target_qemu_prologue
3876   and tcg_register_jit.  */
3877
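/* The "1 +" in PUSH_SIZE accounts for the return address of the call into
   the prologue, which sits on the stack above the pushed callee-saved
   registers.  */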
3878#define PUSH_SIZE \
3879    ((1 + ARRAY_SIZE(tcg_target_callee_save_regs)) \
3880     * (TCG_TARGET_REG_BITS / 8))
3881
3882#define FRAME_SIZE \
3883    ((PUSH_SIZE \
3884      + TCG_STATIC_CALL_ARGS_SIZE \
3885      + CPU_TEMP_BUF_NLONGS * sizeof(long) \
3886      + TCG_TARGET_STACK_ALIGN - 1) \
3887     & ~(TCG_TARGET_STACK_ALIGN - 1))
3888
3889/* Generate global QEMU prologue and epilogue code */
3890static void tcg_target_qemu_prologue(TCGContext *s)
3891{
3892    int i, stack_addend;
3893
3894    /* TB prologue */
3895
3896    /* Reserve some stack space, also for TCG temps.  */
3897    stack_addend = FRAME_SIZE - PUSH_SIZE;
3898    tcg_set_frame(s, TCG_REG_CALL_STACK, TCG_STATIC_CALL_ARGS_SIZE,
3899                  CPU_TEMP_BUF_NLONGS * sizeof(long));
3900
3901    /* Save all callee saved registers.  */
3902    for (i = 0; i < ARRAY_SIZE(tcg_target_callee_save_regs); i++) {
3903        tcg_out_push(s, tcg_target_callee_save_regs[i]);
3904    }
3905
3906#if TCG_TARGET_REG_BITS == 32
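    /*
     * In the 32-bit calling convention both arguments are on the stack:
     * the env pointer is the first argument, found just above the return
     * address and the registers pushed above; the tb pointer is the
     * second, addressed below once the frame has been allocated.
     */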
3907    tcg_out_ld(s, TCG_TYPE_PTR, TCG_AREG0, TCG_REG_ESP,
3908               (ARRAY_SIZE(tcg_target_callee_save_regs) + 1) * 4);
3909    tcg_out_addi(s, TCG_REG_ESP, -stack_addend);
3910    /* jmp *tb.  */
3911    tcg_out_modrm_offset(s, OPC_GRP5, EXT5_JMPN_Ev, TCG_REG_ESP,
3912                         (ARRAY_SIZE(tcg_target_callee_save_regs) + 2) * 4
3913                         + stack_addend);
3914#else
3915# if !defined(CONFIG_SOFTMMU)
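    /*
     * Prefer folding the guest base into a segment override, then into a
     * signed 32-bit displacement; only dedicate a register (R12) when
     * neither encoding applies.
     */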
3916    if (guest_base) {
3917        int seg = setup_guest_base_seg();
3918        if (seg != 0) {
3919            x86_guest_base.seg = seg;
3920        } else if (guest_base == (int32_t)guest_base) {
3921            x86_guest_base.ofs = guest_base;
3922        } else {
3923            /* Choose R12 because, as a base, it requires a SIB byte. */
3924            x86_guest_base.index = TCG_REG_R12;
3925            tcg_out_movi(s, TCG_TYPE_PTR, x86_guest_base.index, guest_base);
3926            tcg_regset_set_reg(s->reserved_regs, x86_guest_base.index);
3927        }
3928    }
3929# endif
3930    tcg_out_mov(s, TCG_TYPE_PTR, TCG_AREG0, tcg_target_call_iarg_regs[0]);
3931    tcg_out_addi(s, TCG_REG_ESP, -stack_addend);
3932    /* jmp *tb.  */
3933    tcg_out_modrm(s, OPC_GRP5, EXT5_JMPN_Ev, tcg_target_call_iarg_regs[1]);
3934#endif
3935
3936    /*
3937     * Return path for goto_ptr. Set return value to 0, a-la exit_tb,
3938     * and fall through to the rest of the epilogue.
3939     */
3940    tcg_code_gen_epilogue = tcg_splitwx_to_rx(s->code_ptr);
3941    tcg_out_movi(s, TCG_TYPE_REG, TCG_REG_EAX, 0);
3942
3943    /* TB epilogue */
3944    tb_ret_addr = tcg_splitwx_to_rx(s->code_ptr);
3945
3946    tcg_out_addi(s, TCG_REG_CALL_STACK, stack_addend);
3947
3948    if (have_avx2) {
3949        tcg_out_vex_opc(s, OPC_VZEROUPPER, 0, 0, 0, 0);
3950    }
3951    for (i = ARRAY_SIZE(tcg_target_callee_save_regs) - 1; i >= 0; i--) {
3952        tcg_out_pop(s, tcg_target_callee_save_regs[i]);
3953    }
3954    tcg_out_opc(s, OPC_RET, 0, 0, 0);
3955}
3956
3957static void tcg_out_nop_fill(tcg_insn_unit *p, int count)
3958{
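    /* 0x90 is the one-byte x86 NOP instruction.  */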
3959    memset(p, 0x90, count);
3960}
3961
3962static void tcg_target_init(TCGContext *s)
3963{
3964#ifdef CONFIG_CPUID_H
3965    unsigned a, b, c, d, b7 = 0, c7 = 0;
3966    unsigned max = __get_cpuid_max(0, 0);
3967
3968    if (max >= 7) {
3969        /* BMI1 is available on AMD Piledriver and Intel Haswell CPUs.  */
3970        __cpuid_count(7, 0, a, b7, c7, d);
3971        have_bmi1 = (b7 & bit_BMI) != 0;
3972        have_bmi2 = (b7 & bit_BMI2) != 0;
3973    }
3974
3975    if (max >= 1) {
3976        __cpuid(1, a, b, c, d);
3977#ifndef have_cmov
3978        /* For 32-bit, 99% certainty that we're running on hardware that
3979           supports cmov, but we still need to check.  In case cmov is not
3980           available, we'll use a small forward branch.  */
3981        have_cmov = (d & bit_CMOV) != 0;
3982#endif
3983
3984        /* MOVBE is only available on Intel Atom and Haswell CPUs, so we
3985           need to probe for it.  */
3986        have_movbe = (c & bit_MOVBE) != 0;
3987        have_popcnt = (c & bit_POPCNT) != 0;
3988
3989        /* There are a number of things we must check before we can be
3990           sure of not hitting invalid opcode.  */
3991        if (c & bit_OSXSAVE) {
3992            unsigned bv = xgetbv_low(0);
3993
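            /* XCR0 bits 1 (SSE) and 2 (AVX): the OS saves and restores
               both XMM and YMM state.  */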
3994            if ((bv & 6) == 6) {
3995                have_avx1 = (c & bit_AVX) != 0;
3996                have_avx2 = (b7 & bit_AVX2) != 0;
3997
3998                /*
3999                 * There are interesting instructions in AVX512, so long
4000                 * as we have AVX512VL, which indicates support for EVEX
4001                 * on sizes smaller than 512 bits.  We are required to
4002                 * check that OPMASK and all extended ZMM state are enabled
4003                 * even if we're not using them -- the insns will fault.
4004                 */
4005                if ((bv & 0xe0) == 0xe0
4006                    && (b7 & bit_AVX512F)
4007                    && (b7 & bit_AVX512VL)) {
4008                    have_avx512vl = true;
4009                    have_avx512bw = (b7 & bit_AVX512BW) != 0;
4010                    have_avx512dq = (b7 & bit_AVX512DQ) != 0;
4011                    have_avx512vbmi2 = (c7 & bit_AVX512VBMI2) != 0;
4012                }
4013
4014                /*
4015                 * The Intel SDM has added:
4016                 *   Processors that enumerate support for Intel® AVX
4017                 *   (by setting the feature flag CPUID.01H:ECX.AVX[bit 28])
4018                 *   guarantee that the 16-byte memory operations performed
4019                 *   by the following instructions will always be carried
4020                 *   out atomically:
4021                 *   - MOVAPD, MOVAPS, and MOVDQA.
4022                 *   - VMOVAPD, VMOVAPS, and VMOVDQA when encoded with VEX.128.
4023                 *   - VMOVAPD, VMOVAPS, VMOVDQA32, and VMOVDQA64 when encoded
4024                 *     with EVEX.128 and k0 (masking disabled).
4025                 * Note that these instructions require the linear addresses
4026                 * of their memory operands to be 16-byte aligned.
4027                 *
4028                 * AMD has provided an even stronger guarantee that processors
4029                 * with AVX provide 16-byte atomicity for all cachable,
4030                 * naturally aligned single loads and stores, e.g. MOVDQU.
4031                 *
4032                 * See https://gcc.gnu.org/bugzilla/show_bug.cgi?id=104688
4033                 */
4034                if (have_avx1) {
4035                    __cpuid(0, a, b, c, d);
4036                    have_atomic16 = (c == signature_INTEL_ecx ||
4037                                     c == signature_AMD_ecx);
4038                }
4039            }
4040        }
4041    }
4042
4043    max = __get_cpuid_max(0x80000000, 0);
4044    if (max >= 0x80000001) {
4045        __cpuid(0x80000001, a, b, c, d);
4046        /* LZCNT was introduced with AMD Barcelona and Intel Haswell CPUs.  */
4047        have_lzcnt = (c & bit_LZCNT) != 0;
4048    }
4049#endif /* CONFIG_CPUID_H */
4050
4051    tcg_target_available_regs[TCG_TYPE_I32] = ALL_GENERAL_REGS;
4052    if (TCG_TARGET_REG_BITS == 64) {
4053        tcg_target_available_regs[TCG_TYPE_I64] = ALL_GENERAL_REGS;
4054    }
4055    if (have_avx1) {
4056        tcg_target_available_regs[TCG_TYPE_V64] = ALL_VECTOR_REGS;
4057        tcg_target_available_regs[TCG_TYPE_V128] = ALL_VECTOR_REGS;
4058    }
4059    if (have_avx2) {
4060        tcg_target_available_regs[TCG_TYPE_V256] = ALL_VECTOR_REGS;
4061    }
4062
4063    tcg_target_call_clobber_regs = ALL_VECTOR_REGS;
4064    tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_EAX);
4065    tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_EDX);
4066    tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_ECX);
4067    if (TCG_TARGET_REG_BITS == 64) {
4068#if !defined(_WIN64)
4069        tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_RDI);
4070        tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_RSI);
4071#endif
4072        tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_R8);
4073        tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_R9);
4074        tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_R10);
4075        tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_R11);
4076    }
4077
4078    s->reserved_regs = 0;
4079    tcg_regset_set_reg(s->reserved_regs, TCG_REG_CALL_STACK);
4080#ifdef _WIN64
4081    /* These are call saved, and we don't save them, so don't use them. */
4082    tcg_regset_set_reg(s->reserved_regs, TCG_REG_XMM6);
4083    tcg_regset_set_reg(s->reserved_regs, TCG_REG_XMM7);
4084    tcg_regset_set_reg(s->reserved_regs, TCG_REG_XMM8);
4085    tcg_regset_set_reg(s->reserved_regs, TCG_REG_XMM9);
4086    tcg_regset_set_reg(s->reserved_regs, TCG_REG_XMM10);
4087    tcg_regset_set_reg(s->reserved_regs, TCG_REG_XMM11);
4088    tcg_regset_set_reg(s->reserved_regs, TCG_REG_XMM12);
4089    tcg_regset_set_reg(s->reserved_regs, TCG_REG_XMM13);
4090    tcg_regset_set_reg(s->reserved_regs, TCG_REG_XMM14);
4091    tcg_regset_set_reg(s->reserved_regs, TCG_REG_XMM15);
4092#endif
4093}
4094
4095typedef struct {
4096    DebugFrameHeader h;
4097    uint8_t fde_def_cfa[4];
4098    uint8_t fde_reg_ofs[14];
4099} DebugFrame;
4100
4101/* We're expecting a 2 byte uleb128 encoded value.  */
4102QEMU_BUILD_BUG_ON(FRAME_SIZE >= (1 << 14));
4103
4104#if !defined(__ELF__)
4105    /* Host machine without ELF. */
4106#elif TCG_TARGET_REG_BITS == 64
4107#define ELF_HOST_MACHINE EM_X86_64
4108static const DebugFrame debug_frame = {
4109    .h.cie.len = sizeof(DebugFrameCIE)-4, /* length after .len member */
4110    .h.cie.id = -1,
4111    .h.cie.version = 1,
4112    .h.cie.code_align = 1,
4113    .h.cie.data_align = 0x78,             /* sleb128 -8 */
4114    .h.cie.return_column = 16,
4115
4116    /* Total FDE size does not include the "len" member.  */
4117    .h.fde.len = sizeof(DebugFrame) - offsetof(DebugFrame, h.fde.cie_offset),
4118
4119    .fde_def_cfa = {
4120        12, 7,                          /* DW_CFA_def_cfa %rsp, ... */
4121        (FRAME_SIZE & 0x7f) | 0x80,     /* ... uleb128 FRAME_SIZE */
4122        (FRAME_SIZE >> 7)
4123    },
4124    .fde_reg_ofs = {
4125        0x90, 1,                        /* DW_CFA_offset, %rip, -8 */
4126        /* The following ordering must match tcg_target_callee_save_regs.  */
4127        0x86, 2,                        /* DW_CFA_offset, %rbp, -16 */
4128        0x83, 3,                        /* DW_CFA_offset, %rbx, -24 */
4129        0x8c, 4,                        /* DW_CFA_offset, %r12, -32 */
4130        0x8d, 5,                        /* DW_CFA_offset, %r13, -40 */
4131        0x8e, 6,                        /* DW_CFA_offset, %r14, -48 */
4132        0x8f, 7,                        /* DW_CFA_offset, %r15, -56 */
4133    }
4134};
4135#else
4136#define ELF_HOST_MACHINE EM_386
4137static const DebugFrame debug_frame = {
4138    .h.cie.len = sizeof(DebugFrameCIE)-4, /* length after .len member */
4139    .h.cie.id = -1,
4140    .h.cie.version = 1,
4141    .h.cie.code_align = 1,
4142    .h.cie.data_align = 0x7c,             /* sleb128 -4 */
4143    .h.cie.return_column = 8,
4144
4145    /* Total FDE size does not include the "len" member.  */
4146    .h.fde.len = sizeof(DebugFrame) - offsetof(DebugFrame, h.fde.cie_offset),
4147
4148    .fde_def_cfa = {
4149        12, 4,                          /* DW_CFA_def_cfa %esp, ... */
4150        (FRAME_SIZE & 0x7f) | 0x80,     /* ... uleb128 FRAME_SIZE */
4151        (FRAME_SIZE >> 7)
4152    },
4153    .fde_reg_ofs = {
4154        0x88, 1,                        /* DW_CFA_offset, %eip, -4 */
4155        /* The following ordering must match tcg_target_callee_save_regs.  */
4156        0x85, 2,                        /* DW_CFA_offset, %ebp, -8 */
4157        0x83, 3,                        /* DW_CFA_offset, %ebx, -12 */
4158        0x86, 4,                        /* DW_CFA_offset, %esi, -16 */
4159        0x87, 5,                        /* DW_CFA_offset, %edi, -20 */
4160    }
4161};
4162#endif
4163
4164#if defined(ELF_HOST_MACHINE)
4165void tcg_register_jit(const void *buf, size_t buf_size)
4166{
4167    tcg_register_jit_int(buf, buf_size, &debug_frame, sizeof(debug_frame));
4168}
4169#endif
4170