xref: /qemu/tcg/i386/tcg-target.c.inc (revision 372b69f5)
1/*
2 * Tiny Code Generator for QEMU
3 *
4 * Copyright (c) 2008 Fabrice Bellard
5 *
6 * Permission is hereby granted, free of charge, to any person obtaining a copy
7 * of this software and associated documentation files (the "Software"), to deal
8 * in the Software without restriction, including without limitation the rights
9 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
10 * copies of the Software, and to permit persons to whom the Software is
11 * furnished to do so, subject to the following conditions:
12 *
13 * The above copyright notice and this permission notice shall be included in
14 * all copies or substantial portions of the Software.
15 *
16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
19 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
22 * THE SOFTWARE.
23 */
24
25#include "../tcg-ldst.c.inc"
26#include "../tcg-pool.c.inc"
27
/*
 * Printable names for the host registers, compiled only when
 * CONFIG_DEBUG_TCG is defined.  The table has TCG_TARGET_NB_REGS slots.
 * The "%r8".."%r15" row is emitted for both 32-bit and 64-bit builds, so
 * "%xmm0" is always the 17th entry; only 64-bit builds name %xmm8-%xmm15.
 * NOTE(review): presumably indexed by TCGReg number -- the users of this
 * table are outside this chunk.
 */
#ifdef CONFIG_DEBUG_TCG
static const char * const tcg_target_reg_names[TCG_TARGET_NB_REGS] = {
#if TCG_TARGET_REG_BITS == 64
    "%rax", "%rcx", "%rdx", "%rbx", "%rsp", "%rbp", "%rsi", "%rdi",
#else
    "%eax", "%ecx", "%edx", "%ebx", "%esp", "%ebp", "%esi", "%edi",
#endif
    "%r8",  "%r9",  "%r10", "%r11", "%r12", "%r13", "%r14", "%r15",
    "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7",
#if TCG_TARGET_REG_BITS == 64
    "%xmm8", "%xmm9", "%xmm10", "%xmm11",
    "%xmm12", "%xmm13", "%xmm14", "%xmm15",
#endif
};
#endif
43
/*
 * Register allocation order table: the general registers of the current
 * word size come first, followed by the vector registers.  The stack
 * pointer (%esp/%rsp) is not listed.
 * NOTE(review): presumably consumed by the common TCG register allocator,
 * earlier entries being preferred -- the consumer is outside this chunk.
 */
static const int tcg_target_reg_alloc_order[] = {
#if TCG_TARGET_REG_BITS == 64
    TCG_REG_RBP,
    TCG_REG_RBX,
    TCG_REG_R12,
    TCG_REG_R13,
    TCG_REG_R14,
    TCG_REG_R15,
    TCG_REG_R10,
    TCG_REG_R11,
    TCG_REG_R9,
    TCG_REG_R8,
    TCG_REG_RCX,
    TCG_REG_RDX,
    TCG_REG_RSI,
    TCG_REG_RDI,
    TCG_REG_RAX,
#else
    TCG_REG_EBX,
    TCG_REG_ESI,
    TCG_REG_EDI,
    TCG_REG_EBP,
    TCG_REG_ECX,
    TCG_REG_EDX,
    TCG_REG_EAX,
#endif
    TCG_REG_XMM0,
    TCG_REG_XMM1,
    TCG_REG_XMM2,
    TCG_REG_XMM3,
    TCG_REG_XMM4,
    TCG_REG_XMM5,
#ifndef _WIN64
    /* The Win64 ABI treats xmm6-xmm15 as callee-saved, and we do not save
       any of them.  Therefore on Win64 only xmm0-xmm5 may be allocated;
       the registers below are listed for all other hosts.  */
    TCG_REG_XMM6,
    TCG_REG_XMM7,
#if TCG_TARGET_REG_BITS == 64
    TCG_REG_XMM8,
    TCG_REG_XMM9,
    TCG_REG_XMM10,
    TCG_REG_XMM11,
    TCG_REG_XMM12,
    TCG_REG_XMM13,
    TCG_REG_XMM14,
    TCG_REG_XMM15,
#endif
#endif
};
93
/*
 * Vector register designated as TCG_TMP_VEC.
 * NOTE(review): presumably a backend scratch register for vector
 * expansions; its users are outside this chunk.  XMM5 is also listed in
 * tcg_target_reg_alloc_order, so confirm it is reserved elsewhere.
 */
#define TCG_TMP_VEC  TCG_REG_XMM5
95
/*
 * Integer argument registers for calls, in argument order.
 *   64-bit, _WIN64:  %rcx, %rdx, %r8, %r9
 *   64-bit, other:   %rdi, %rsi, %rdx, %rcx, %r8, %r9
 *   32-bit:          none (the list is empty; arguments go on the stack)
 */
static const int tcg_target_call_iarg_regs[] = {
#if TCG_TARGET_REG_BITS == 64
#if defined(_WIN64)
    TCG_REG_RCX,
    TCG_REG_RDX,
#else
    TCG_REG_RDI,
    TCG_REG_RSI,
    TCG_REG_RDX,
    TCG_REG_RCX,
#endif
    TCG_REG_R8,
    TCG_REG_R9,
#else
    /* 32 bit mode uses stack based calling convention (GCC default). */
#endif
};
113
114static TCGReg tcg_target_call_oarg_reg(TCGCallReturnKind kind, int slot)
115{
116    switch (kind) {
117    case TCG_CALL_RET_NORMAL:
118        tcg_debug_assert(slot >= 0 && slot <= 1);
119        return slot ? TCG_REG_EDX : TCG_REG_EAX;
120#ifdef _WIN64
121    case TCG_CALL_RET_BY_VEC:
122        tcg_debug_assert(slot == 0);
123        return TCG_REG_XMM0;
124#endif
125    default:
126        g_assert_not_reached();
127    }
128}
129
/*
 * Constants we accept.  These constraint bits are tested by
 * tcg_target_const_match().  For TCG_TYPE_I32 operands any of S32, U32
 * or I32 accepts every value; the descriptions below apply to the other
 * types.
 */
#define TCG_CT_CONST_S32 0x100  /* value equals its sign-extended low 32 bits */
#define TCG_CT_CONST_U32 0x200  /* value equals its zero-extended low 32 bits */
#define TCG_CT_CONST_I32 0x400  /* inverted value fits in a signed 32 bits */
#define TCG_CT_CONST_WSZ 0x800  /* value is the operand width: 32 or 64 */
135
/* Registers used with L constraint, which are the first two argument
   registers on x86_64 (taken from tcg_target_call_iarg_regs, so they
   follow the host calling convention), and two random call clobbered
   registers (%eax and %edx) on i386. */
#if TCG_TARGET_REG_BITS == 64
# define TCG_REG_L0 tcg_target_call_iarg_regs[0]
# define TCG_REG_L1 tcg_target_call_iarg_regs[1]
#else
# define TCG_REG_L0 TCG_REG_EAX
# define TCG_REG_L1 TCG_REG_EDX
#endif
146
/*
 * Register-set bit masks, one bit per register number.
 *   64-bit: general registers are bits 0-15, vector registers bits 16-31,
 *           and every general register qualifies as a byte-low register.
 *   32-bit: general registers are bits 0-7, vector registers bits 16-23,
 *           and only bits 0-3 (%eax, %ecx, %edx, %ebx) are byte-low
 *           registers.
 */
#if TCG_TARGET_REG_BITS == 64
# define ALL_GENERAL_REGS      0x0000ffffu
# define ALL_VECTOR_REGS       0xffff0000u
# define ALL_BYTEL_REGS        ALL_GENERAL_REGS
#else
# define ALL_GENERAL_REGS      0x000000ffu
# define ALL_VECTOR_REGS       0x00ff0000u
# define ALL_BYTEL_REGS        0x0000000fu
#endif
/* Mask containing TCG_REG_L0 and TCG_REG_L1 when tcg_use_softmmu is set,
   otherwise the empty set.  */
#define SOFTMMU_RESERVE_REGS \
    (tcg_use_softmmu ? (1 << TCG_REG_L0) | (1 << TCG_REG_L1) : 0)
158
/* For 64-bit, we always know that CMOV is available.  On 32-bit hosts
   it is looked up in the cpuinfo feature bits instead.  */
#if TCG_TARGET_REG_BITS == 64
# define have_cmov      true
#else
# define have_cmov      (cpuinfo & CPUINFO_CMOV)
#endif
/* BMI2 and LZCNT are always looked up in the cpuinfo feature bits.  */
#define have_bmi2       (cpuinfo & CPUINFO_BMI2)
#define have_lzcnt      (cpuinfo & CPUINFO_LZCNT)
167
/*
 * Address inside generated code.
 * NOTE(review): neither written nor read in this chunk; presumably the
 * return point that translation blocks jump to on exit -- confirm against
 * the prologue and exit_tb code.
 */
static const tcg_insn_unit *tb_ret_addr;
169
170static bool patch_reloc(tcg_insn_unit *code_ptr, int type,
171                        intptr_t value, intptr_t addend)
172{
173    value += addend;
174    switch(type) {
175    case R_386_PC32:
176        value -= (uintptr_t)tcg_splitwx_to_rx(code_ptr);
177        if (value != (int32_t)value) {
178            return false;
179        }
180        /* FALLTHRU */
181    case R_386_32:
182        tcg_patch32(code_ptr, value);
183        break;
184    case R_386_PC8:
185        value -= (uintptr_t)tcg_splitwx_to_rx(code_ptr);
186        if (value != (int8_t)value) {
187            return false;
188        }
189        tcg_patch8(code_ptr, value);
190        break;
191    default:
192        g_assert_not_reached();
193    }
194    return true;
195}
196
197/* test if a constant matches the constraint */
198static bool tcg_target_const_match(int64_t val, TCGType type, int ct, int vece)
199{
200    if (ct & TCG_CT_CONST) {
201        return 1;
202    }
203    if (type == TCG_TYPE_I32) {
204        if (ct & (TCG_CT_CONST_S32 | TCG_CT_CONST_U32 | TCG_CT_CONST_I32)) {
205            return 1;
206        }
207    } else {
208        if ((ct & TCG_CT_CONST_S32) && val == (int32_t)val) {
209            return 1;
210        }
211        if ((ct & TCG_CT_CONST_U32) && val == (uint32_t)val) {
212            return 1;
213        }
214        if ((ct & TCG_CT_CONST_I32) && ~val == (int32_t)~val) {
215            return 1;
216        }
217    }
218    if ((ct & TCG_CT_CONST_WSZ) && val == (type == TCG_TYPE_I32 ? 32 : 64)) {
219        return 1;
220    }
221    return 0;
222}
223
/* Low three bits of a register number, as stored in a ModRM or SIB
   field; bit 3 travels in a REX/VEX/EVEX prefix instead.  */
# define LOWREGMASK(x)	((x) & 7)

/*
 * Flags ORed into an opcode value above its low 8 bits.  The emitters
 * (tcg_out_opc, tcg_out_vex_opc, tcg_out_evex_opc) turn them into prefix
 * and escape bytes and output only the low byte as the opcode itself.
 * On 32-bit hosts the REX-related flags and P_GS are defined as 0, so
 * they vanish from any opcode they are combined with.
 */
#define P_EXT		0x100		/* 0x0f opcode prefix */
#define P_EXT38         0x200           /* 0x0f 0x38 opcode prefix */
#define P_DATA16        0x400           /* 0x66 opcode prefix */
#define P_VEXW          0x1000          /* Set VEX.W = 1 */
#if TCG_TARGET_REG_BITS == 64
# define P_REXW         P_VEXW          /* Set REX.W = 1; match VEXW */
# define P_REXB_R       0x2000          /* REG field as byte register */
# define P_REXB_RM      0x4000          /* R/M field as byte register */
# define P_GS           0x8000          /* gs segment override */
#else
# define P_REXW		0
# define P_REXB_R	0
# define P_REXB_RM	0
# define P_GS           0
#endif
#define P_EXT3A         0x10000         /* 0x0f 0x3a opcode prefix */
#define P_SIMDF3        0x20000         /* 0xf3 opcode prefix */
#define P_SIMDF2        0x40000         /* 0xf2 opcode prefix */
#define P_VEXL          0x80000         /* Set VEX.L = 1 */
#define P_EVEX          0x100000        /* Requires EVEX encoding */
246
/*
 * Opcode table.  Each value is the final opcode byte (low 8 bits) ORed
 * with the P_* flags that select its prefix and escape bytes.  Values
 * carrying P_EVEX are routed to the EVEX encoder by tcg_out_vex_modrm().
 * Several entries are a base opcode to which a sub-operation or a
 * condition code is added, as noted on the individual lines.
 */
#define OPC_ARITH_EvIz	(0x81)
#define OPC_ARITH_EvIb	(0x83)
#define OPC_ARITH_GvEv	(0x03)		/* ... plus (ARITH_FOO << 3) */
#define OPC_ANDN        (0xf2 | P_EXT38)
#define OPC_ADD_GvEv	(OPC_ARITH_GvEv | (ARITH_ADD << 3))
#define OPC_AND_GvEv    (OPC_ARITH_GvEv | (ARITH_AND << 3))
#define OPC_BLENDPS     (0x0c | P_EXT3A | P_DATA16)
#define OPC_BSF         (0xbc | P_EXT)
#define OPC_BSR         (0xbd | P_EXT)
#define OPC_BSWAP	(0xc8 | P_EXT)
#define OPC_CALL_Jz	(0xe8)
#define OPC_CMOVCC      (0x40 | P_EXT)  /* ... plus condition code */
#define OPC_CMP_GvEv	(OPC_ARITH_GvEv | (ARITH_CMP << 3))
#define OPC_DEC_r32	(0x48)
#define OPC_IMUL_GvEv	(0xaf | P_EXT)
#define OPC_IMUL_GvEvIb	(0x6b)
#define OPC_IMUL_GvEvIz	(0x69)
#define OPC_INC_r32	(0x40)
#define OPC_JCC_long	(0x80 | P_EXT)	/* ... plus condition code */
#define OPC_JCC_short	(0x70)		/* ... plus condition code */
#define OPC_JMP_long	(0xe9)
#define OPC_JMP_short	(0xeb)
#define OPC_LEA         (0x8d)
#define OPC_LZCNT       (0xbd | P_EXT | P_SIMDF3)
#define OPC_MOVB_EvGv	(0x88)		/* stores, more or less */
#define OPC_MOVL_EvGv	(0x89)		/* stores, more or less */
#define OPC_MOVL_GvEv	(0x8b)		/* loads, more or less */
#define OPC_MOVB_EvIz   (0xc6)
#define OPC_MOVL_EvIz	(0xc7)
#define OPC_MOVB_Ib     (0xb0)
#define OPC_MOVL_Iv     (0xb8)
#define OPC_MOVBE_GyMy  (0xf0 | P_EXT38)
#define OPC_MOVBE_MyGy  (0xf1 | P_EXT38)
#define OPC_MOVD_VyEy   (0x6e | P_EXT | P_DATA16)
#define OPC_MOVD_EyVy   (0x7e | P_EXT | P_DATA16)
#define OPC_MOVDDUP     (0x12 | P_EXT | P_SIMDF2)
#define OPC_MOVDQA_VxWx (0x6f | P_EXT | P_DATA16)
#define OPC_MOVDQA_WxVx (0x7f | P_EXT | P_DATA16)
#define OPC_MOVDQU_VxWx (0x6f | P_EXT | P_SIMDF3)
#define OPC_MOVDQU_WxVx (0x7f | P_EXT | P_SIMDF3)
#define OPC_MOVQ_VqWq   (0x7e | P_EXT | P_SIMDF3)
#define OPC_MOVQ_WqVq   (0xd6 | P_EXT | P_DATA16)
#define OPC_MOVSBL	(0xbe | P_EXT)
#define OPC_MOVSWL	(0xbf | P_EXT)
#define OPC_MOVSLQ	(0x63 | P_REXW)
#define OPC_MOVZBL	(0xb6 | P_EXT)
#define OPC_MOVZWL	(0xb7 | P_EXT)
#define OPC_PABSB       (0x1c | P_EXT38 | P_DATA16)
#define OPC_PABSW       (0x1d | P_EXT38 | P_DATA16)
#define OPC_PABSD       (0x1e | P_EXT38 | P_DATA16)
#define OPC_VPABSQ      (0x1f | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX)
#define OPC_PACKSSDW    (0x6b | P_EXT | P_DATA16)
#define OPC_PACKSSWB    (0x63 | P_EXT | P_DATA16)
#define OPC_PACKUSDW    (0x2b | P_EXT38 | P_DATA16)
#define OPC_PACKUSWB    (0x67 | P_EXT | P_DATA16)
#define OPC_PADDB       (0xfc | P_EXT | P_DATA16)
#define OPC_PADDW       (0xfd | P_EXT | P_DATA16)
#define OPC_PADDD       (0xfe | P_EXT | P_DATA16)
#define OPC_PADDQ       (0xd4 | P_EXT | P_DATA16)
#define OPC_PADDSB      (0xec | P_EXT | P_DATA16)
#define OPC_PADDSW      (0xed | P_EXT | P_DATA16)
#define OPC_PADDUB      (0xdc | P_EXT | P_DATA16)
#define OPC_PADDUW      (0xdd | P_EXT | P_DATA16)
#define OPC_PAND        (0xdb | P_EXT | P_DATA16)
#define OPC_PANDN       (0xdf | P_EXT | P_DATA16)
#define OPC_PBLENDW     (0x0e | P_EXT3A | P_DATA16)
#define OPC_PCMPEQB     (0x74 | P_EXT | P_DATA16)
#define OPC_PCMPEQW     (0x75 | P_EXT | P_DATA16)
#define OPC_PCMPEQD     (0x76 | P_EXT | P_DATA16)
#define OPC_PCMPEQQ     (0x29 | P_EXT38 | P_DATA16)
#define OPC_PCMPGTB     (0x64 | P_EXT | P_DATA16)
#define OPC_PCMPGTW     (0x65 | P_EXT | P_DATA16)
#define OPC_PCMPGTD     (0x66 | P_EXT | P_DATA16)
#define OPC_PCMPGTQ     (0x37 | P_EXT38 | P_DATA16)
#define OPC_PEXTRD      (0x16 | P_EXT3A | P_DATA16)
#define OPC_PINSRD      (0x22 | P_EXT3A | P_DATA16)
#define OPC_PMAXSB      (0x3c | P_EXT38 | P_DATA16)
#define OPC_PMAXSW      (0xee | P_EXT | P_DATA16)
#define OPC_PMAXSD      (0x3d | P_EXT38 | P_DATA16)
#define OPC_VPMAXSQ     (0x3d | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX)
#define OPC_PMAXUB      (0xde | P_EXT | P_DATA16)
#define OPC_PMAXUW      (0x3e | P_EXT38 | P_DATA16)
#define OPC_PMAXUD      (0x3f | P_EXT38 | P_DATA16)
#define OPC_VPMAXUQ     (0x3f | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX)
#define OPC_PMINSB      (0x38 | P_EXT38 | P_DATA16)
#define OPC_PMINSW      (0xea | P_EXT | P_DATA16)
#define OPC_PMINSD      (0x39 | P_EXT38 | P_DATA16)
#define OPC_VPMINSQ     (0x39 | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX)
#define OPC_PMINUB      (0xda | P_EXT | P_DATA16)
#define OPC_PMINUW      (0x3a | P_EXT38 | P_DATA16)
#define OPC_PMINUD      (0x3b | P_EXT38 | P_DATA16)
#define OPC_VPMINUQ     (0x3b | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX)
#define OPC_PMOVSXBW    (0x20 | P_EXT38 | P_DATA16)
#define OPC_PMOVSXWD    (0x23 | P_EXT38 | P_DATA16)
#define OPC_PMOVSXDQ    (0x25 | P_EXT38 | P_DATA16)
#define OPC_PMOVZXBW    (0x30 | P_EXT38 | P_DATA16)
#define OPC_PMOVZXWD    (0x33 | P_EXT38 | P_DATA16)
#define OPC_PMOVZXDQ    (0x35 | P_EXT38 | P_DATA16)
#define OPC_PMULLW      (0xd5 | P_EXT | P_DATA16)
#define OPC_PMULLD      (0x40 | P_EXT38 | P_DATA16)
#define OPC_VPMULLQ     (0x40 | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX)
#define OPC_POR         (0xeb | P_EXT | P_DATA16)
#define OPC_PSHUFB      (0x00 | P_EXT38 | P_DATA16)
#define OPC_PSHUFD      (0x70 | P_EXT | P_DATA16)
#define OPC_PSHUFLW     (0x70 | P_EXT | P_SIMDF2)
#define OPC_PSHUFHW     (0x70 | P_EXT | P_SIMDF3)
#define OPC_PSHIFTW_Ib  (0x71 | P_EXT | P_DATA16) /* /2 /6 /4 */
#define OPC_PSHIFTD_Ib  (0x72 | P_EXT | P_DATA16) /* /1 /2 /6 /4 */
#define OPC_PSHIFTQ_Ib  (0x73 | P_EXT | P_DATA16) /* /2 /6 /4 */
#define OPC_PSLLW       (0xf1 | P_EXT | P_DATA16)
#define OPC_PSLLD       (0xf2 | P_EXT | P_DATA16)
#define OPC_PSLLQ       (0xf3 | P_EXT | P_DATA16)
#define OPC_PSRAW       (0xe1 | P_EXT | P_DATA16)
#define OPC_PSRAD       (0xe2 | P_EXT | P_DATA16)
#define OPC_VPSRAQ      (0xe2 | P_EXT | P_DATA16 | P_VEXW | P_EVEX)
#define OPC_PSRLW       (0xd1 | P_EXT | P_DATA16)
#define OPC_PSRLD       (0xd2 | P_EXT | P_DATA16)
#define OPC_PSRLQ       (0xd3 | P_EXT | P_DATA16)
#define OPC_PSUBB       (0xf8 | P_EXT | P_DATA16)
#define OPC_PSUBW       (0xf9 | P_EXT | P_DATA16)
#define OPC_PSUBD       (0xfa | P_EXT | P_DATA16)
#define OPC_PSUBQ       (0xfb | P_EXT | P_DATA16)
#define OPC_PSUBSB      (0xe8 | P_EXT | P_DATA16)
#define OPC_PSUBSW      (0xe9 | P_EXT | P_DATA16)
#define OPC_PSUBUB      (0xd8 | P_EXT | P_DATA16)
#define OPC_PSUBUW      (0xd9 | P_EXT | P_DATA16)
#define OPC_PUNPCKLBW   (0x60 | P_EXT | P_DATA16)
#define OPC_PUNPCKLWD   (0x61 | P_EXT | P_DATA16)
#define OPC_PUNPCKLDQ   (0x62 | P_EXT | P_DATA16)
#define OPC_PUNPCKLQDQ  (0x6c | P_EXT | P_DATA16)
#define OPC_PUNPCKHBW   (0x68 | P_EXT | P_DATA16)
#define OPC_PUNPCKHWD   (0x69 | P_EXT | P_DATA16)
#define OPC_PUNPCKHDQ   (0x6a | P_EXT | P_DATA16)
#define OPC_PUNPCKHQDQ  (0x6d | P_EXT | P_DATA16)
#define OPC_PXOR        (0xef | P_EXT | P_DATA16)
#define OPC_POP_r32	(0x58)
#define OPC_POPCNT      (0xb8 | P_EXT | P_SIMDF3)
#define OPC_PUSH_r32	(0x50)
#define OPC_PUSH_Iv	(0x68)
#define OPC_PUSH_Ib	(0x6a)
#define OPC_RET		(0xc3)
#define OPC_SETCC	(0x90 | P_EXT | P_REXB_RM) /* ... plus cc */
#define OPC_SHIFT_1	(0xd1)
#define OPC_SHIFT_Ib	(0xc1)
#define OPC_SHIFT_cl	(0xd3)
#define OPC_SARX        (0xf7 | P_EXT38 | P_SIMDF3)
#define OPC_SHUFPS      (0xc6 | P_EXT)
#define OPC_SHLX        (0xf7 | P_EXT38 | P_DATA16)
#define OPC_SHRX        (0xf7 | P_EXT38 | P_SIMDF2)
#define OPC_SHRD_Ib     (0xac | P_EXT)
#define OPC_TESTL	(0x85)
#define OPC_TZCNT       (0xbc | P_EXT | P_SIMDF3)
#define OPC_UD2         (0x0b | P_EXT)
#define OPC_VPBLENDD    (0x02 | P_EXT3A | P_DATA16)
#define OPC_VPBLENDVB   (0x4c | P_EXT3A | P_DATA16)
#define OPC_VPINSRB     (0x20 | P_EXT3A | P_DATA16)
#define OPC_VPINSRW     (0xc4 | P_EXT | P_DATA16)
#define OPC_VBROADCASTSS (0x18 | P_EXT38 | P_DATA16)
#define OPC_VBROADCASTSD (0x19 | P_EXT38 | P_DATA16)
#define OPC_VPBROADCASTB (0x78 | P_EXT38 | P_DATA16)
#define OPC_VPBROADCASTW (0x79 | P_EXT38 | P_DATA16)
#define OPC_VPBROADCASTD (0x58 | P_EXT38 | P_DATA16)
#define OPC_VPBROADCASTQ (0x59 | P_EXT38 | P_DATA16)
#define OPC_VPERMQ      (0x00 | P_EXT3A | P_DATA16 | P_VEXW)
#define OPC_VPERM2I128  (0x46 | P_EXT3A | P_DATA16 | P_VEXL)
#define OPC_VPROLVD     (0x15 | P_EXT38 | P_DATA16 | P_EVEX)
#define OPC_VPROLVQ     (0x15 | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX)
#define OPC_VPRORVD     (0x14 | P_EXT38 | P_DATA16 | P_EVEX)
#define OPC_VPRORVQ     (0x14 | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX)
#define OPC_VPSHLDW     (0x70 | P_EXT3A | P_DATA16 | P_VEXW | P_EVEX)
#define OPC_VPSHLDD     (0x71 | P_EXT3A | P_DATA16 | P_EVEX)
#define OPC_VPSHLDQ     (0x71 | P_EXT3A | P_DATA16 | P_VEXW | P_EVEX)
#define OPC_VPSHLDVW    (0x70 | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX)
#define OPC_VPSHLDVD    (0x71 | P_EXT38 | P_DATA16 | P_EVEX)
#define OPC_VPSHLDVQ    (0x71 | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX)
#define OPC_VPSHRDVW    (0x72 | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX)
#define OPC_VPSHRDVD    (0x73 | P_EXT38 | P_DATA16 | P_EVEX)
#define OPC_VPSHRDVQ    (0x73 | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX)
#define OPC_VPSLLVW     (0x12 | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX)
#define OPC_VPSLLVD     (0x47 | P_EXT38 | P_DATA16)
#define OPC_VPSLLVQ     (0x47 | P_EXT38 | P_DATA16 | P_VEXW)
#define OPC_VPSRAVW     (0x11 | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX)
#define OPC_VPSRAVD     (0x46 | P_EXT38 | P_DATA16)
#define OPC_VPSRAVQ     (0x46 | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX)
#define OPC_VPSRLVW     (0x10 | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX)
#define OPC_VPSRLVD     (0x45 | P_EXT38 | P_DATA16)
#define OPC_VPSRLVQ     (0x45 | P_EXT38 | P_DATA16 | P_VEXW)
#define OPC_VPTERNLOGQ  (0x25 | P_EXT3A | P_DATA16 | P_VEXW | P_EVEX)
#define OPC_VZEROUPPER  (0x77 | P_EXT)
#define OPC_XCHG_ax_r32	(0x90)
#define OPC_XCHG_EvGv   (0x87)

/* Group opcodes: the operation is chosen by an EXT3_* / EXT5_* code.
   OPC_GRP14 has the same value as OPC_PSHIFTQ_Ib above.  */
#define OPC_GRP3_Eb     (0xf6)
#define OPC_GRP3_Ev     (0xf7)
#define OPC_GRP5        (0xff)
#define OPC_GRP14       (0x73 | P_EXT | P_DATA16)
443
/* Group 1 opcode extensions for 0x80-0x83.
   These are also used as modifiers for OPC_ARITH: the register forms add
   (ARITH_FOO << 3) to OPC_ARITH_GvEv, as tgen_arithr() does.  */
#define ARITH_ADD 0
#define ARITH_OR  1
#define ARITH_ADC 2
#define ARITH_SBB 3
#define ARITH_AND 4
#define ARITH_SUB 5
#define ARITH_XOR 6
#define ARITH_CMP 7

/* Group 2 opcode extensions for 0xc0, 0xc1, 0xd0-0xd3.  */
#define SHIFT_ROL 0
#define SHIFT_ROR 1
#define SHIFT_SHL 4
#define SHIFT_SHR 5
#define SHIFT_SAR 7

/* Group 3 opcode extensions for 0xf6, 0xf7.  To be used with OPC_GRP3.  */
#define EXT3_TESTi 0
#define EXT3_NOT   2
#define EXT3_NEG   3
#define EXT3_MUL   4
#define EXT3_IMUL  5
#define EXT3_DIV   6
#define EXT3_IDIV  7

/* Group 5 opcode extensions for 0xff.  To be used with OPC_GRP5.  */
#define EXT5_INC_Ev	0
#define EXT5_DEC_Ev	1
#define EXT5_CALLN_Ev	2
#define EXT5_JMPN_Ev	4
476
/* Condition codes to be added to OPC_JCC_{long,short}.
   JCC_JMP is not an encodable condition code.
   NOTE(review): presumably a marker requesting an unconditional jump;
   its users are outside this chunk.  */
#define JCC_JMP (-1)
#define JCC_JO  0x0
#define JCC_JNO 0x1
#define JCC_JB  0x2
#define JCC_JAE 0x3
#define JCC_JE  0x4
#define JCC_JNE 0x5
#define JCC_JBE 0x6
#define JCC_JA  0x7
#define JCC_JS  0x8
#define JCC_JNS 0x9
#define JCC_JP  0xa
#define JCC_JNP 0xb
#define JCC_JL  0xc
#define JCC_JGE 0xd
#define JCC_JLE 0xe
#define JCC_JG  0xf
495
/*
 * Map a TCG comparison condition to the x86 condition code testing it.
 * Signed comparisons use JL/JGE/JLE/JG, unsigned ones JB/JAE/JBE/JA.
 * Conditions not listed here are zero-initialized, which equals JCC_JO,
 * so callers must only index with the conditions named below.
 */
static const uint8_t tcg_cond_to_jcc[] = {
    [TCG_COND_EQ] = JCC_JE,
    [TCG_COND_NE] = JCC_JNE,
    [TCG_COND_LT] = JCC_JL,
    [TCG_COND_GE] = JCC_JGE,
    [TCG_COND_LE] = JCC_JLE,
    [TCG_COND_GT] = JCC_JG,
    [TCG_COND_LTU] = JCC_JB,
    [TCG_COND_GEU] = JCC_JAE,
    [TCG_COND_LEU] = JCC_JBE,
    [TCG_COND_GTU] = JCC_JA,
};
508
509#if TCG_TARGET_REG_BITS == 64
510static void tcg_out_opc(TCGContext *s, int opc, int r, int rm, int x)
511{
512    int rex;
513
514    if (opc & P_GS) {
515        tcg_out8(s, 0x65);
516    }
517    if (opc & P_DATA16) {
518        /* We should never be asking for both 16 and 64-bit operation.  */
519        tcg_debug_assert((opc & P_REXW) == 0);
520        tcg_out8(s, 0x66);
521    }
522    if (opc & P_SIMDF3) {
523        tcg_out8(s, 0xf3);
524    } else if (opc & P_SIMDF2) {
525        tcg_out8(s, 0xf2);
526    }
527
528    rex = 0;
529    rex |= (opc & P_REXW) ? 0x8 : 0x0;  /* REX.W */
530    rex |= (r & 8) >> 1;                /* REX.R */
531    rex |= (x & 8) >> 2;                /* REX.X */
532    rex |= (rm & 8) >> 3;               /* REX.B */
533
534    /* P_REXB_{R,RM} indicates that the given register is the low byte.
535       For %[abcd]l we need no REX prefix, but for %{si,di,bp,sp}l we do,
536       as otherwise the encoding indicates %[abcd]h.  Note that the values
537       that are ORed in merely indicate that the REX byte must be present;
538       those bits get discarded in output.  */
539    rex |= opc & (r >= 4 ? P_REXB_R : 0);
540    rex |= opc & (rm >= 4 ? P_REXB_RM : 0);
541
542    if (rex) {
543        tcg_out8(s, (uint8_t)(rex | 0x40));
544    }
545
546    if (opc & (P_EXT | P_EXT38 | P_EXT3A)) {
547        tcg_out8(s, 0x0f);
548        if (opc & P_EXT38) {
549            tcg_out8(s, 0x38);
550        } else if (opc & P_EXT3A) {
551            tcg_out8(s, 0x3a);
552        }
553    }
554
555    tcg_out8(s, opc);
556}
557#else
558static void tcg_out_opc(TCGContext *s, int opc)
559{
560    if (opc & P_DATA16) {
561        tcg_out8(s, 0x66);
562    }
563    if (opc & P_SIMDF3) {
564        tcg_out8(s, 0xf3);
565    } else if (opc & P_SIMDF2) {
566        tcg_out8(s, 0xf2);
567    }
568    if (opc & (P_EXT | P_EXT38 | P_EXT3A)) {
569        tcg_out8(s, 0x0f);
570        if (opc & P_EXT38) {
571            tcg_out8(s, 0x38);
572        } else if (opc & P_EXT3A) {
573            tcg_out8(s, 0x3a);
574        }
575    }
576    tcg_out8(s, opc);
577}
578/* Discard the register arguments to tcg_out_opc early, so as not to penalize
579   the 32-bit compilation paths.  This method works with all versions of gcc,
580   whereas relying on optimization may not be able to exclude them.  */
581#define tcg_out_opc(s, opc, r, rm, x)  (tcg_out_opc)(s, opc)
582#endif
583
584static void tcg_out_modrm(TCGContext *s, int opc, int r, int rm)
585{
586    tcg_out_opc(s, opc, r, rm, 0);
587    tcg_out8(s, 0xc0 | (LOWREGMASK(r) << 3) | LOWREGMASK(rm));
588}
589
590static void tcg_out_vex_opc(TCGContext *s, int opc, int r, int v,
591                            int rm, int index)
592{
593    int tmp;
594
595    if (opc & P_GS) {
596        tcg_out8(s, 0x65);
597    }
598    /* Use the two byte form if possible, which cannot encode
599       VEX.W, VEX.B, VEX.X, or an m-mmmm field other than P_EXT.  */
600    if ((opc & (P_EXT | P_EXT38 | P_EXT3A | P_VEXW)) == P_EXT
601        && ((rm | index) & 8) == 0) {
602        /* Two byte VEX prefix.  */
603        tcg_out8(s, 0xc5);
604
605        tmp = (r & 8 ? 0 : 0x80);              /* VEX.R */
606    } else {
607        /* Three byte VEX prefix.  */
608        tcg_out8(s, 0xc4);
609
610        /* VEX.m-mmmm */
611        if (opc & P_EXT3A) {
612            tmp = 3;
613        } else if (opc & P_EXT38) {
614            tmp = 2;
615        } else if (opc & P_EXT) {
616            tmp = 1;
617        } else {
618            g_assert_not_reached();
619        }
620        tmp |= (r & 8 ? 0 : 0x80);             /* VEX.R */
621        tmp |= (index & 8 ? 0 : 0x40);         /* VEX.X */
622        tmp |= (rm & 8 ? 0 : 0x20);            /* VEX.B */
623        tcg_out8(s, tmp);
624
625        tmp = (opc & P_VEXW ? 0x80 : 0);       /* VEX.W */
626    }
627
628    tmp |= (opc & P_VEXL ? 0x04 : 0);      /* VEX.L */
629    /* VEX.pp */
630    if (opc & P_DATA16) {
631        tmp |= 1;                          /* 0x66 */
632    } else if (opc & P_SIMDF3) {
633        tmp |= 2;                          /* 0xf3 */
634    } else if (opc & P_SIMDF2) {
635        tmp |= 3;                          /* 0xf2 */
636    }
637    tmp |= (~v & 15) << 3;                 /* VEX.vvvv */
638    tcg_out8(s, tmp);
639    tcg_out8(s, opc);
640}
641
642static void tcg_out_evex_opc(TCGContext *s, int opc, int r, int v,
643                             int rm, int index)
644{
645    /* The entire 4-byte evex prefix; with R' and V' set. */
646    uint32_t p = 0x08041062;
647    int mm, pp;
648
649    tcg_debug_assert(have_avx512vl);
650
651    /* EVEX.mm */
652    if (opc & P_EXT3A) {
653        mm = 3;
654    } else if (opc & P_EXT38) {
655        mm = 2;
656    } else if (opc & P_EXT) {
657        mm = 1;
658    } else {
659        g_assert_not_reached();
660    }
661
662    /* EVEX.pp */
663    if (opc & P_DATA16) {
664        pp = 1;                          /* 0x66 */
665    } else if (opc & P_SIMDF3) {
666        pp = 2;                          /* 0xf3 */
667    } else if (opc & P_SIMDF2) {
668        pp = 3;                          /* 0xf2 */
669    } else {
670        pp = 0;
671    }
672
673    p = deposit32(p, 8, 2, mm);
674    p = deposit32(p, 13, 1, (rm & 8) == 0);             /* EVEX.RXB.B */
675    p = deposit32(p, 14, 1, (index & 8) == 0);          /* EVEX.RXB.X */
676    p = deposit32(p, 15, 1, (r & 8) == 0);              /* EVEX.RXB.R */
677    p = deposit32(p, 16, 2, pp);
678    p = deposit32(p, 19, 4, ~v);
679    p = deposit32(p, 23, 1, (opc & P_VEXW) != 0);
680    p = deposit32(p, 29, 2, (opc & P_VEXL) != 0);
681
682    tcg_out32(s, p);
683    tcg_out8(s, opc);
684}
685
686static void tcg_out_vex_modrm(TCGContext *s, int opc, int r, int v, int rm)
687{
688    if (opc & P_EVEX) {
689        tcg_out_evex_opc(s, opc, r, v, rm, 0);
690    } else {
691        tcg_out_vex_opc(s, opc, r, v, rm, 0);
692    }
693    tcg_out8(s, 0xc0 | (LOWREGMASK(r) << 3) | LOWREGMASK(rm));
694}
695
696/* Output an opcode with a full "rm + (index<<shift) + offset" address mode.
697   We handle either RM and INDEX missing with a negative value.  In 64-bit
698   mode for absolute addresses, ~RM is the size of the immediate operand
699   that will follow the instruction.  */
700
701static void tcg_out_sib_offset(TCGContext *s, int r, int rm, int index,
702                               int shift, intptr_t offset)
703{
704    int mod, len;
705
706    if (index < 0 && rm < 0) {
707        if (TCG_TARGET_REG_BITS == 64) {
708            /* Try for a rip-relative addressing mode.  This has replaced
709               the 32-bit-mode absolute addressing encoding.  */
710            intptr_t pc = (intptr_t)s->code_ptr + 5 + ~rm;
711            intptr_t disp = offset - pc;
712            if (disp == (int32_t)disp) {
713                tcg_out8(s, (LOWREGMASK(r) << 3) | 5);
714                tcg_out32(s, disp);
715                return;
716            }
717
718            /* Try for an absolute address encoding.  This requires the
719               use of the MODRM+SIB encoding and is therefore larger than
720               rip-relative addressing.  */
721            if (offset == (int32_t)offset) {
722                tcg_out8(s, (LOWREGMASK(r) << 3) | 4);
723                tcg_out8(s, (4 << 3) | 5);
724                tcg_out32(s, offset);
725                return;
726            }
727
728            /* ??? The memory isn't directly addressable.  */
729            g_assert_not_reached();
730        } else {
731            /* Absolute address.  */
732            tcg_out8(s, (r << 3) | 5);
733            tcg_out32(s, offset);
734            return;
735        }
736    }
737
738    /* Find the length of the immediate addend.  Note that the encoding
739       that would be used for (%ebp) indicates absolute addressing.  */
740    if (rm < 0) {
741        mod = 0, len = 4, rm = 5;
742    } else if (offset == 0 && LOWREGMASK(rm) != TCG_REG_EBP) {
743        mod = 0, len = 0;
744    } else if (offset == (int8_t)offset) {
745        mod = 0x40, len = 1;
746    } else {
747        mod = 0x80, len = 4;
748    }
749
750    /* Use a single byte MODRM format if possible.  Note that the encoding
751       that would be used for %esp is the escape to the two byte form.  */
752    if (index < 0 && LOWREGMASK(rm) != TCG_REG_ESP) {
753        /* Single byte MODRM format.  */
754        tcg_out8(s, mod | (LOWREGMASK(r) << 3) | LOWREGMASK(rm));
755    } else {
756        /* Two byte MODRM+SIB format.  */
757
758        /* Note that the encoding that would place %esp into the index
759           field indicates no index register.  In 64-bit mode, the REX.X
760           bit counts, so %r12 can be used as the index.  */
761        if (index < 0) {
762            index = 4;
763        } else {
764            tcg_debug_assert(index != TCG_REG_ESP);
765        }
766
767        tcg_out8(s, mod | (LOWREGMASK(r) << 3) | 4);
768        tcg_out8(s, (shift << 6) | (LOWREGMASK(index) << 3) | LOWREGMASK(rm));
769    }
770
771    if (len == 1) {
772        tcg_out8(s, offset);
773    } else if (len == 4) {
774        tcg_out32(s, offset);
775    }
776}
777
778static void tcg_out_modrm_sib_offset(TCGContext *s, int opc, int r, int rm,
779                                     int index, int shift, intptr_t offset)
780{
781    tcg_out_opc(s, opc, r, rm < 0 ? 0 : rm, index < 0 ? 0 : index);
782    tcg_out_sib_offset(s, r, rm, index, shift, offset);
783}
784
785static void tcg_out_vex_modrm_sib_offset(TCGContext *s, int opc, int r, int v,
786                                         int rm, int index, int shift,
787                                         intptr_t offset)
788{
789    tcg_out_vex_opc(s, opc, r, v, rm < 0 ? 0 : rm, index < 0 ? 0 : index);
790    tcg_out_sib_offset(s, r, rm, index, shift, offset);
791}
792
793/* A simplification of the above with no index or shift.  */
794static inline void tcg_out_modrm_offset(TCGContext *s, int opc, int r,
795                                        int rm, intptr_t offset)
796{
797    tcg_out_modrm_sib_offset(s, opc, r, rm, -1, 0, offset);
798}
799
800static inline void tcg_out_vex_modrm_offset(TCGContext *s, int opc, int r,
801                                            int v, int rm, intptr_t offset)
802{
803    tcg_out_vex_modrm_sib_offset(s, opc, r, v, rm, -1, 0, offset);
804}
805
806/* Output an opcode with an expected reference to the constant pool.  */
807static inline void tcg_out_modrm_pool(TCGContext *s, int opc, int r)
808{
809    tcg_out_opc(s, opc, r, 0, 0);
810    /* Absolute for 32-bit, pc-relative for 64-bit.  */
811    tcg_out8(s, LOWREGMASK(r) << 3 | 5);
812    tcg_out32(s, 0);
813}
814
815/* Output an opcode with an expected reference to the constant pool.  */
816static inline void tcg_out_vex_modrm_pool(TCGContext *s, int opc, int r)
817{
818    tcg_out_vex_opc(s, opc, r, 0, 0, 0);
819    /* Absolute for 32-bit, pc-relative for 64-bit.  */
820    tcg_out8(s, LOWREGMASK(r) << 3 | 5);
821    tcg_out32(s, 0);
822}
823
824/* Generate dest op= src.  Uses the same ARITH_* codes as tgen_arithi.  */
825static inline void tgen_arithr(TCGContext *s, int subop, int dest, int src)
826{
827    /* Propagate an opcode prefix, such as P_REXW.  */
828    int ext = subop & ~0x7;
829    subop &= 0x7;
830
831    tcg_out_modrm(s, OPC_ARITH_GvEv + (subop << 3) + ext, dest, src);
832}
833
834static bool tcg_out_mov(TCGContext *s, TCGType type, TCGReg ret, TCGReg arg)
835{
836    int rexw = 0;
837
838    if (arg == ret) {
839        return true;
840    }
841    switch (type) {
842    case TCG_TYPE_I64:
843        rexw = P_REXW;
844        /* fallthru */
845    case TCG_TYPE_I32:
846        if (ret < 16) {
847            if (arg < 16) {
848                tcg_out_modrm(s, OPC_MOVL_GvEv + rexw, ret, arg);
849            } else {
850                tcg_out_vex_modrm(s, OPC_MOVD_EyVy + rexw, arg, 0, ret);
851            }
852        } else {
853            if (arg < 16) {
854                tcg_out_vex_modrm(s, OPC_MOVD_VyEy + rexw, ret, 0, arg);
855            } else {
856                tcg_out_vex_modrm(s, OPC_MOVQ_VqWq, ret, 0, arg);
857            }
858        }
859        break;
860
861    case TCG_TYPE_V64:
862        tcg_debug_assert(ret >= 16 && arg >= 16);
863        tcg_out_vex_modrm(s, OPC_MOVQ_VqWq, ret, 0, arg);
864        break;
865    case TCG_TYPE_V128:
866        tcg_debug_assert(ret >= 16 && arg >= 16);
867        tcg_out_vex_modrm(s, OPC_MOVDQA_VxWx, ret, 0, arg);
868        break;
869    case TCG_TYPE_V256:
870        tcg_debug_assert(ret >= 16 && arg >= 16);
871        tcg_out_vex_modrm(s, OPC_MOVDQA_VxWx | P_VEXL, ret, 0, arg);
872        break;
873
874    default:
875        g_assert_not_reached();
876    }
877    return true;
878}
879
880static const int avx2_dup_insn[4] = {
881    OPC_VPBROADCASTB, OPC_VPBROADCASTW,
882    OPC_VPBROADCASTD, OPC_VPBROADCASTQ,
883};
884
885static bool tcg_out_dup_vec(TCGContext *s, TCGType type, unsigned vece,
886                            TCGReg r, TCGReg a)
887{
888    if (have_avx2) {
889        int vex_l = (type == TCG_TYPE_V256 ? P_VEXL : 0);
890        tcg_out_vex_modrm(s, avx2_dup_insn[vece] + vex_l, r, 0, a);
891    } else {
892        switch (vece) {
893        case MO_8:
894            /* ??? With zero in a register, use PSHUFB.  */
895            tcg_out_vex_modrm(s, OPC_PUNPCKLBW, r, a, a);
896            a = r;
897            /* FALLTHRU */
898        case MO_16:
899            tcg_out_vex_modrm(s, OPC_PUNPCKLWD, r, a, a);
900            a = r;
901            /* FALLTHRU */
902        case MO_32:
903            tcg_out_vex_modrm(s, OPC_PSHUFD, r, 0, a);
904            /* imm8 operand: all output lanes selected from input lane 0.  */
905            tcg_out8(s, 0);
906            break;
907        case MO_64:
908            tcg_out_vex_modrm(s, OPC_PUNPCKLQDQ, r, a, a);
909            break;
910        default:
911            g_assert_not_reached();
912        }
913    }
914    return true;
915}
916
917static bool tcg_out_dupm_vec(TCGContext *s, TCGType type, unsigned vece,
918                             TCGReg r, TCGReg base, intptr_t offset)
919{
920    if (have_avx2) {
921        int vex_l = (type == TCG_TYPE_V256 ? P_VEXL : 0);
922        tcg_out_vex_modrm_offset(s, avx2_dup_insn[vece] + vex_l,
923                                 r, 0, base, offset);
924    } else {
925        switch (vece) {
926        case MO_64:
927            tcg_out_vex_modrm_offset(s, OPC_MOVDDUP, r, 0, base, offset);
928            break;
929        case MO_32:
930            tcg_out_vex_modrm_offset(s, OPC_VBROADCASTSS, r, 0, base, offset);
931            break;
932        case MO_16:
933            tcg_out_vex_modrm_offset(s, OPC_VPINSRW, r, r, base, offset);
934            tcg_out8(s, 0); /* imm8 */
935            tcg_out_dup_vec(s, type, vece, r, r);
936            break;
937        case MO_8:
938            tcg_out_vex_modrm_offset(s, OPC_VPINSRB, r, r, base, offset);
939            tcg_out8(s, 0); /* imm8 */
940            tcg_out_dup_vec(s, type, vece, r, r);
941            break;
942        default:
943            g_assert_not_reached();
944        }
945    }
946    return true;
947}
948
949static void tcg_out_dupi_vec(TCGContext *s, TCGType type, unsigned vece,
950                             TCGReg ret, int64_t arg)
951{
952    int vex_l = (type == TCG_TYPE_V256 ? P_VEXL : 0);
953
954    if (arg == 0) {
955        tcg_out_vex_modrm(s, OPC_PXOR, ret, ret, ret);
956        return;
957    }
958    if (arg == -1) {
959        tcg_out_vex_modrm(s, OPC_PCMPEQB + vex_l, ret, ret, ret);
960        return;
961    }
962
963    if (TCG_TARGET_REG_BITS == 32 && vece < MO_64) {
964        if (have_avx2) {
965            tcg_out_vex_modrm_pool(s, OPC_VPBROADCASTD + vex_l, ret);
966        } else {
967            tcg_out_vex_modrm_pool(s, OPC_VBROADCASTSS, ret);
968        }
969        new_pool_label(s, arg, R_386_32, s->code_ptr - 4, 0);
970    } else {
971        if (type == TCG_TYPE_V64) {
972            tcg_out_vex_modrm_pool(s, OPC_MOVQ_VqWq, ret);
973        } else if (have_avx2) {
974            tcg_out_vex_modrm_pool(s, OPC_VPBROADCASTQ + vex_l, ret);
975        } else {
976            tcg_out_vex_modrm_pool(s, OPC_MOVDDUP, ret);
977        }
978        if (TCG_TARGET_REG_BITS == 64) {
979            new_pool_label(s, arg, R_386_PC32, s->code_ptr - 4, -4);
980        } else {
981            new_pool_l2(s, R_386_32, s->code_ptr - 4, 0, arg, arg >> 32);
982        }
983    }
984}
985
986static void tcg_out_movi_vec(TCGContext *s, TCGType type,
987                             TCGReg ret, tcg_target_long arg)
988{
989    if (arg == 0) {
990        tcg_out_vex_modrm(s, OPC_PXOR, ret, ret, ret);
991        return;
992    }
993    if (arg == -1) {
994        tcg_out_vex_modrm(s, OPC_PCMPEQB, ret, ret, ret);
995        return;
996    }
997
998    int rexw = (type == TCG_TYPE_I32 ? 0 : P_REXW);
999    tcg_out_vex_modrm_pool(s, OPC_MOVD_VyEy + rexw, ret);
1000    if (TCG_TARGET_REG_BITS == 64) {
1001        new_pool_label(s, arg, R_386_PC32, s->code_ptr - 4, -4);
1002    } else {
1003        new_pool_label(s, arg, R_386_32, s->code_ptr - 4, 0);
1004    }
1005}
1006
1007static void tcg_out_movi_int(TCGContext *s, TCGType type,
1008                             TCGReg ret, tcg_target_long arg)
1009{
1010    tcg_target_long diff;
1011
1012    if (arg == 0) {
1013        tgen_arithr(s, ARITH_XOR, ret, ret);
1014        return;
1015    }
1016    if (arg == (uint32_t)arg || type == TCG_TYPE_I32) {
1017        tcg_out_opc(s, OPC_MOVL_Iv + LOWREGMASK(ret), 0, ret, 0);
1018        tcg_out32(s, arg);
1019        return;
1020    }
1021    if (arg == (int32_t)arg) {
1022        tcg_out_modrm(s, OPC_MOVL_EvIz + P_REXW, 0, ret);
1023        tcg_out32(s, arg);
1024        return;
1025    }
1026
1027    /* Try a 7 byte pc-relative lea before the 10 byte movq.  */
1028    diff = tcg_pcrel_diff(s, (const void *)arg) - 7;
1029    if (diff == (int32_t)diff) {
1030        tcg_out_opc(s, OPC_LEA | P_REXW, ret, 0, 0);
1031        tcg_out8(s, (LOWREGMASK(ret) << 3) | 5);
1032        tcg_out32(s, diff);
1033        return;
1034    }
1035
1036    tcg_out_opc(s, OPC_MOVL_Iv + P_REXW + LOWREGMASK(ret), 0, ret, 0);
1037    tcg_out64(s, arg);
1038}
1039
1040static void tcg_out_movi(TCGContext *s, TCGType type,
1041                         TCGReg ret, tcg_target_long arg)
1042{
1043    switch (type) {
1044    case TCG_TYPE_I32:
1045#if TCG_TARGET_REG_BITS == 64
1046    case TCG_TYPE_I64:
1047#endif
1048        if (ret < 16) {
1049            tcg_out_movi_int(s, type, ret, arg);
1050        } else {
1051            tcg_out_movi_vec(s, type, ret, arg);
1052        }
1053        break;
1054    default:
1055        g_assert_not_reached();
1056    }
1057}
1058
1059static bool tcg_out_xchg(TCGContext *s, TCGType type, TCGReg r1, TCGReg r2)
1060{
1061    int rexw = type == TCG_TYPE_I32 ? 0 : P_REXW;
1062    tcg_out_modrm(s, OPC_XCHG_EvGv + rexw, r1, r2);
1063    return true;
1064}
1065
1066static void tcg_out_addi_ptr(TCGContext *s, TCGReg rd, TCGReg rs,
1067                             tcg_target_long imm)
1068{
1069    /* This function is only used for passing structs by reference. */
1070    tcg_debug_assert(imm == (int32_t)imm);
1071    tcg_out_modrm_offset(s, OPC_LEA | P_REXW, rd, rs, imm);
1072}
1073
1074static inline void tcg_out_pushi(TCGContext *s, tcg_target_long val)
1075{
1076    if (val == (int8_t)val) {
1077        tcg_out_opc(s, OPC_PUSH_Ib, 0, 0, 0);
1078        tcg_out8(s, val);
1079    } else if (val == (int32_t)val) {
1080        tcg_out_opc(s, OPC_PUSH_Iv, 0, 0, 0);
1081        tcg_out32(s, val);
1082    } else {
1083        g_assert_not_reached();
1084    }
1085}
1086
1087static inline void tcg_out_mb(TCGContext *s, TCGArg a0)
1088{
1089    /* Given the strength of x86 memory ordering, we only need care for
1090       store-load ordering.  Experimentally, "lock orl $0,0(%esp)" is
1091       faster than "mfence", so don't bother with the sse insn.  */
1092    if (a0 & TCG_MO_ST_LD) {
1093        tcg_out8(s, 0xf0);
1094        tcg_out_modrm_offset(s, OPC_ARITH_EvIb, ARITH_OR, TCG_REG_ESP, 0);
1095        tcg_out8(s, 0);
1096    }
1097}
1098
1099static inline void tcg_out_push(TCGContext *s, int reg)
1100{
1101    tcg_out_opc(s, OPC_PUSH_r32 + LOWREGMASK(reg), 0, reg, 0);
1102}
1103
1104static inline void tcg_out_pop(TCGContext *s, int reg)
1105{
1106    tcg_out_opc(s, OPC_POP_r32 + LOWREGMASK(reg), 0, reg, 0);
1107}
1108
1109static void tcg_out_ld(TCGContext *s, TCGType type, TCGReg ret,
1110                       TCGReg arg1, intptr_t arg2)
1111{
1112    switch (type) {
1113    case TCG_TYPE_I32:
1114        if (ret < 16) {
1115            tcg_out_modrm_offset(s, OPC_MOVL_GvEv, ret, arg1, arg2);
1116        } else {
1117            tcg_out_vex_modrm_offset(s, OPC_MOVD_VyEy, ret, 0, arg1, arg2);
1118        }
1119        break;
1120    case TCG_TYPE_I64:
1121        if (ret < 16) {
1122            tcg_out_modrm_offset(s, OPC_MOVL_GvEv | P_REXW, ret, arg1, arg2);
1123            break;
1124        }
1125        /* FALLTHRU */
1126    case TCG_TYPE_V64:
1127        /* There is no instruction that can validate 8-byte alignment.  */
1128        tcg_debug_assert(ret >= 16);
1129        tcg_out_vex_modrm_offset(s, OPC_MOVQ_VqWq, ret, 0, arg1, arg2);
1130        break;
1131    case TCG_TYPE_V128:
1132        /*
1133         * The gvec infrastructure is asserts that v128 vector loads
1134         * and stores use a 16-byte aligned offset.  Validate that the
1135         * final pointer is aligned by using an insn that will SIGSEGV.
1136         */
1137        tcg_debug_assert(ret >= 16);
1138        tcg_out_vex_modrm_offset(s, OPC_MOVDQA_VxWx, ret, 0, arg1, arg2);
1139        break;
1140    case TCG_TYPE_V256:
1141        /*
1142         * The gvec infrastructure only requires 16-byte alignment,
1143         * so here we must use an unaligned load.
1144         */
1145        tcg_debug_assert(ret >= 16);
1146        tcg_out_vex_modrm_offset(s, OPC_MOVDQU_VxWx | P_VEXL,
1147                                 ret, 0, arg1, arg2);
1148        break;
1149    default:
1150        g_assert_not_reached();
1151    }
1152}
1153
1154static void tcg_out_st(TCGContext *s, TCGType type, TCGReg arg,
1155                       TCGReg arg1, intptr_t arg2)
1156{
1157    switch (type) {
1158    case TCG_TYPE_I32:
1159        if (arg < 16) {
1160            tcg_out_modrm_offset(s, OPC_MOVL_EvGv, arg, arg1, arg2);
1161        } else {
1162            tcg_out_vex_modrm_offset(s, OPC_MOVD_EyVy, arg, 0, arg1, arg2);
1163        }
1164        break;
1165    case TCG_TYPE_I64:
1166        if (arg < 16) {
1167            tcg_out_modrm_offset(s, OPC_MOVL_EvGv | P_REXW, arg, arg1, arg2);
1168            break;
1169        }
1170        /* FALLTHRU */
1171    case TCG_TYPE_V64:
1172        /* There is no instruction that can validate 8-byte alignment.  */
1173        tcg_debug_assert(arg >= 16);
1174        tcg_out_vex_modrm_offset(s, OPC_MOVQ_WqVq, arg, 0, arg1, arg2);
1175        break;
1176    case TCG_TYPE_V128:
1177        /*
1178         * The gvec infrastructure is asserts that v128 vector loads
1179         * and stores use a 16-byte aligned offset.  Validate that the
1180         * final pointer is aligned by using an insn that will SIGSEGV.
1181         *
1182         * This specific instance is also used by TCG_CALL_RET_BY_VEC,
1183         * for _WIN64, which must have SSE2 but may not have AVX.
1184         */
1185        tcg_debug_assert(arg >= 16);
1186        if (have_avx1) {
1187            tcg_out_vex_modrm_offset(s, OPC_MOVDQA_WxVx, arg, 0, arg1, arg2);
1188        } else {
1189            tcg_out_modrm_offset(s, OPC_MOVDQA_WxVx, arg, arg1, arg2);
1190        }
1191        break;
1192    case TCG_TYPE_V256:
1193        /*
1194         * The gvec infrastructure only requires 16-byte alignment,
1195         * so here we must use an unaligned store.
1196         */
1197        tcg_debug_assert(arg >= 16);
1198        tcg_out_vex_modrm_offset(s, OPC_MOVDQU_WxVx | P_VEXL,
1199                                 arg, 0, arg1, arg2);
1200        break;
1201    default:
1202        g_assert_not_reached();
1203    }
1204}
1205
1206static bool tcg_out_sti(TCGContext *s, TCGType type, TCGArg val,
1207                        TCGReg base, intptr_t ofs)
1208{
1209    int rexw = 0;
1210    if (TCG_TARGET_REG_BITS == 64 && type == TCG_TYPE_I64) {
1211        if (val != (int32_t)val) {
1212            return false;
1213        }
1214        rexw = P_REXW;
1215    } else if (type != TCG_TYPE_I32) {
1216        return false;
1217    }
1218    tcg_out_modrm_offset(s, OPC_MOVL_EvIz | rexw, 0, base, ofs);
1219    tcg_out32(s, val);
1220    return true;
1221}
1222
1223static void tcg_out_shifti(TCGContext *s, int subopc, int reg, int count)
1224{
1225    /* Propagate an opcode prefix, such as P_DATA16.  */
1226    int ext = subopc & ~0x7;
1227    subopc &= 0x7;
1228
1229    if (count == 1) {
1230        tcg_out_modrm(s, OPC_SHIFT_1 + ext, subopc, reg);
1231    } else {
1232        tcg_out_modrm(s, OPC_SHIFT_Ib + ext, subopc, reg);
1233        tcg_out8(s, count);
1234    }
1235}
1236
1237static inline void tcg_out_bswap32(TCGContext *s, int reg)
1238{
1239    tcg_out_opc(s, OPC_BSWAP + LOWREGMASK(reg), 0, reg, 0);
1240}
1241
1242static inline void tcg_out_rolw_8(TCGContext *s, int reg)
1243{
1244    tcg_out_shifti(s, SHIFT_ROL + P_DATA16, reg, 8);
1245}
1246
1247static void tcg_out_ext8u(TCGContext *s, TCGReg dest, TCGReg src)
1248{
1249    /* movzbl */
1250    tcg_debug_assert(src < 4 || TCG_TARGET_REG_BITS == 64);
1251    tcg_out_modrm(s, OPC_MOVZBL + P_REXB_RM, dest, src);
1252}
1253
1254static void tcg_out_ext8s(TCGContext *s, TCGType type, TCGReg dest, TCGReg src)
1255{
1256    int rexw = type == TCG_TYPE_I32 ? 0 : P_REXW;
1257    /* movsbl */
1258    tcg_debug_assert(src < 4 || TCG_TARGET_REG_BITS == 64);
1259    tcg_out_modrm(s, OPC_MOVSBL + P_REXB_RM + rexw, dest, src);
1260}
1261
1262static void tcg_out_ext16u(TCGContext *s, TCGReg dest, TCGReg src)
1263{
1264    /* movzwl */
1265    tcg_out_modrm(s, OPC_MOVZWL, dest, src);
1266}
1267
1268static void tcg_out_ext16s(TCGContext *s, TCGType type, TCGReg dest, TCGReg src)
1269{
1270    int rexw = type == TCG_TYPE_I32 ? 0 : P_REXW;
1271    /* movsw[lq] */
1272    tcg_out_modrm(s, OPC_MOVSWL + rexw, dest, src);
1273}
1274
1275static void tcg_out_ext32u(TCGContext *s, TCGReg dest, TCGReg src)
1276{
1277    /* 32-bit mov zero extends.  */
1278    tcg_out_modrm(s, OPC_MOVL_GvEv, dest, src);
1279}
1280
1281static void tcg_out_ext32s(TCGContext *s, TCGReg dest, TCGReg src)
1282{
1283    tcg_debug_assert(TCG_TARGET_REG_BITS == 64);
1284    tcg_out_modrm(s, OPC_MOVSLQ, dest, src);
1285}
1286
1287static void tcg_out_exts_i32_i64(TCGContext *s, TCGReg dest, TCGReg src)
1288{
1289    tcg_out_ext32s(s, dest, src);
1290}
1291
1292static void tcg_out_extu_i32_i64(TCGContext *s, TCGReg dest, TCGReg src)
1293{
1294    if (dest != src) {
1295        tcg_out_ext32u(s, dest, src);
1296    }
1297}
1298
1299static void tcg_out_extrl_i64_i32(TCGContext *s, TCGReg dest, TCGReg src)
1300{
1301    tcg_out_ext32u(s, dest, src);
1302}
1303
1304static inline void tcg_out_bswap64(TCGContext *s, int reg)
1305{
1306    tcg_out_opc(s, OPC_BSWAP + P_REXW + LOWREGMASK(reg), 0, reg, 0);
1307}
1308
1309static void tgen_arithi(TCGContext *s, int c, int r0,
1310                        tcg_target_long val, int cf)
1311{
1312    int rexw = 0;
1313
1314    if (TCG_TARGET_REG_BITS == 64) {
1315        rexw = c & -8;
1316        c &= 7;
1317    }
1318
1319    /* ??? While INC is 2 bytes shorter than ADDL $1, they also induce
1320       partial flags update stalls on Pentium4 and are not recommended
1321       by current Intel optimization manuals.  */
1322    if (!cf && (c == ARITH_ADD || c == ARITH_SUB) && (val == 1 || val == -1)) {
1323        int is_inc = (c == ARITH_ADD) ^ (val < 0);
1324        if (TCG_TARGET_REG_BITS == 64) {
1325            /* The single-byte increment encodings are re-tasked as the
1326               REX prefixes.  Use the MODRM encoding.  */
1327            tcg_out_modrm(s, OPC_GRP5 + rexw,
1328                          (is_inc ? EXT5_INC_Ev : EXT5_DEC_Ev), r0);
1329        } else {
1330            tcg_out8(s, (is_inc ? OPC_INC_r32 : OPC_DEC_r32) + r0);
1331        }
1332        return;
1333    }
1334
1335    if (c == ARITH_AND) {
1336        if (TCG_TARGET_REG_BITS == 64) {
1337            if (val == 0xffffffffu) {
1338                tcg_out_ext32u(s, r0, r0);
1339                return;
1340            }
1341            if (val == (uint32_t)val) {
1342                /* AND with no high bits set can use a 32-bit operation.  */
1343                rexw = 0;
1344            }
1345        }
1346        if (val == 0xffu && (r0 < 4 || TCG_TARGET_REG_BITS == 64)) {
1347            tcg_out_ext8u(s, r0, r0);
1348            return;
1349        }
1350        if (val == 0xffffu) {
1351            tcg_out_ext16u(s, r0, r0);
1352            return;
1353        }
1354    }
1355
1356    if (val == (int8_t)val) {
1357        tcg_out_modrm(s, OPC_ARITH_EvIb + rexw, c, r0);
1358        tcg_out8(s, val);
1359        return;
1360    }
1361    if (rexw == 0 || val == (int32_t)val) {
1362        tcg_out_modrm(s, OPC_ARITH_EvIz + rexw, c, r0);
1363        tcg_out32(s, val);
1364        return;
1365    }
1366
1367    g_assert_not_reached();
1368}
1369
1370static void tcg_out_addi(TCGContext *s, int reg, tcg_target_long val)
1371{
1372    if (val != 0) {
1373        tgen_arithi(s, ARITH_ADD + P_REXW, reg, val, 0);
1374    }
1375}
1376
1377/* Set SMALL to force a short forward branch.  */
1378static void tcg_out_jxx(TCGContext *s, int opc, TCGLabel *l, bool small)
1379{
1380    int32_t val, val1;
1381
1382    if (l->has_value) {
1383        val = tcg_pcrel_diff(s, l->u.value_ptr);
1384        val1 = val - 2;
1385        if ((int8_t)val1 == val1) {
1386            if (opc == -1) {
1387                tcg_out8(s, OPC_JMP_short);
1388            } else {
1389                tcg_out8(s, OPC_JCC_short + opc);
1390            }
1391            tcg_out8(s, val1);
1392        } else {
1393            tcg_debug_assert(!small);
1394            if (opc == -1) {
1395                tcg_out8(s, OPC_JMP_long);
1396                tcg_out32(s, val - 5);
1397            } else {
1398                tcg_out_opc(s, OPC_JCC_long + opc, 0, 0, 0);
1399                tcg_out32(s, val - 6);
1400            }
1401        }
1402    } else if (small) {
1403        if (opc == -1) {
1404            tcg_out8(s, OPC_JMP_short);
1405        } else {
1406            tcg_out8(s, OPC_JCC_short + opc);
1407        }
1408        tcg_out_reloc(s, s->code_ptr, R_386_PC8, l, -1);
1409        s->code_ptr += 1;
1410    } else {
1411        if (opc == -1) {
1412            tcg_out8(s, OPC_JMP_long);
1413        } else {
1414            tcg_out_opc(s, OPC_JCC_long + opc, 0, 0, 0);
1415        }
1416        tcg_out_reloc(s, s->code_ptr, R_386_PC32, l, -4);
1417        s->code_ptr += 4;
1418    }
1419}
1420
1421static void tcg_out_cmp(TCGContext *s, TCGArg arg1, TCGArg arg2,
1422                        int const_arg2, int rexw)
1423{
1424    if (const_arg2) {
1425        if (arg2 == 0) {
1426            /* test r, r */
1427            tcg_out_modrm(s, OPC_TESTL + rexw, arg1, arg1);
1428        } else {
1429            tgen_arithi(s, ARITH_CMP + rexw, arg1, arg2, 0);
1430        }
1431    } else {
1432        tgen_arithr(s, ARITH_CMP + rexw, arg1, arg2);
1433    }
1434}
1435
1436static void tcg_out_brcond(TCGContext *s, int rexw, TCGCond cond,
1437                           TCGArg arg1, TCGArg arg2, int const_arg2,
1438                           TCGLabel *label, bool small)
1439{
1440    tcg_out_cmp(s, arg1, arg2, const_arg2, rexw);
1441    tcg_out_jxx(s, tcg_cond_to_jcc[cond], label, small);
1442}
1443
#if TCG_TARGET_REG_BITS == 32
/*
 * Double-word compare and branch for 32-bit hosts.
 *
 * The first operand is the pair args[0]/args[1], the second is
 * args[2]/args[3]; args[4] is the condition and args[5] the target label.
 * The ordered comparisons treat args[1]/args[3] as the most significant
 * halves (they get the signed compare) and args[0]/args[2] as the least
 * significant halves (always compared unsigned).
 */
static void tcg_out_brcond2(TCGContext *s, const TCGArg *args,
                            const int *const_args, bool small)
{
    TCGLabel *label_next = gen_new_label();
    TCGLabel *label_this = arg_label(args[5]);
    TCGCond hi_cond, lo_cond;

    switch (args[4]) {
    case TCG_COND_EQ:
        /* Any difference in the low halves skips the branch.  */
        tcg_out_brcond(s, 0, TCG_COND_NE, args[0], args[2], const_args[2],
                       label_next, 1);
        tcg_out_brcond(s, 0, TCG_COND_EQ, args[1], args[3], const_args[3],
                       label_this, small);
        tcg_out_label(s, label_next);
        return;

    case TCG_COND_NE:
        /* A difference in either half takes the branch.  */
        tcg_out_brcond(s, 0, TCG_COND_NE, args[0], args[2], const_args[2],
                       label_this, small);
        tcg_out_brcond(s, 0, TCG_COND_NE, args[1], args[3], const_args[3],
                       label_this, small);
        tcg_out_label(s, label_next);
        return;

    case TCG_COND_LT:
        hi_cond = TCG_COND_LT;
        lo_cond = TCG_COND_LTU;
        break;
    case TCG_COND_LE:
        hi_cond = TCG_COND_LT;
        lo_cond = TCG_COND_LEU;
        break;
    case TCG_COND_GT:
        hi_cond = TCG_COND_GT;
        lo_cond = TCG_COND_GTU;
        break;
    case TCG_COND_GE:
        hi_cond = TCG_COND_GT;
        lo_cond = TCG_COND_GEU;
        break;
    case TCG_COND_LTU:
        hi_cond = TCG_COND_LTU;
        lo_cond = TCG_COND_LTU;
        break;
    case TCG_COND_LEU:
        hi_cond = TCG_COND_LTU;
        lo_cond = TCG_COND_LEU;
        break;
    case TCG_COND_GTU:
        hi_cond = TCG_COND_GTU;
        lo_cond = TCG_COND_GTU;
        break;
    case TCG_COND_GEU:
        hi_cond = TCG_COND_GTU;
        lo_cond = TCG_COND_GEU;
        break;
    default:
        g_assert_not_reached();
    }

    /*
     * Ordered comparisons: the high halves decide with a strict
     * comparison; if they differ otherwise, skip; if they are equal,
     * the low halves decide with an unsigned comparison.  The JNE reuses
     * the flags left by the high-half compare.
     */
    tcg_out_brcond(s, 0, hi_cond, args[1], args[3], const_args[3],
                   label_this, small);
    tcg_out_jxx(s, JCC_JNE, label_next, 1);
    tcg_out_brcond(s, 0, lo_cond, args[0], args[2], const_args[2],
                   label_this, small);
    tcg_out_label(s, label_next);
}
#endif
1526
1527static void tcg_out_setcond(TCGContext *s, int rexw, TCGCond cond,
1528                            TCGArg dest, TCGArg arg1, TCGArg arg2,
1529                            int const_arg2, bool neg)
1530{
1531    bool inv = false;
1532    bool cleared;
1533
1534    switch (cond) {
1535    case TCG_COND_NE:
1536        inv = true;
1537        /* fall through */
1538    case TCG_COND_EQ:
1539        /* If arg2 is 0, convert to LTU/GEU vs 1. */
1540        if (const_arg2 && arg2 == 0) {
1541            arg2 = 1;
1542            goto do_ltu;
1543        }
1544        break;
1545
1546    case TCG_COND_LEU:
1547        inv = true;
1548        /* fall through */
1549    case TCG_COND_GTU:
1550        /* If arg2 is a register, swap for LTU/GEU. */
1551        if (!const_arg2) {
1552            TCGReg t = arg1;
1553            arg1 = arg2;
1554            arg2 = t;
1555            goto do_ltu;
1556        }
1557        break;
1558
1559    case TCG_COND_GEU:
1560        inv = true;
1561        /* fall through */
1562    case TCG_COND_LTU:
1563    do_ltu:
1564        /*
1565         * Relying on the carry bit, use SBB to produce -1 if LTU, 0 if GEU.
1566         * We can then use NEG or INC to produce the desired result.
1567         * This is always smaller than the SETCC expansion.
1568         */
1569        tcg_out_cmp(s, arg1, arg2, const_arg2, rexw);
1570
1571        /* X - X - C = -C = (C ? -1 : 0) */
1572        tgen_arithr(s, ARITH_SBB + (neg ? rexw : 0), dest, dest);
1573        if (inv && neg) {
1574            /* ~(C ? -1 : 0) = (C ? 0 : -1) */
1575            tcg_out_modrm(s, OPC_GRP3_Ev + rexw, EXT3_NOT, dest);
1576        } else if (inv) {
1577            /* (C ? -1 : 0) + 1 = (C ? 0 : 1) */
1578            tgen_arithi(s, ARITH_ADD, dest, 1, 0);
1579        } else if (!neg) {
1580            /* -(C ? -1 : 0) = (C ? 1 : 0) */
1581            tcg_out_modrm(s, OPC_GRP3_Ev, EXT3_NEG, dest);
1582        }
1583        return;
1584
1585    case TCG_COND_GE:
1586        inv = true;
1587        /* fall through */
1588    case TCG_COND_LT:
1589        /* If arg2 is 0, extract the sign bit. */
1590        if (const_arg2 && arg2 == 0) {
1591            tcg_out_mov(s, rexw ? TCG_TYPE_I64 : TCG_TYPE_I32, dest, arg1);
1592            if (inv) {
1593                tcg_out_modrm(s, OPC_GRP3_Ev + rexw, EXT3_NOT, dest);
1594            }
1595            tcg_out_shifti(s, (neg ? SHIFT_SAR : SHIFT_SHR) + rexw,
1596                           dest, rexw ? 63 : 31);
1597            return;
1598        }
1599        break;
1600
1601    default:
1602        break;
1603    }
1604
1605    /*
1606     * If dest does not overlap the inputs, clearing it first is preferred.
1607     * The XOR breaks any false dependency for the low-byte write to dest,
1608     * and is also one byte smaller than MOVZBL.
1609     */
1610    cleared = false;
1611    if (dest != arg1 && (const_arg2 || dest != arg2)) {
1612        tgen_arithr(s, ARITH_XOR, dest, dest);
1613        cleared = true;
1614    }
1615
1616    tcg_out_cmp(s, arg1, arg2, const_arg2, rexw);
1617    tcg_out_modrm(s, OPC_SETCC | tcg_cond_to_jcc[cond], 0, dest);
1618
1619    if (!cleared) {
1620        tcg_out_ext8u(s, dest, dest);
1621    }
1622    if (neg) {
1623        tcg_out_modrm(s, OPC_GRP3_Ev + rexw, EXT3_NEG, dest);
1624    }
1625}
1626
#if TCG_TARGET_REG_BITS == 32
/*
 * Double-word setcond for 32-bit hosts: args[0] is the destination and
 * args[1..5] are laid out as tcg_out_brcond2 expects its args[0..4].
 * The result is produced with branches around stores of 0 and 1.
 */
static void tcg_out_setcond2(TCGContext *s, const TCGArg *args,
                             const int *const_args)
{
    TCGArg new_args[6];
    TCGLabel *label_true, *label_over;
    bool dest_overlaps;

    /* Shift the operands down by one; slot 5 receives the label below. */
    memcpy(new_args, args+1, 5*sizeof(TCGArg));

    dest_overlaps = args[0] == args[1] || args[0] == args[2]
        || (!const_args[3] && args[0] == args[3])
        || (!const_args[4] && args[0] == args[4]);

    if (!dest_overlaps) {
        /* When the destination does not overlap one of the arguments,
           clear the destination first, jump if cond false, and emit an
           increment in the true case.  This results in smaller code.  */
        tcg_out_movi(s, TCG_TYPE_I32, args[0], 0);

        label_over = gen_new_label();
        new_args[4] = tcg_invert_cond(new_args[4]);
        new_args[5] = label_arg(label_over);
        tcg_out_brcond2(s, new_args, const_args+1, 1);

        tgen_arithi(s, ARITH_ADD, args[0], 1, 0);
        tcg_out_label(s, label_over);
        return;
    }

    /* When the destination overlaps with one of the argument
       registers, don't do anything tricky.  */
    label_true = gen_new_label();
    label_over = gen_new_label();

    new_args[5] = label_arg(label_true);
    tcg_out_brcond2(s, new_args, const_args+1, 1);

    /* False path: store 0 and jump over the true path.  */
    tcg_out_movi(s, TCG_TYPE_I32, args[0], 0);
    tcg_out_jxx(s, JCC_JMP, label_over, 1);
    tcg_out_label(s, label_true);

    /* True path: store 1.  */
    tcg_out_movi(s, TCG_TYPE_I32, args[0], 1);
    tcg_out_label(s, label_over);
}
#endif
1670
1671static void tcg_out_cmov(TCGContext *s, TCGCond cond, int rexw,
1672                         TCGReg dest, TCGReg v1)
1673{
1674    if (have_cmov) {
1675        tcg_out_modrm(s, OPC_CMOVCC | tcg_cond_to_jcc[cond] | rexw, dest, v1);
1676    } else {
1677        TCGLabel *over = gen_new_label();
1678        tcg_out_jxx(s, tcg_cond_to_jcc[tcg_invert_cond(cond)], over, 1);
1679        tcg_out_mov(s, TCG_TYPE_I32, dest, v1);
1680        tcg_out_label(s, over);
1681    }
1682}
1683
1684static void tcg_out_movcond(TCGContext *s, int rexw, TCGCond cond,
1685                            TCGReg dest, TCGReg c1, TCGArg c2, int const_c2,
1686                            TCGReg v1)
1687{
1688    tcg_out_cmp(s, c1, c2, const_c2, rexw);
1689    tcg_out_cmov(s, cond, rexw, dest, v1);
1690}
1691
1692static void tcg_out_ctz(TCGContext *s, int rexw, TCGReg dest, TCGReg arg1,
1693                        TCGArg arg2, bool const_a2)
1694{
1695    if (have_bmi1) {
1696        tcg_out_modrm(s, OPC_TZCNT + rexw, dest, arg1);
1697        if (const_a2) {
1698            tcg_debug_assert(arg2 == (rexw ? 64 : 32));
1699        } else {
1700            tcg_debug_assert(dest != arg2);
1701            tcg_out_cmov(s, TCG_COND_LTU, rexw, dest, arg2);
1702        }
1703    } else {
1704        tcg_debug_assert(dest != arg2);
1705        tcg_out_modrm(s, OPC_BSF + rexw, dest, arg1);
1706        tcg_out_cmov(s, TCG_COND_EQ, rexw, dest, arg2);
1707    }
1708}
1709
1710static void tcg_out_clz(TCGContext *s, int rexw, TCGReg dest, TCGReg arg1,
1711                        TCGArg arg2, bool const_a2)
1712{
1713    if (have_lzcnt) {
1714        tcg_out_modrm(s, OPC_LZCNT + rexw, dest, arg1);
1715        if (const_a2) {
1716            tcg_debug_assert(arg2 == (rexw ? 64 : 32));
1717        } else {
1718            tcg_debug_assert(dest != arg2);
1719            tcg_out_cmov(s, TCG_COND_LTU, rexw, dest, arg2);
1720        }
1721    } else {
1722        tcg_debug_assert(!const_a2);
1723        tcg_debug_assert(dest != arg1);
1724        tcg_debug_assert(dest != arg2);
1725
1726        /* Recall that the output of BSR is the index not the count.  */
1727        tcg_out_modrm(s, OPC_BSR + rexw, dest, arg1);
1728        tgen_arithi(s, ARITH_XOR + rexw, dest, rexw ? 63 : 31, 0);
1729
1730        /* Since we have destroyed the flags from BSR, we have to re-test.  */
1731        tcg_out_cmp(s, arg1, 0, 1, rexw);
1732        tcg_out_cmov(s, TCG_COND_EQ, rexw, dest, arg2);
1733    }
1734}
1735
1736static void tcg_out_branch(TCGContext *s, int call, const tcg_insn_unit *dest)
1737{
1738    intptr_t disp = tcg_pcrel_diff(s, dest) - 5;
1739
1740    if (disp == (int32_t)disp) {
1741        tcg_out_opc(s, call ? OPC_CALL_Jz : OPC_JMP_long, 0, 0, 0);
1742        tcg_out32(s, disp);
1743    } else {
1744        /* rip-relative addressing into the constant pool.
1745           This is 6 + 8 = 14 bytes, as compared to using an
1746           immediate load 10 + 6 = 16 bytes, plus we may
1747           be able to re-use the pool constant for more calls.  */
1748        tcg_out_opc(s, OPC_GRP5, 0, 0, 0);
1749        tcg_out8(s, (call ? EXT5_CALLN_Ev : EXT5_JMPN_Ev) << 3 | 5);
1750        new_pool_label(s, (uintptr_t)dest, R_386_PC32, s->code_ptr, -4);
1751        tcg_out32(s, 0);
1752    }
1753}
1754
1755static void tcg_out_call(TCGContext *s, const tcg_insn_unit *dest,
1756                         const TCGHelperInfo *info)
1757{
1758    tcg_out_branch(s, 1, dest);
1759
1760#ifndef _WIN32
1761    if (TCG_TARGET_REG_BITS == 32 && info->out_kind == TCG_CALL_RET_BY_REF) {
1762        /*
1763         * The sysv i386 abi for struct return places a reference as the
1764         * first argument of the stack, and pops that argument with the
1765         * return statement.  Since we want to retain the aligned stack
1766         * pointer for the callee, we do not want to actually push that
1767         * argument before the call but rely on the normal store to the
1768         * stack slot.  But we do need to compensate for the pop in order
1769         * to reset our correct stack pointer value.
1770         * Pushing a garbage value back onto the stack is quickest.
1771         */
1772        tcg_out_push(s, TCG_REG_EAX);
1773    }
1774#endif
1775}
1776
1777static void tcg_out_jmp(TCGContext *s, const tcg_insn_unit *dest)
1778{
1779    tcg_out_branch(s, 0, dest);
1780}
1781
1782static void tcg_out_nopn(TCGContext *s, int n)
1783{
1784    int i;
1785    /* Emit 1 or 2 operand size prefixes for the standard one byte nop,
1786     * "xchg %eax,%eax", forming "xchg %ax,%ax". All cores accept the
1787     * duplicate prefix, and all of the interesting recent cores can
1788     * decode and discard the duplicates in a single cycle.
1789     */
1790    tcg_debug_assert(n >= 1);
1791    for (i = 1; i < n; ++i) {
1792        tcg_out8(s, 0x66);
1793    }
1794    tcg_out8(s, 0x90);
1795}
1796
1797/* Test register R vs immediate bits I, setting Z flag for EQ/NE. */
1798static void __attribute__((unused))
1799tcg_out_testi(TCGContext *s, TCGReg r, uint32_t i)
1800{
1801    /*
1802     * This is used for testing alignment, so we can usually use testb.
1803     * For i686, we have to use testl for %esi/%edi.
1804     */
1805    if (i <= 0xff && (TCG_TARGET_REG_BITS == 64 || r < 4)) {
1806        tcg_out_modrm(s, OPC_GRP3_Eb | P_REXB_RM, EXT3_TESTi, r);
1807        tcg_out8(s, i);
1808    } else {
1809        tcg_out_modrm(s, OPC_GRP3_Ev, EXT3_TESTi, r);
1810        tcg_out32(s, i);
1811    }
1812}
1813
1814typedef struct {
1815    TCGReg base;
1816    int index;
1817    int ofs;
1818    int seg;
1819    TCGAtomAlign aa;
1820} HostAddress;
1821
1822bool tcg_target_has_memory_bswap(MemOp memop)
1823{
1824    TCGAtomAlign aa;
1825
1826    if (!have_movbe) {
1827        return false;
1828    }
1829    if ((memop & MO_SIZE) < MO_128) {
1830        return true;
1831    }
1832
1833    /*
1834     * Reject 16-byte memop with 16-byte atomicity, i.e. VMOVDQA,
1835     * but do allow a pair of 64-bit operations, i.e. MOVBEQ.
1836     */
1837    aa = atom_and_align_for_opc(tcg_ctx, memop, MO_ATOM_IFALIGN, true);
1838    return aa.atom < MO_128;
1839}
1840
1841/*
1842 * Because i686 has no register parameters and because x86_64 has xchg
1843 * to handle addr/data register overlap, we have placed all input arguments
1844 * before we need might need a scratch reg.
1845 *
1846 * Even then, a scratch is only needed for l->raddr.  Rather than expose
1847 * a general-purpose scratch when we don't actually know it's available,
1848 * use the ra_gen hook to load into RAX if needed.
1849 */
1850#if TCG_TARGET_REG_BITS == 64
1851static TCGReg ldst_ra_gen(TCGContext *s, const TCGLabelQemuLdst *l, int arg)
1852{
1853    if (arg < 0) {
1854        arg = TCG_REG_RAX;
1855    }
1856    tcg_out_movi(s, TCG_TYPE_PTR, arg, (uintptr_t)l->raddr);
1857    return arg;
1858}
1859static const TCGLdstHelperParam ldst_helper_param = {
1860    .ra_gen = ldst_ra_gen
1861};
1862#else
1863static const TCGLdstHelperParam ldst_helper_param = { };
1864#endif
1865
1866static void tcg_out_vec_to_pair(TCGContext *s, TCGType type,
1867                                TCGReg l, TCGReg h, TCGReg v)
1868{
1869    int rexw = type == TCG_TYPE_I32 ? 0 : P_REXW;
1870
1871    /* vpmov{d,q} %v, %l */
1872    tcg_out_vex_modrm(s, OPC_MOVD_EyVy + rexw, v, 0, l);
1873    /* vpextr{d,q} $1, %v, %h */
1874    tcg_out_vex_modrm(s, OPC_PEXTRD + rexw, v, 0, h);
1875    tcg_out8(s, 1);
1876}
1877
1878static void tcg_out_pair_to_vec(TCGContext *s, TCGType type,
1879                                TCGReg v, TCGReg l, TCGReg h)
1880{
1881    int rexw = type == TCG_TYPE_I32 ? 0 : P_REXW;
1882
1883    /* vmov{d,q} %l, %v */
1884    tcg_out_vex_modrm(s, OPC_MOVD_VyEy + rexw, v, 0, l);
1885    /* vpinsr{d,q} $1, %h, %v, %v */
1886    tcg_out_vex_modrm(s, OPC_PINSRD + rexw, v, v, h);
1887    tcg_out8(s, 1);
1888}
1889
1890/*
1891 * Generate code for the slow path for a load at the end of block
1892 */
1893static bool tcg_out_qemu_ld_slow_path(TCGContext *s, TCGLabelQemuLdst *l)
1894{
1895    MemOp opc = get_memop(l->oi);
1896    tcg_insn_unit **label_ptr = &l->label_ptr[0];
1897
1898    /* resolve label address */
1899    tcg_patch32(label_ptr[0], s->code_ptr - label_ptr[0] - 4);
1900    if (label_ptr[1]) {
1901        tcg_patch32(label_ptr[1], s->code_ptr - label_ptr[1] - 4);
1902    }
1903
1904    tcg_out_ld_helper_args(s, l, &ldst_helper_param);
1905    tcg_out_branch(s, 1, qemu_ld_helpers[opc & MO_SIZE]);
1906    tcg_out_ld_helper_ret(s, l, false, &ldst_helper_param);
1907
1908    tcg_out_jmp(s, l->raddr);
1909    return true;
1910}
1911
1912/*
1913 * Generate code for the slow path for a store at the end of block
1914 */
1915static bool tcg_out_qemu_st_slow_path(TCGContext *s, TCGLabelQemuLdst *l)
1916{
1917    MemOp opc = get_memop(l->oi);
1918    tcg_insn_unit **label_ptr = &l->label_ptr[0];
1919
1920    /* resolve label address */
1921    tcg_patch32(label_ptr[0], s->code_ptr - label_ptr[0] - 4);
1922    if (label_ptr[1]) {
1923        tcg_patch32(label_ptr[1], s->code_ptr - label_ptr[1] - 4);
1924    }
1925
1926    tcg_out_st_helper_args(s, l, &ldst_helper_param);
1927    tcg_out_branch(s, 1, qemu_st_helpers[opc & MO_SIZE]);
1928
1929    tcg_out_jmp(s, l->raddr);
1930    return true;
1931}
1932
#ifdef CONFIG_USER_ONLY
/*
 * Template host address for user-only guest accesses; copied into the
 * per-access HostAddress by prepare_host_addr().  Starts out with no
 * index register.
 */
static HostAddress x86_guest_base = {
    .index = -1
};

#if defined(__x86_64__) && defined(__linux__)
# include <asm/prctl.h>
# include <sys/prctl.h>
int arch_prctl(int code, unsigned long addr);
/*
 * Ask the kernel to set the GS base to guest_base.
 * Returns the P_GS prefix flag on success, or 0 on failure.
 */
static inline int setup_guest_base_seg(void)
{
    return arch_prctl(ARCH_SET_GS, guest_base) == 0 ? P_GS : 0;
}
#define setup_guest_base_seg  setup_guest_base_seg
#elif defined(__x86_64__) && \
      (defined (__FreeBSD__) || defined (__FreeBSD_kernel__))
# include <machine/sysarch.h>
/*
 * Ask the kernel to set the GS base to guest_base.
 * Returns the P_GS prefix flag on success, or 0 on failure.
 */
static inline int setup_guest_base_seg(void)
{
    return sysarch(AMD64_SET_GSBASE, &guest_base) == 0 ? P_GS : 0;
}
#define setup_guest_base_seg  setup_guest_base_seg
#endif
#else
/* Not a user-only build: any surviving use is a build-time error. */
# define x86_guest_base (*(HostAddress *)({ qemu_build_not_reached(); NULL; }))
#endif /* CONFIG_USER_ONLY */
/* Fallback when no host-specific implementation was provided above. */
#ifndef setup_guest_base_seg
# define setup_guest_base_seg()  0
#endif
1968
1969#define MIN_TLB_MASK_TABLE_OFS  INT_MIN
1970
1971/*
1972 * For softmmu, perform the TLB load and compare.
1973 * For useronly, perform any required alignment tests.
1974 * In both cases, return a TCGLabelQemuLdst structure if the slow path
1975 * is required and fill in @h with the host address for the fast path.
1976 */
1977static TCGLabelQemuLdst *prepare_host_addr(TCGContext *s, HostAddress *h,
1978                                           TCGReg addrlo, TCGReg addrhi,
1979                                           MemOpIdx oi, bool is_ld)
1980{
1981    TCGLabelQemuLdst *ldst = NULL;
1982    MemOp opc = get_memop(oi);
1983    MemOp s_bits = opc & MO_SIZE;
1984    unsigned a_mask;
1985
1986    if (tcg_use_softmmu) {
1987        h->index = TCG_REG_L0;
1988        h->ofs = 0;
1989        h->seg = 0;
1990    } else {
1991        *h = x86_guest_base;
1992    }
1993    h->base = addrlo;
1994    h->aa = atom_and_align_for_opc(s, opc, MO_ATOM_IFALIGN, s_bits == MO_128);
1995    a_mask = (1 << h->aa.align) - 1;
1996
1997    if (tcg_use_softmmu) {
1998        int cmp_ofs = is_ld ? offsetof(CPUTLBEntry, addr_read)
1999                            : offsetof(CPUTLBEntry, addr_write);
2000        TCGType ttype = TCG_TYPE_I32;
2001        TCGType tlbtype = TCG_TYPE_I32;
2002        int trexw = 0, hrexw = 0, tlbrexw = 0;
2003        unsigned mem_index = get_mmuidx(oi);
2004        unsigned s_mask = (1 << s_bits) - 1;
2005        int fast_ofs = tlb_mask_table_ofs(s, mem_index);
2006        int tlb_mask;
2007
2008        ldst = new_ldst_label(s);
2009        ldst->is_ld = is_ld;
2010        ldst->oi = oi;
2011        ldst->addrlo_reg = addrlo;
2012        ldst->addrhi_reg = addrhi;
2013
2014        if (TCG_TARGET_REG_BITS == 64) {
2015            ttype = s->addr_type;
2016            trexw = (ttype == TCG_TYPE_I32 ? 0 : P_REXW);
2017            if (TCG_TYPE_PTR == TCG_TYPE_I64) {
2018                hrexw = P_REXW;
2019                if (s->page_bits + s->tlb_dyn_max_bits > 32) {
2020                    tlbtype = TCG_TYPE_I64;
2021                    tlbrexw = P_REXW;
2022                }
2023            }
2024        }
2025
2026        tcg_out_mov(s, tlbtype, TCG_REG_L0, addrlo);
2027        tcg_out_shifti(s, SHIFT_SHR + tlbrexw, TCG_REG_L0,
2028                       s->page_bits - CPU_TLB_ENTRY_BITS);
2029
2030        tcg_out_modrm_offset(s, OPC_AND_GvEv + trexw, TCG_REG_L0, TCG_AREG0,
2031                             fast_ofs + offsetof(CPUTLBDescFast, mask));
2032
2033        tcg_out_modrm_offset(s, OPC_ADD_GvEv + hrexw, TCG_REG_L0, TCG_AREG0,
2034                             fast_ofs + offsetof(CPUTLBDescFast, table));
2035
2036        /*
2037         * If the required alignment is at least as large as the access,
2038         * simply copy the address and mask.  For lesser alignments,
2039         * check that we don't cross pages for the complete access.
2040         */
2041        if (a_mask >= s_mask) {
2042            tcg_out_mov(s, ttype, TCG_REG_L1, addrlo);
2043        } else {
2044            tcg_out_modrm_offset(s, OPC_LEA + trexw, TCG_REG_L1,
2045                                 addrlo, s_mask - a_mask);
2046        }
2047        tlb_mask = s->page_mask | a_mask;
2048        tgen_arithi(s, ARITH_AND + trexw, TCG_REG_L1, tlb_mask, 0);
2049
2050        /* cmp 0(TCG_REG_L0), TCG_REG_L1 */
2051        tcg_out_modrm_offset(s, OPC_CMP_GvEv + trexw,
2052                             TCG_REG_L1, TCG_REG_L0, cmp_ofs);
2053
2054        /* jne slow_path */
2055        tcg_out_opc(s, OPC_JCC_long + JCC_JNE, 0, 0, 0);
2056        ldst->label_ptr[0] = s->code_ptr;
2057        s->code_ptr += 4;
2058
2059        if (TCG_TARGET_REG_BITS == 32 && s->addr_type == TCG_TYPE_I64) {
2060            /* cmp 4(TCG_REG_L0), addrhi */
2061            tcg_out_modrm_offset(s, OPC_CMP_GvEv, addrhi,
2062                                 TCG_REG_L0, cmp_ofs + 4);
2063
2064            /* jne slow_path */
2065            tcg_out_opc(s, OPC_JCC_long + JCC_JNE, 0, 0, 0);
2066            ldst->label_ptr[1] = s->code_ptr;
2067            s->code_ptr += 4;
2068        }
2069
2070        /* TLB Hit.  */
2071        tcg_out_ld(s, TCG_TYPE_PTR, TCG_REG_L0, TCG_REG_L0,
2072                   offsetof(CPUTLBEntry, addend));
2073    } else if (a_mask) {
2074        ldst = new_ldst_label(s);
2075
2076        ldst->is_ld = is_ld;
2077        ldst->oi = oi;
2078        ldst->addrlo_reg = addrlo;
2079        ldst->addrhi_reg = addrhi;
2080
2081        tcg_out_testi(s, addrlo, a_mask);
2082        /* jne slow_path */
2083        tcg_out_opc(s, OPC_JCC_long + JCC_JNE, 0, 0, 0);
2084        ldst->label_ptr[0] = s->code_ptr;
2085        s->code_ptr += 4;
2086    }
2087
2088    return ldst;
2089}
2090
2091static void tcg_out_qemu_ld_direct(TCGContext *s, TCGReg datalo, TCGReg datahi,
2092                                   HostAddress h, TCGType type, MemOp memop)
2093{
2094    bool use_movbe = false;
2095    int rexw = (type == TCG_TYPE_I32 ? 0 : P_REXW);
2096    int movop = OPC_MOVL_GvEv;
2097
2098    /* Do big-endian loads with movbe.  */
2099    if (memop & MO_BSWAP) {
2100        tcg_debug_assert(have_movbe);
2101        use_movbe = true;
2102        movop = OPC_MOVBE_GyMy;
2103    }
2104
2105    switch (memop & MO_SSIZE) {
2106    case MO_UB:
2107        tcg_out_modrm_sib_offset(s, OPC_MOVZBL + h.seg, datalo,
2108                                 h.base, h.index, 0, h.ofs);
2109        break;
2110    case MO_SB:
2111        tcg_out_modrm_sib_offset(s, OPC_MOVSBL + rexw + h.seg, datalo,
2112                                 h.base, h.index, 0, h.ofs);
2113        break;
2114    case MO_UW:
2115        if (use_movbe) {
2116            /* There is no extending movbe; only low 16-bits are modified.  */
2117            if (datalo != h.base && datalo != h.index) {
2118                /* XOR breaks dependency chains.  */
2119                tgen_arithr(s, ARITH_XOR, datalo, datalo);
2120                tcg_out_modrm_sib_offset(s, OPC_MOVBE_GyMy + P_DATA16 + h.seg,
2121                                         datalo, h.base, h.index, 0, h.ofs);
2122            } else {
2123                tcg_out_modrm_sib_offset(s, OPC_MOVBE_GyMy + P_DATA16 + h.seg,
2124                                         datalo, h.base, h.index, 0, h.ofs);
2125                tcg_out_ext16u(s, datalo, datalo);
2126            }
2127        } else {
2128            tcg_out_modrm_sib_offset(s, OPC_MOVZWL + h.seg, datalo,
2129                                     h.base, h.index, 0, h.ofs);
2130        }
2131        break;
2132    case MO_SW:
2133        if (use_movbe) {
2134            tcg_out_modrm_sib_offset(s, OPC_MOVBE_GyMy + P_DATA16 + h.seg,
2135                                     datalo, h.base, h.index, 0, h.ofs);
2136            tcg_out_ext16s(s, type, datalo, datalo);
2137        } else {
2138            tcg_out_modrm_sib_offset(s, OPC_MOVSWL + rexw + h.seg,
2139                                     datalo, h.base, h.index, 0, h.ofs);
2140        }
2141        break;
2142    case MO_UL:
2143        tcg_out_modrm_sib_offset(s, movop + h.seg, datalo,
2144                                 h.base, h.index, 0, h.ofs);
2145        break;
2146#if TCG_TARGET_REG_BITS == 64
2147    case MO_SL:
2148        if (use_movbe) {
2149            tcg_out_modrm_sib_offset(s, OPC_MOVBE_GyMy + h.seg, datalo,
2150                                     h.base, h.index, 0, h.ofs);
2151            tcg_out_ext32s(s, datalo, datalo);
2152        } else {
2153            tcg_out_modrm_sib_offset(s, OPC_MOVSLQ + h.seg, datalo,
2154                                     h.base, h.index, 0, h.ofs);
2155        }
2156        break;
2157#endif
2158    case MO_UQ:
2159        if (TCG_TARGET_REG_BITS == 64) {
2160            tcg_out_modrm_sib_offset(s, movop + P_REXW + h.seg, datalo,
2161                                     h.base, h.index, 0, h.ofs);
2162            break;
2163        }
2164        if (use_movbe) {
2165            TCGReg t = datalo;
2166            datalo = datahi;
2167            datahi = t;
2168        }
2169        if (h.base == datalo || h.index == datalo) {
2170            tcg_out_modrm_sib_offset(s, OPC_LEA, datahi,
2171                                     h.base, h.index, 0, h.ofs);
2172            tcg_out_modrm_offset(s, movop + h.seg, datalo, datahi, 0);
2173            tcg_out_modrm_offset(s, movop + h.seg, datahi, datahi, 4);
2174        } else {
2175            tcg_out_modrm_sib_offset(s, movop + h.seg, datalo,
2176                                     h.base, h.index, 0, h.ofs);
2177            tcg_out_modrm_sib_offset(s, movop + h.seg, datahi,
2178                                     h.base, h.index, 0, h.ofs + 4);
2179        }
2180        break;
2181
2182    case MO_128:
2183        tcg_debug_assert(TCG_TARGET_REG_BITS == 64);
2184
2185        /*
2186         * Without 16-byte atomicity, use integer regs.
2187         * That is where we want the data, and it allows bswaps.
2188         */
2189        if (h.aa.atom < MO_128) {
2190            if (use_movbe) {
2191                TCGReg t = datalo;
2192                datalo = datahi;
2193                datahi = t;
2194            }
2195            if (h.base == datalo || h.index == datalo) {
2196                tcg_out_modrm_sib_offset(s, OPC_LEA + P_REXW, datahi,
2197                                         h.base, h.index, 0, h.ofs);
2198                tcg_out_modrm_offset(s, movop + P_REXW + h.seg,
2199                                     datalo, datahi, 0);
2200                tcg_out_modrm_offset(s, movop + P_REXW + h.seg,
2201                                     datahi, datahi, 8);
2202            } else {
2203                tcg_out_modrm_sib_offset(s, movop + P_REXW + h.seg, datalo,
2204                                         h.base, h.index, 0, h.ofs);
2205                tcg_out_modrm_sib_offset(s, movop + P_REXW + h.seg, datahi,
2206                                         h.base, h.index, 0, h.ofs + 8);
2207            }
2208            break;
2209        }
2210
2211        /*
2212         * With 16-byte atomicity, a vector load is required.
2213         * If we already have 16-byte alignment, then VMOVDQA always works.
2214         * Else if VMOVDQU has atomicity with dynamic alignment, use that.
2215         * Else use we require a runtime test for alignment for VMOVDQA;
2216         * use VMOVDQU on the unaligned nonatomic path for simplicity.
2217         */
2218        if (h.aa.align >= MO_128) {
2219            tcg_out_vex_modrm_sib_offset(s, OPC_MOVDQA_VxWx + h.seg,
2220                                         TCG_TMP_VEC, 0,
2221                                         h.base, h.index, 0, h.ofs);
2222        } else if (cpuinfo & CPUINFO_ATOMIC_VMOVDQU) {
2223            tcg_out_vex_modrm_sib_offset(s, OPC_MOVDQU_VxWx + h.seg,
2224                                         TCG_TMP_VEC, 0,
2225                                         h.base, h.index, 0, h.ofs);
2226        } else {
2227            TCGLabel *l1 = gen_new_label();
2228            TCGLabel *l2 = gen_new_label();
2229
2230            tcg_out_testi(s, h.base, 15);
2231            tcg_out_jxx(s, JCC_JNE, l1, true);
2232
2233            tcg_out_vex_modrm_sib_offset(s, OPC_MOVDQA_VxWx + h.seg,
2234                                         TCG_TMP_VEC, 0,
2235                                         h.base, h.index, 0, h.ofs);
2236            tcg_out_jxx(s, JCC_JMP, l2, true);
2237
2238            tcg_out_label(s, l1);
2239            tcg_out_vex_modrm_sib_offset(s, OPC_MOVDQU_VxWx + h.seg,
2240                                         TCG_TMP_VEC, 0,
2241                                         h.base, h.index, 0, h.ofs);
2242            tcg_out_label(s, l2);
2243        }
2244        tcg_out_vec_to_pair(s, TCG_TYPE_I64, datalo, datahi, TCG_TMP_VEC);
2245        break;
2246
2247    default:
2248        g_assert_not_reached();
2249    }
2250}
2251
2252static void tcg_out_qemu_ld(TCGContext *s, TCGReg datalo, TCGReg datahi,
2253                            TCGReg addrlo, TCGReg addrhi,
2254                            MemOpIdx oi, TCGType data_type)
2255{
2256    TCGLabelQemuLdst *ldst;
2257    HostAddress h;
2258
2259    ldst = prepare_host_addr(s, &h, addrlo, addrhi, oi, true);
2260    tcg_out_qemu_ld_direct(s, datalo, datahi, h, data_type, get_memop(oi));
2261
2262    if (ldst) {
2263        ldst->type = data_type;
2264        ldst->datalo_reg = datalo;
2265        ldst->datahi_reg = datahi;
2266        ldst->raddr = tcg_splitwx_to_rx(s->code_ptr);
2267    }
2268}
2269
2270static void tcg_out_qemu_st_direct(TCGContext *s, TCGReg datalo, TCGReg datahi,
2271                                   HostAddress h, MemOp memop)
2272{
2273    bool use_movbe = false;
2274    int movop = OPC_MOVL_EvGv;
2275
2276    /*
2277     * Do big-endian stores with movbe or system-mode.
2278     * User-only without movbe will have its swapping done generically.
2279     */
2280    if (memop & MO_BSWAP) {
2281        tcg_debug_assert(have_movbe);
2282        use_movbe = true;
2283        movop = OPC_MOVBE_MyGy;
2284    }
2285
2286    switch (memop & MO_SIZE) {
2287    case MO_8:
2288        /* This is handled with constraints on INDEX_op_qemu_st8_i32. */
2289        tcg_debug_assert(TCG_TARGET_REG_BITS == 64 || datalo < 4);
2290        tcg_out_modrm_sib_offset(s, OPC_MOVB_EvGv + P_REXB_R + h.seg,
2291                                 datalo, h.base, h.index, 0, h.ofs);
2292        break;
2293    case MO_16:
2294        tcg_out_modrm_sib_offset(s, movop + P_DATA16 + h.seg, datalo,
2295                                 h.base, h.index, 0, h.ofs);
2296        break;
2297    case MO_32:
2298        tcg_out_modrm_sib_offset(s, movop + h.seg, datalo,
2299                                 h.base, h.index, 0, h.ofs);
2300        break;
2301    case MO_64:
2302        if (TCG_TARGET_REG_BITS == 64) {
2303            tcg_out_modrm_sib_offset(s, movop + P_REXW + h.seg, datalo,
2304                                     h.base, h.index, 0, h.ofs);
2305        } else {
2306            if (use_movbe) {
2307                TCGReg t = datalo;
2308                datalo = datahi;
2309                datahi = t;
2310            }
2311            tcg_out_modrm_sib_offset(s, movop + h.seg, datalo,
2312                                     h.base, h.index, 0, h.ofs);
2313            tcg_out_modrm_sib_offset(s, movop + h.seg, datahi,
2314                                     h.base, h.index, 0, h.ofs + 4);
2315        }
2316        break;
2317
2318    case MO_128:
2319        tcg_debug_assert(TCG_TARGET_REG_BITS == 64);
2320
2321        /*
2322         * Without 16-byte atomicity, use integer regs.
2323         * That is where we have the data, and it allows bswaps.
2324         */
2325        if (h.aa.atom < MO_128) {
2326            if (use_movbe) {
2327                TCGReg t = datalo;
2328                datalo = datahi;
2329                datahi = t;
2330            }
2331            tcg_out_modrm_sib_offset(s, movop + P_REXW + h.seg, datalo,
2332                                     h.base, h.index, 0, h.ofs);
2333            tcg_out_modrm_sib_offset(s, movop + P_REXW + h.seg, datahi,
2334                                     h.base, h.index, 0, h.ofs + 8);
2335            break;
2336        }
2337
2338        /*
2339         * With 16-byte atomicity, a vector store is required.
2340         * If we already have 16-byte alignment, then VMOVDQA always works.
2341         * Else if VMOVDQU has atomicity with dynamic alignment, use that.
2342         * Else use we require a runtime test for alignment for VMOVDQA;
2343         * use VMOVDQU on the unaligned nonatomic path for simplicity.
2344         */
2345        tcg_out_pair_to_vec(s, TCG_TYPE_I64, TCG_TMP_VEC, datalo, datahi);
2346        if (h.aa.align >= MO_128) {
2347            tcg_out_vex_modrm_sib_offset(s, OPC_MOVDQA_WxVx + h.seg,
2348                                         TCG_TMP_VEC, 0,
2349                                         h.base, h.index, 0, h.ofs);
2350        } else if (cpuinfo & CPUINFO_ATOMIC_VMOVDQU) {
2351            tcg_out_vex_modrm_sib_offset(s, OPC_MOVDQU_WxVx + h.seg,
2352                                         TCG_TMP_VEC, 0,
2353                                         h.base, h.index, 0, h.ofs);
2354        } else {
2355            TCGLabel *l1 = gen_new_label();
2356            TCGLabel *l2 = gen_new_label();
2357
2358            tcg_out_testi(s, h.base, 15);
2359            tcg_out_jxx(s, JCC_JNE, l1, true);
2360
2361            tcg_out_vex_modrm_sib_offset(s, OPC_MOVDQA_WxVx + h.seg,
2362                                         TCG_TMP_VEC, 0,
2363                                         h.base, h.index, 0, h.ofs);
2364            tcg_out_jxx(s, JCC_JMP, l2, true);
2365
2366            tcg_out_label(s, l1);
2367            tcg_out_vex_modrm_sib_offset(s, OPC_MOVDQU_WxVx + h.seg,
2368                                         TCG_TMP_VEC, 0,
2369                                         h.base, h.index, 0, h.ofs);
2370            tcg_out_label(s, l2);
2371        }
2372        break;
2373
2374    default:
2375        g_assert_not_reached();
2376    }
2377}
2378
2379static void tcg_out_qemu_st(TCGContext *s, TCGReg datalo, TCGReg datahi,
2380                            TCGReg addrlo, TCGReg addrhi,
2381                            MemOpIdx oi, TCGType data_type)
2382{
2383    TCGLabelQemuLdst *ldst;
2384    HostAddress h;
2385
2386    ldst = prepare_host_addr(s, &h, addrlo, addrhi, oi, false);
2387    tcg_out_qemu_st_direct(s, datalo, datahi, h, get_memop(oi));
2388
2389    if (ldst) {
2390        ldst->type = data_type;
2391        ldst->datalo_reg = datalo;
2392        ldst->datahi_reg = datahi;
2393        ldst->raddr = tcg_splitwx_to_rx(s->code_ptr);
2394    }
2395}
2396
2397static void tcg_out_exit_tb(TCGContext *s, uintptr_t a0)
2398{
2399    /* Reuse the zeroing that exists for goto_ptr.  */
2400    if (a0 == 0) {
2401        tcg_out_jmp(s, tcg_code_gen_epilogue);
2402    } else {
2403        tcg_out_movi(s, TCG_TYPE_PTR, TCG_REG_EAX, a0);
2404        tcg_out_jmp(s, tb_ret_addr);
2405    }
2406}
2407
2408static void tcg_out_goto_tb(TCGContext *s, int which)
2409{
2410    /*
2411     * Jump displacement must be aligned for atomic patching;
2412     * see if we need to add extra nops before jump
2413     */
2414    int gap = QEMU_ALIGN_PTR_UP(s->code_ptr + 1, 4) - s->code_ptr;
2415    if (gap != 1) {
2416        tcg_out_nopn(s, gap - 1);
2417    }
2418    tcg_out8(s, OPC_JMP_long); /* jmp im */
2419    set_jmp_insn_offset(s, which);
2420    tcg_out32(s, 0);
2421    set_jmp_reset_offset(s, which);
2422}
2423
2424void tb_target_set_jmp_target(const TranslationBlock *tb, int n,
2425                              uintptr_t jmp_rx, uintptr_t jmp_rw)
2426{
2427    /* patch the branch destination */
2428    uintptr_t addr = tb->jmp_target_addr[n];
2429    qatomic_set((int32_t *)jmp_rw, addr - (jmp_rx + 4));
2430    /* no need to flush icache explicitly */
2431}
2432
2433static inline void tcg_out_op(TCGContext *s, TCGOpcode opc,
2434                              const TCGArg args[TCG_MAX_OP_ARGS],
2435                              const int const_args[TCG_MAX_OP_ARGS])
2436{
2437    TCGArg a0, a1, a2;
2438    int c, const_a2, vexop, rexw = 0;
2439
2440#if TCG_TARGET_REG_BITS == 64
2441# define OP_32_64(x) \
2442        case glue(glue(INDEX_op_, x), _i64): \
2443            rexw = P_REXW; /* FALLTHRU */    \
2444        case glue(glue(INDEX_op_, x), _i32)
2445#else
2446# define OP_32_64(x) \
2447        case glue(glue(INDEX_op_, x), _i32)
2448#endif
2449
2450    /* Hoist the loads of the most common arguments.  */
2451    a0 = args[0];
2452    a1 = args[1];
2453    a2 = args[2];
2454    const_a2 = const_args[2];
2455
2456    switch (opc) {
2457    case INDEX_op_goto_ptr:
2458        /* jmp to the given host address (could be epilogue) */
2459        tcg_out_modrm(s, OPC_GRP5, EXT5_JMPN_Ev, a0);
2460        break;
2461    case INDEX_op_br:
2462        tcg_out_jxx(s, JCC_JMP, arg_label(a0), 0);
2463        break;
2464    OP_32_64(ld8u):
2465        /* Note that we can ignore REXW for the zero-extend to 64-bit.  */
2466        tcg_out_modrm_offset(s, OPC_MOVZBL, a0, a1, a2);
2467        break;
2468    OP_32_64(ld8s):
2469        tcg_out_modrm_offset(s, OPC_MOVSBL + rexw, a0, a1, a2);
2470        break;
2471    OP_32_64(ld16u):
2472        /* Note that we can ignore REXW for the zero-extend to 64-bit.  */
2473        tcg_out_modrm_offset(s, OPC_MOVZWL, a0, a1, a2);
2474        break;
2475    OP_32_64(ld16s):
2476        tcg_out_modrm_offset(s, OPC_MOVSWL + rexw, a0, a1, a2);
2477        break;
2478#if TCG_TARGET_REG_BITS == 64
2479    case INDEX_op_ld32u_i64:
2480#endif
2481    case INDEX_op_ld_i32:
2482        tcg_out_ld(s, TCG_TYPE_I32, a0, a1, a2);
2483        break;
2484
2485    OP_32_64(st8):
2486        if (const_args[0]) {
2487            tcg_out_modrm_offset(s, OPC_MOVB_EvIz, 0, a1, a2);
2488            tcg_out8(s, a0);
2489        } else {
2490            tcg_out_modrm_offset(s, OPC_MOVB_EvGv | P_REXB_R, a0, a1, a2);
2491        }
2492        break;
2493    OP_32_64(st16):
2494        if (const_args[0]) {
2495            tcg_out_modrm_offset(s, OPC_MOVL_EvIz | P_DATA16, 0, a1, a2);
2496            tcg_out16(s, a0);
2497        } else {
2498            tcg_out_modrm_offset(s, OPC_MOVL_EvGv | P_DATA16, a0, a1, a2);
2499        }
2500        break;
2501#if TCG_TARGET_REG_BITS == 64
2502    case INDEX_op_st32_i64:
2503#endif
2504    case INDEX_op_st_i32:
2505        if (const_args[0]) {
2506            tcg_out_modrm_offset(s, OPC_MOVL_EvIz, 0, a1, a2);
2507            tcg_out32(s, a0);
2508        } else {
2509            tcg_out_st(s, TCG_TYPE_I32, a0, a1, a2);
2510        }
2511        break;
2512
2513    OP_32_64(add):
2514        /* For 3-operand addition, use LEA.  */
2515        if (a0 != a1) {
2516            TCGArg c3 = 0;
2517            if (const_a2) {
2518                c3 = a2, a2 = -1;
2519            } else if (a0 == a2) {
2520                /* Watch out for dest = src + dest, since we've removed
2521                   the matching constraint on the add.  */
2522                tgen_arithr(s, ARITH_ADD + rexw, a0, a1);
2523                break;
2524            }
2525
2526            tcg_out_modrm_sib_offset(s, OPC_LEA + rexw, a0, a1, a2, 0, c3);
2527            break;
2528        }
2529        c = ARITH_ADD;
2530        goto gen_arith;
2531    OP_32_64(sub):
2532        c = ARITH_SUB;
2533        goto gen_arith;
2534    OP_32_64(and):
2535        c = ARITH_AND;
2536        goto gen_arith;
2537    OP_32_64(or):
2538        c = ARITH_OR;
2539        goto gen_arith;
2540    OP_32_64(xor):
2541        c = ARITH_XOR;
2542        goto gen_arith;
2543    gen_arith:
2544        if (const_a2) {
2545            tgen_arithi(s, c + rexw, a0, a2, 0);
2546        } else {
2547            tgen_arithr(s, c + rexw, a0, a2);
2548        }
2549        break;
2550
2551    OP_32_64(andc):
2552        if (const_a2) {
2553            tcg_out_mov(s, rexw ? TCG_TYPE_I64 : TCG_TYPE_I32, a0, a1);
2554            tgen_arithi(s, ARITH_AND + rexw, a0, ~a2, 0);
2555        } else {
2556            tcg_out_vex_modrm(s, OPC_ANDN + rexw, a0, a2, a1);
2557        }
2558        break;
2559
2560    OP_32_64(mul):
2561        if (const_a2) {
2562            int32_t val;
2563            val = a2;
2564            if (val == (int8_t)val) {
2565                tcg_out_modrm(s, OPC_IMUL_GvEvIb + rexw, a0, a0);
2566                tcg_out8(s, val);
2567            } else {
2568                tcg_out_modrm(s, OPC_IMUL_GvEvIz + rexw, a0, a0);
2569                tcg_out32(s, val);
2570            }
2571        } else {
2572            tcg_out_modrm(s, OPC_IMUL_GvEv + rexw, a0, a2);
2573        }
2574        break;
2575
2576    OP_32_64(div2):
2577        tcg_out_modrm(s, OPC_GRP3_Ev + rexw, EXT3_IDIV, args[4]);
2578        break;
2579    OP_32_64(divu2):
2580        tcg_out_modrm(s, OPC_GRP3_Ev + rexw, EXT3_DIV, args[4]);
2581        break;
2582
2583    OP_32_64(shl):
2584        /* For small constant 3-operand shift, use LEA.  */
2585        if (const_a2 && a0 != a1 && (a2 - 1) < 3) {
2586            if (a2 - 1 == 0) {
2587                /* shl $1,a1,a0 -> lea (a1,a1),a0 */
2588                tcg_out_modrm_sib_offset(s, OPC_LEA + rexw, a0, a1, a1, 0, 0);
2589            } else {
2590                /* shl $n,a1,a0 -> lea 0(,a1,n),a0 */
2591                tcg_out_modrm_sib_offset(s, OPC_LEA + rexw, a0, -1, a1, a2, 0);
2592            }
2593            break;
2594        }
2595        c = SHIFT_SHL;
2596        vexop = OPC_SHLX;
2597        goto gen_shift_maybe_vex;
2598    OP_32_64(shr):
2599        c = SHIFT_SHR;
2600        vexop = OPC_SHRX;
2601        goto gen_shift_maybe_vex;
2602    OP_32_64(sar):
2603        c = SHIFT_SAR;
2604        vexop = OPC_SARX;
2605        goto gen_shift_maybe_vex;
2606    OP_32_64(rotl):
2607        c = SHIFT_ROL;
2608        goto gen_shift;
2609    OP_32_64(rotr):
2610        c = SHIFT_ROR;
2611        goto gen_shift;
2612    gen_shift_maybe_vex:
2613        if (have_bmi2) {
2614            if (!const_a2) {
2615                tcg_out_vex_modrm(s, vexop + rexw, a0, a2, a1);
2616                break;
2617            }
2618            tcg_out_mov(s, rexw ? TCG_TYPE_I64 : TCG_TYPE_I32, a0, a1);
2619        }
2620        /* FALLTHRU */
2621    gen_shift:
2622        if (const_a2) {
2623            tcg_out_shifti(s, c + rexw, a0, a2);
2624        } else {
2625            tcg_out_modrm(s, OPC_SHIFT_cl + rexw, c, a0);
2626        }
2627        break;
2628
2629    OP_32_64(ctz):
2630        tcg_out_ctz(s, rexw, args[0], args[1], args[2], const_args[2]);
2631        break;
2632    OP_32_64(clz):
2633        tcg_out_clz(s, rexw, args[0], args[1], args[2], const_args[2]);
2634        break;
2635    OP_32_64(ctpop):
2636        tcg_out_modrm(s, OPC_POPCNT + rexw, a0, a1);
2637        break;
2638
2639    OP_32_64(brcond):
2640        tcg_out_brcond(s, rexw, a2, a0, a1, const_args[1],
2641                       arg_label(args[3]), 0);
2642        break;
2643    OP_32_64(setcond):
2644        tcg_out_setcond(s, rexw, args[3], a0, a1, a2, const_a2, false);
2645        break;
2646    OP_32_64(negsetcond):
2647        tcg_out_setcond(s, rexw, args[3], a0, a1, a2, const_a2, true);
2648        break;
2649    OP_32_64(movcond):
2650        tcg_out_movcond(s, rexw, args[5], a0, a1, a2, const_a2, args[3]);
2651        break;
2652
2653    OP_32_64(bswap16):
2654        if (a2 & TCG_BSWAP_OS) {
2655            /* Output must be sign-extended. */
2656            if (rexw) {
2657                tcg_out_bswap64(s, a0);
2658                tcg_out_shifti(s, SHIFT_SAR + rexw, a0, 48);
2659            } else {
2660                tcg_out_bswap32(s, a0);
2661                tcg_out_shifti(s, SHIFT_SAR, a0, 16);
2662            }
2663        } else if ((a2 & (TCG_BSWAP_IZ | TCG_BSWAP_OZ)) == TCG_BSWAP_OZ) {
2664            /* Output must be zero-extended, but input isn't. */
2665            tcg_out_bswap32(s, a0);
2666            tcg_out_shifti(s, SHIFT_SHR, a0, 16);
2667        } else {
2668            tcg_out_rolw_8(s, a0);
2669        }
2670        break;
2671    OP_32_64(bswap32):
2672        tcg_out_bswap32(s, a0);
2673        if (rexw && (a2 & TCG_BSWAP_OS)) {
2674            tcg_out_ext32s(s, a0, a0);
2675        }
2676        break;
2677
2678    OP_32_64(neg):
2679        tcg_out_modrm(s, OPC_GRP3_Ev + rexw, EXT3_NEG, a0);
2680        break;
2681    OP_32_64(not):
2682        tcg_out_modrm(s, OPC_GRP3_Ev + rexw, EXT3_NOT, a0);
2683        break;
2684
2685    case INDEX_op_qemu_ld_a64_i32:
2686        if (TCG_TARGET_REG_BITS == 32) {
2687            tcg_out_qemu_ld(s, a0, -1, a1, a2, args[3], TCG_TYPE_I32);
2688            break;
2689        }
2690        /* fall through */
2691    case INDEX_op_qemu_ld_a32_i32:
2692        tcg_out_qemu_ld(s, a0, -1, a1, -1, a2, TCG_TYPE_I32);
2693        break;
2694    case INDEX_op_qemu_ld_a32_i64:
2695        if (TCG_TARGET_REG_BITS == 64) {
2696            tcg_out_qemu_ld(s, a0, -1, a1, -1, a2, TCG_TYPE_I64);
2697        } else {
2698            tcg_out_qemu_ld(s, a0, a1, a2, -1, args[3], TCG_TYPE_I64);
2699        }
2700        break;
2701    case INDEX_op_qemu_ld_a64_i64:
2702        if (TCG_TARGET_REG_BITS == 64) {
2703            tcg_out_qemu_ld(s, a0, -1, a1, -1, a2, TCG_TYPE_I64);
2704        } else {
2705            tcg_out_qemu_ld(s, a0, a1, a2, args[3], args[4], TCG_TYPE_I64);
2706        }
2707        break;
2708    case INDEX_op_qemu_ld_a32_i128:
2709    case INDEX_op_qemu_ld_a64_i128:
2710        tcg_debug_assert(TCG_TARGET_REG_BITS == 64);
2711        tcg_out_qemu_ld(s, a0, a1, a2, -1, args[3], TCG_TYPE_I128);
2712        break;
2713
2714    case INDEX_op_qemu_st_a64_i32:
2715    case INDEX_op_qemu_st8_a64_i32:
2716        if (TCG_TARGET_REG_BITS == 32) {
2717            tcg_out_qemu_st(s, a0, -1, a1, a2, args[3], TCG_TYPE_I32);
2718            break;
2719        }
2720        /* fall through */
2721    case INDEX_op_qemu_st_a32_i32:
2722    case INDEX_op_qemu_st8_a32_i32:
2723        tcg_out_qemu_st(s, a0, -1, a1, -1, a2, TCG_TYPE_I32);
2724        break;
2725    case INDEX_op_qemu_st_a32_i64:
2726        if (TCG_TARGET_REG_BITS == 64) {
2727            tcg_out_qemu_st(s, a0, -1, a1, -1, a2, TCG_TYPE_I64);
2728        } else {
2729            tcg_out_qemu_st(s, a0, a1, a2, -1, args[3], TCG_TYPE_I64);
2730        }
2731        break;
2732    case INDEX_op_qemu_st_a64_i64:
2733        if (TCG_TARGET_REG_BITS == 64) {
2734            tcg_out_qemu_st(s, a0, -1, a1, -1, a2, TCG_TYPE_I64);
2735        } else {
2736            tcg_out_qemu_st(s, a0, a1, a2, args[3], args[4], TCG_TYPE_I64);
2737        }
2738        break;
2739    case INDEX_op_qemu_st_a32_i128:
2740    case INDEX_op_qemu_st_a64_i128:
2741        tcg_debug_assert(TCG_TARGET_REG_BITS == 64);
2742        tcg_out_qemu_st(s, a0, a1, a2, -1, args[3], TCG_TYPE_I128);
2743        break;
2744
2745    OP_32_64(mulu2):
2746        tcg_out_modrm(s, OPC_GRP3_Ev + rexw, EXT3_MUL, args[3]);
2747        break;
2748    OP_32_64(muls2):
2749        tcg_out_modrm(s, OPC_GRP3_Ev + rexw, EXT3_IMUL, args[3]);
2750        break;
2751    OP_32_64(add2):
2752        if (const_args[4]) {
2753            tgen_arithi(s, ARITH_ADD + rexw, a0, args[4], 1);
2754        } else {
2755            tgen_arithr(s, ARITH_ADD + rexw, a0, args[4]);
2756        }
2757        if (const_args[5]) {
2758            tgen_arithi(s, ARITH_ADC + rexw, a1, args[5], 1);
2759        } else {
2760            tgen_arithr(s, ARITH_ADC + rexw, a1, args[5]);
2761        }
2762        break;
2763    OP_32_64(sub2):
2764        if (const_args[4]) {
2765            tgen_arithi(s, ARITH_SUB + rexw, a0, args[4], 1);
2766        } else {
2767            tgen_arithr(s, ARITH_SUB + rexw, a0, args[4]);
2768        }
2769        if (const_args[5]) {
2770            tgen_arithi(s, ARITH_SBB + rexw, a1, args[5], 1);
2771        } else {
2772            tgen_arithr(s, ARITH_SBB + rexw, a1, args[5]);
2773        }
2774        break;
2775
2776#if TCG_TARGET_REG_BITS == 32
2777    case INDEX_op_brcond2_i32:
2778        tcg_out_brcond2(s, args, const_args, 0);
2779        break;
2780    case INDEX_op_setcond2_i32:
2781        tcg_out_setcond2(s, args, const_args);
2782        break;
2783#else /* TCG_TARGET_REG_BITS == 64 */
2784    case INDEX_op_ld32s_i64:
2785        tcg_out_modrm_offset(s, OPC_MOVSLQ, a0, a1, a2);
2786        break;
2787    case INDEX_op_ld_i64:
2788        tcg_out_ld(s, TCG_TYPE_I64, a0, a1, a2);
2789        break;
2790    case INDEX_op_st_i64:
2791        if (const_args[0]) {
2792            tcg_out_modrm_offset(s, OPC_MOVL_EvIz | P_REXW, 0, a1, a2);
2793            tcg_out32(s, a0);
2794        } else {
2795            tcg_out_st(s, TCG_TYPE_I64, a0, a1, a2);
2796        }
2797        break;
2798
2799    case INDEX_op_bswap64_i64:
2800        tcg_out_bswap64(s, a0);
2801        break;
2802    case INDEX_op_extrh_i64_i32:
2803        tcg_out_shifti(s, SHIFT_SHR + P_REXW, a0, 32);
2804        break;
2805#endif
2806
2807    OP_32_64(deposit):
2808        if (args[3] == 0 && args[4] == 8) {
2809            /* load bits 0..7 */
2810            if (const_a2) {
2811                tcg_out_opc(s, OPC_MOVB_Ib | P_REXB_RM | LOWREGMASK(a0),
2812                            0, a0, 0);
2813                tcg_out8(s, a2);
2814            } else {
2815                tcg_out_modrm(s, OPC_MOVB_EvGv | P_REXB_R | P_REXB_RM, a2, a0);
2816            }
2817        } else if (TCG_TARGET_REG_BITS == 32 && args[3] == 8 && args[4] == 8) {
2818            /* load bits 8..15 */
2819            if (const_a2) {
2820                tcg_out8(s, OPC_MOVB_Ib + a0 + 4);
2821                tcg_out8(s, a2);
2822            } else {
2823                tcg_out_modrm(s, OPC_MOVB_EvGv, a2, a0 + 4);
2824            }
2825        } else if (args[3] == 0 && args[4] == 16) {
2826            /* load bits 0..15 */
2827            if (const_a2) {
2828                tcg_out_opc(s, OPC_MOVL_Iv | P_DATA16 | LOWREGMASK(a0),
2829                            0, a0, 0);
2830                tcg_out16(s, a2);
2831            } else {
2832                tcg_out_modrm(s, OPC_MOVL_EvGv | P_DATA16, a2, a0);
2833            }
2834        } else {
2835            g_assert_not_reached();
2836        }
2837        break;
2838
2839    case INDEX_op_extract_i64:
2840        if (a2 + args[3] == 32) {
2841            /* This is a 32-bit zero-extending right shift.  */
2842            tcg_out_mov(s, TCG_TYPE_I32, a0, a1);
2843            tcg_out_shifti(s, SHIFT_SHR, a0, a2);
2844            break;
2845        }
2846        /* FALLTHRU */
2847    case INDEX_op_extract_i32:
2848        /* On the off-chance that we can use the high-byte registers.
2849           Otherwise we emit the same ext16 + shift pattern that we
2850           would have gotten from the normal tcg-op.c expansion.  */
2851        tcg_debug_assert(a2 == 8 && args[3] == 8);
2852        if (a1 < 4 && a0 < 8) {
2853            tcg_out_modrm(s, OPC_MOVZBL, a0, a1 + 4);
2854        } else {
2855            tcg_out_ext16u(s, a0, a1);
2856            tcg_out_shifti(s, SHIFT_SHR, a0, 8);
2857        }
2858        break;
2859
2860    case INDEX_op_sextract_i32:
2861        /* We don't implement sextract_i64, as we cannot sign-extend to
2862           64-bits without using the REX prefix that explicitly excludes
2863           access to the high-byte registers.  */
2864        tcg_debug_assert(a2 == 8 && args[3] == 8);
2865        if (a1 < 4 && a0 < 8) {
2866            tcg_out_modrm(s, OPC_MOVSBL, a0, a1 + 4);
2867        } else {
2868            tcg_out_ext16s(s, TCG_TYPE_I32, a0, a1);
2869            tcg_out_shifti(s, SHIFT_SAR, a0, 8);
2870        }
2871        break;
2872
2873    OP_32_64(extract2):
2874        /* Note that SHRD outputs to the r/m operand.  */
2875        tcg_out_modrm(s, OPC_SHRD_Ib + rexw, a2, a0);
2876        tcg_out8(s, args[3]);
2877        break;
2878
2879    case INDEX_op_mb:
2880        tcg_out_mb(s, a0);
2881        break;
2882    case INDEX_op_mov_i32:  /* Always emitted via tcg_out_mov.  */
2883    case INDEX_op_mov_i64:
2884    case INDEX_op_call:     /* Always emitted via tcg_out_call.  */
2885    case INDEX_op_exit_tb:  /* Always emitted via tcg_out_exit_tb.  */
2886    case INDEX_op_goto_tb:  /* Always emitted via tcg_out_goto_tb.  */
2887    case INDEX_op_ext8s_i32:  /* Always emitted via tcg_reg_alloc_op.  */
2888    case INDEX_op_ext8s_i64:
2889    case INDEX_op_ext8u_i32:
2890    case INDEX_op_ext8u_i64:
2891    case INDEX_op_ext16s_i32:
2892    case INDEX_op_ext16s_i64:
2893    case INDEX_op_ext16u_i32:
2894    case INDEX_op_ext16u_i64:
2895    case INDEX_op_ext32s_i64:
2896    case INDEX_op_ext32u_i64:
2897    case INDEX_op_ext_i32_i64:
2898    case INDEX_op_extu_i32_i64:
2899    case INDEX_op_extrl_i64_i32:
2900    default:
2901        g_assert_not_reached();
2902    }
2903
2904#undef OP_32_64
2905}
2906
2907static void tcg_out_vec_op(TCGContext *s, TCGOpcode opc,
2908                           unsigned vecl, unsigned vece,
2909                           const TCGArg args[TCG_MAX_OP_ARGS],
2910                           const int const_args[TCG_MAX_OP_ARGS])
2911{
2912    static int const add_insn[4] = {
2913        OPC_PADDB, OPC_PADDW, OPC_PADDD, OPC_PADDQ
2914    };
2915    static int const ssadd_insn[4] = {
2916        OPC_PADDSB, OPC_PADDSW, OPC_UD2, OPC_UD2
2917    };
2918    static int const usadd_insn[4] = {
2919        OPC_PADDUB, OPC_PADDUW, OPC_UD2, OPC_UD2
2920    };
2921    static int const sub_insn[4] = {
2922        OPC_PSUBB, OPC_PSUBW, OPC_PSUBD, OPC_PSUBQ
2923    };
2924    static int const sssub_insn[4] = {
2925        OPC_PSUBSB, OPC_PSUBSW, OPC_UD2, OPC_UD2
2926    };
2927    static int const ussub_insn[4] = {
2928        OPC_PSUBUB, OPC_PSUBUW, OPC_UD2, OPC_UD2
2929    };
2930    static int const mul_insn[4] = {
2931        OPC_UD2, OPC_PMULLW, OPC_PMULLD, OPC_VPMULLQ
2932    };
2933    static int const shift_imm_insn[4] = {
2934        OPC_UD2, OPC_PSHIFTW_Ib, OPC_PSHIFTD_Ib, OPC_PSHIFTQ_Ib
2935    };
2936    static int const cmpeq_insn[4] = {
2937        OPC_PCMPEQB, OPC_PCMPEQW, OPC_PCMPEQD, OPC_PCMPEQQ
2938    };
2939    static int const cmpgt_insn[4] = {
2940        OPC_PCMPGTB, OPC_PCMPGTW, OPC_PCMPGTD, OPC_PCMPGTQ
2941    };
2942    static int const punpckl_insn[4] = {
2943        OPC_PUNPCKLBW, OPC_PUNPCKLWD, OPC_PUNPCKLDQ, OPC_PUNPCKLQDQ
2944    };
2945    static int const punpckh_insn[4] = {
2946        OPC_PUNPCKHBW, OPC_PUNPCKHWD, OPC_PUNPCKHDQ, OPC_PUNPCKHQDQ
2947    };
2948    static int const packss_insn[4] = {
2949        OPC_PACKSSWB, OPC_PACKSSDW, OPC_UD2, OPC_UD2
2950    };
2951    static int const packus_insn[4] = {
2952        OPC_PACKUSWB, OPC_PACKUSDW, OPC_UD2, OPC_UD2
2953    };
2954    static int const smin_insn[4] = {
2955        OPC_PMINSB, OPC_PMINSW, OPC_PMINSD, OPC_VPMINSQ
2956    };
2957    static int const smax_insn[4] = {
2958        OPC_PMAXSB, OPC_PMAXSW, OPC_PMAXSD, OPC_VPMAXSQ
2959    };
2960    static int const umin_insn[4] = {
2961        OPC_PMINUB, OPC_PMINUW, OPC_PMINUD, OPC_VPMINUQ
2962    };
2963    static int const umax_insn[4] = {
2964        OPC_PMAXUB, OPC_PMAXUW, OPC_PMAXUD, OPC_VPMAXUQ
2965    };
2966    static int const rotlv_insn[4] = {
2967        OPC_UD2, OPC_UD2, OPC_VPROLVD, OPC_VPROLVQ
2968    };
2969    static int const rotrv_insn[4] = {
2970        OPC_UD2, OPC_UD2, OPC_VPRORVD, OPC_VPRORVQ
2971    };
2972    static int const shlv_insn[4] = {
2973        OPC_UD2, OPC_VPSLLVW, OPC_VPSLLVD, OPC_VPSLLVQ
2974    };
2975    static int const shrv_insn[4] = {
2976        OPC_UD2, OPC_VPSRLVW, OPC_VPSRLVD, OPC_VPSRLVQ
2977    };
2978    static int const sarv_insn[4] = {
2979        OPC_UD2, OPC_VPSRAVW, OPC_VPSRAVD, OPC_VPSRAVQ
2980    };
2981    static int const shls_insn[4] = {
2982        OPC_UD2, OPC_PSLLW, OPC_PSLLD, OPC_PSLLQ
2983    };
2984    static int const shrs_insn[4] = {
2985        OPC_UD2, OPC_PSRLW, OPC_PSRLD, OPC_PSRLQ
2986    };
2987    static int const sars_insn[4] = {
2988        OPC_UD2, OPC_PSRAW, OPC_PSRAD, OPC_VPSRAQ
2989    };
2990    static int const vpshldi_insn[4] = {
2991        OPC_UD2, OPC_VPSHLDW, OPC_VPSHLDD, OPC_VPSHLDQ
2992    };
2993    static int const vpshldv_insn[4] = {
2994        OPC_UD2, OPC_VPSHLDVW, OPC_VPSHLDVD, OPC_VPSHLDVQ
2995    };
2996    static int const vpshrdv_insn[4] = {
2997        OPC_UD2, OPC_VPSHRDVW, OPC_VPSHRDVD, OPC_VPSHRDVQ
2998    };
2999    static int const abs_insn[4] = {
3000        OPC_PABSB, OPC_PABSW, OPC_PABSD, OPC_VPABSQ
3001    };
3002
3003    TCGType type = vecl + TCG_TYPE_V64;
3004    int insn, sub;
3005    TCGArg a0, a1, a2, a3;
3006
3007    a0 = args[0];
3008    a1 = args[1];
3009    a2 = args[2];
3010
3011    switch (opc) {
3012    case INDEX_op_add_vec:
3013        insn = add_insn[vece];
3014        goto gen_simd;
3015    case INDEX_op_ssadd_vec:
3016        insn = ssadd_insn[vece];
3017        goto gen_simd;
3018    case INDEX_op_usadd_vec:
3019        insn = usadd_insn[vece];
3020        goto gen_simd;
3021    case INDEX_op_sub_vec:
3022        insn = sub_insn[vece];
3023        goto gen_simd;
3024    case INDEX_op_sssub_vec:
3025        insn = sssub_insn[vece];
3026        goto gen_simd;
3027    case INDEX_op_ussub_vec:
3028        insn = ussub_insn[vece];
3029        goto gen_simd;
3030    case INDEX_op_mul_vec:
3031        insn = mul_insn[vece];
3032        goto gen_simd;
3033    case INDEX_op_and_vec:
3034        insn = OPC_PAND;
3035        goto gen_simd;
3036    case INDEX_op_or_vec:
3037        insn = OPC_POR;
3038        goto gen_simd;
3039    case INDEX_op_xor_vec:
3040        insn = OPC_PXOR;
3041        goto gen_simd;
3042    case INDEX_op_smin_vec:
3043        insn = smin_insn[vece];
3044        goto gen_simd;
3045    case INDEX_op_umin_vec:
3046        insn = umin_insn[vece];
3047        goto gen_simd;
3048    case INDEX_op_smax_vec:
3049        insn = smax_insn[vece];
3050        goto gen_simd;
3051    case INDEX_op_umax_vec:
3052        insn = umax_insn[vece];
3053        goto gen_simd;
3054    case INDEX_op_shlv_vec:
3055        insn = shlv_insn[vece];
3056        goto gen_simd;
3057    case INDEX_op_shrv_vec:
3058        insn = shrv_insn[vece];
3059        goto gen_simd;
3060    case INDEX_op_sarv_vec:
3061        insn = sarv_insn[vece];
3062        goto gen_simd;
3063    case INDEX_op_rotlv_vec:
3064        insn = rotlv_insn[vece];
3065        goto gen_simd;
3066    case INDEX_op_rotrv_vec:
3067        insn = rotrv_insn[vece];
3068        goto gen_simd;
3069    case INDEX_op_shls_vec:
3070        insn = shls_insn[vece];
3071        goto gen_simd;
3072    case INDEX_op_shrs_vec:
3073        insn = shrs_insn[vece];
3074        goto gen_simd;
3075    case INDEX_op_sars_vec:
3076        insn = sars_insn[vece];
3077        goto gen_simd;
3078    case INDEX_op_x86_punpckl_vec:
3079        insn = punpckl_insn[vece];
3080        goto gen_simd;
3081    case INDEX_op_x86_punpckh_vec:
3082        insn = punpckh_insn[vece];
3083        goto gen_simd;
3084    case INDEX_op_x86_packss_vec:
3085        insn = packss_insn[vece];
3086        goto gen_simd;
3087    case INDEX_op_x86_packus_vec:
3088        insn = packus_insn[vece];
3089        goto gen_simd;
3090    case INDEX_op_x86_vpshldv_vec:
3091        insn = vpshldv_insn[vece];
3092        a1 = a2;
3093        a2 = args[3];
3094        goto gen_simd;
3095    case INDEX_op_x86_vpshrdv_vec:
3096        insn = vpshrdv_insn[vece];
3097        a1 = a2;
3098        a2 = args[3];
3099        goto gen_simd;
3100#if TCG_TARGET_REG_BITS == 32
3101    case INDEX_op_dup2_vec:
3102        /* First merge the two 32-bit inputs to a single 64-bit element. */
3103        tcg_out_vex_modrm(s, OPC_PUNPCKLDQ, a0, a1, a2);
3104        /* Then replicate the 64-bit elements across the rest of the vector. */
3105        if (type != TCG_TYPE_V64) {
3106            tcg_out_dup_vec(s, type, MO_64, a0, a0);
3107        }
3108        break;
3109#endif
3110    case INDEX_op_abs_vec:
3111        insn = abs_insn[vece];
3112        a2 = a1;
3113        a1 = 0;
3114        goto gen_simd;
3115    gen_simd:
3116        tcg_debug_assert(insn != OPC_UD2);
3117        if (type == TCG_TYPE_V256) {
3118            insn |= P_VEXL;
3119        }
3120        tcg_out_vex_modrm(s, insn, a0, a1, a2);
3121        break;
3122
3123    case INDEX_op_cmp_vec:
3124        sub = args[3];
3125        if (sub == TCG_COND_EQ) {
3126            insn = cmpeq_insn[vece];
3127        } else if (sub == TCG_COND_GT) {
3128            insn = cmpgt_insn[vece];
3129        } else {
3130            g_assert_not_reached();
3131        }
3132        goto gen_simd;
3133
3134    case INDEX_op_andc_vec:
3135        insn = OPC_PANDN;
3136        if (type == TCG_TYPE_V256) {
3137            insn |= P_VEXL;
3138        }
3139        tcg_out_vex_modrm(s, insn, a0, a2, a1);
3140        break;
3141
3142    case INDEX_op_shli_vec:
3143        insn = shift_imm_insn[vece];
3144        sub = 6;
3145        goto gen_shift;
3146    case INDEX_op_shri_vec:
3147        insn = shift_imm_insn[vece];
3148        sub = 2;
3149        goto gen_shift;
3150    case INDEX_op_sari_vec:
3151        if (vece == MO_64) {
3152            insn = OPC_PSHIFTD_Ib | P_VEXW | P_EVEX;
3153        } else {
3154            insn = shift_imm_insn[vece];
3155        }
3156        sub = 4;
3157        goto gen_shift;
3158    case INDEX_op_rotli_vec:
3159        insn = OPC_PSHIFTD_Ib | P_EVEX;  /* VPROL[DQ] */
3160        if (vece == MO_64) {
3161            insn |= P_VEXW;
3162        }
3163        sub = 1;
3164        goto gen_shift;
3165    gen_shift:
3166        tcg_debug_assert(vece != MO_8);
3167        if (type == TCG_TYPE_V256) {
3168            insn |= P_VEXL;
3169        }
3170        tcg_out_vex_modrm(s, insn, sub, a0, a1);
3171        tcg_out8(s, a2);
3172        break;
3173
3174    case INDEX_op_ld_vec:
3175        tcg_out_ld(s, type, a0, a1, a2);
3176        break;
3177    case INDEX_op_st_vec:
3178        tcg_out_st(s, type, a0, a1, a2);
3179        break;
3180    case INDEX_op_dupm_vec:
3181        tcg_out_dupm_vec(s, type, vece, a0, a1, a2);
3182        break;
3183
3184    case INDEX_op_x86_shufps_vec:
3185        insn = OPC_SHUFPS;
3186        sub = args[3];
3187        goto gen_simd_imm8;
3188    case INDEX_op_x86_blend_vec:
3189        if (vece == MO_16) {
3190            insn = OPC_PBLENDW;
3191        } else if (vece == MO_32) {
3192            insn = (have_avx2 ? OPC_VPBLENDD : OPC_BLENDPS);
3193        } else {
3194            g_assert_not_reached();
3195        }
3196        sub = args[3];
3197        goto gen_simd_imm8;
3198    case INDEX_op_x86_vperm2i128_vec:
3199        insn = OPC_VPERM2I128;
3200        sub = args[3];
3201        goto gen_simd_imm8;
3202    case INDEX_op_x86_vpshldi_vec:
3203        insn = vpshldi_insn[vece];
3204        sub = args[3];
3205        goto gen_simd_imm8;
3206
3207    case INDEX_op_not_vec:
3208        insn = OPC_VPTERNLOGQ;
3209        a2 = a1;
3210        sub = 0x33; /* !B */
3211        goto gen_simd_imm8;
3212    case INDEX_op_nor_vec:
3213        insn = OPC_VPTERNLOGQ;
3214        sub = 0x11; /* norCB */
3215        goto gen_simd_imm8;
3216    case INDEX_op_nand_vec:
3217        insn = OPC_VPTERNLOGQ;
3218        sub = 0x77; /* nandCB */
3219        goto gen_simd_imm8;
3220    case INDEX_op_eqv_vec:
3221        insn = OPC_VPTERNLOGQ;
3222        sub = 0x99; /* xnorCB */
3223        goto gen_simd_imm8;
3224    case INDEX_op_orc_vec:
3225        insn = OPC_VPTERNLOGQ;
3226        sub = 0xdd; /* orB!C */
3227        goto gen_simd_imm8;
3228
3229    case INDEX_op_bitsel_vec:
3230        insn = OPC_VPTERNLOGQ;
3231        a3 = args[3];
3232        if (a0 == a1) {
3233            a1 = a2;
3234            a2 = a3;
3235            sub = 0xca; /* A?B:C */
3236        } else if (a0 == a2) {
3237            a2 = a3;
3238            sub = 0xe2; /* B?A:C */
3239        } else {
3240            tcg_out_mov(s, type, a0, a3);
3241            sub = 0xb8; /* B?C:A */
3242        }
3243        goto gen_simd_imm8;
3244
3245    gen_simd_imm8:
3246        tcg_debug_assert(insn != OPC_UD2);
3247        if (type == TCG_TYPE_V256) {
3248            insn |= P_VEXL;
3249        }
3250        tcg_out_vex_modrm(s, insn, a0, a1, a2);
3251        tcg_out8(s, sub);
3252        break;
3253
3254    case INDEX_op_x86_vpblendvb_vec:
3255        insn = OPC_VPBLENDVB;
3256        if (type == TCG_TYPE_V256) {
3257            insn |= P_VEXL;
3258        }
3259        tcg_out_vex_modrm(s, insn, a0, a1, a2);
3260        tcg_out8(s, args[3] << 4);
3261        break;
3262
3263    case INDEX_op_x86_psrldq_vec:
3264        tcg_out_vex_modrm(s, OPC_GRP14, 3, a0, a1);
3265        tcg_out8(s, a2);
3266        break;
3267
3268    case INDEX_op_mov_vec:  /* Always emitted via tcg_out_mov.  */
3269    case INDEX_op_dup_vec:  /* Always emitted via tcg_out_dup_vec.  */
3270    default:
3271        g_assert_not_reached();
3272    }
3273}
3274
3275static TCGConstraintSetIndex tcg_target_op_def(TCGOpcode op)
3276{
3277    switch (op) {
3278    case INDEX_op_goto_ptr:
3279        return C_O0_I1(r);
3280
3281    case INDEX_op_ld8u_i32:
3282    case INDEX_op_ld8u_i64:
3283    case INDEX_op_ld8s_i32:
3284    case INDEX_op_ld8s_i64:
3285    case INDEX_op_ld16u_i32:
3286    case INDEX_op_ld16u_i64:
3287    case INDEX_op_ld16s_i32:
3288    case INDEX_op_ld16s_i64:
3289    case INDEX_op_ld_i32:
3290    case INDEX_op_ld32u_i64:
3291    case INDEX_op_ld32s_i64:
3292    case INDEX_op_ld_i64:
3293        return C_O1_I1(r, r);
3294
3295    case INDEX_op_st8_i32:
3296    case INDEX_op_st8_i64:
3297        return C_O0_I2(qi, r);
3298
3299    case INDEX_op_st16_i32:
3300    case INDEX_op_st16_i64:
3301    case INDEX_op_st_i32:
3302    case INDEX_op_st32_i64:
3303        return C_O0_I2(ri, r);
3304
3305    case INDEX_op_st_i64:
3306        return C_O0_I2(re, r);
3307
3308    case INDEX_op_add_i32:
3309    case INDEX_op_add_i64:
3310        return C_O1_I2(r, r, re);
3311
3312    case INDEX_op_sub_i32:
3313    case INDEX_op_sub_i64:
3314    case INDEX_op_mul_i32:
3315    case INDEX_op_mul_i64:
3316    case INDEX_op_or_i32:
3317    case INDEX_op_or_i64:
3318    case INDEX_op_xor_i32:
3319    case INDEX_op_xor_i64:
3320        return C_O1_I2(r, 0, re);
3321
3322    case INDEX_op_and_i32:
3323    case INDEX_op_and_i64:
3324        return C_O1_I2(r, 0, reZ);
3325
3326    case INDEX_op_andc_i32:
3327    case INDEX_op_andc_i64:
3328        return C_O1_I2(r, r, rI);
3329
3330    case INDEX_op_shl_i32:
3331    case INDEX_op_shl_i64:
3332    case INDEX_op_shr_i32:
3333    case INDEX_op_shr_i64:
3334    case INDEX_op_sar_i32:
3335    case INDEX_op_sar_i64:
3336        return have_bmi2 ? C_O1_I2(r, r, ri) : C_O1_I2(r, 0, ci);
3337
3338    case INDEX_op_rotl_i32:
3339    case INDEX_op_rotl_i64:
3340    case INDEX_op_rotr_i32:
3341    case INDEX_op_rotr_i64:
3342        return C_O1_I2(r, 0, ci);
3343
3344    case INDEX_op_brcond_i32:
3345    case INDEX_op_brcond_i64:
3346        return C_O0_I2(r, re);
3347
3348    case INDEX_op_bswap16_i32:
3349    case INDEX_op_bswap16_i64:
3350    case INDEX_op_bswap32_i32:
3351    case INDEX_op_bswap32_i64:
3352    case INDEX_op_bswap64_i64:
3353    case INDEX_op_neg_i32:
3354    case INDEX_op_neg_i64:
3355    case INDEX_op_not_i32:
3356    case INDEX_op_not_i64:
3357    case INDEX_op_extrh_i64_i32:
3358        return C_O1_I1(r, 0);
3359
3360    case INDEX_op_ext8s_i32:
3361    case INDEX_op_ext8s_i64:
3362    case INDEX_op_ext8u_i32:
3363    case INDEX_op_ext8u_i64:
3364        return C_O1_I1(r, q);
3365
3366    case INDEX_op_ext16s_i32:
3367    case INDEX_op_ext16s_i64:
3368    case INDEX_op_ext16u_i32:
3369    case INDEX_op_ext16u_i64:
3370    case INDEX_op_ext32s_i64:
3371    case INDEX_op_ext32u_i64:
3372    case INDEX_op_ext_i32_i64:
3373    case INDEX_op_extu_i32_i64:
3374    case INDEX_op_extrl_i64_i32:
3375    case INDEX_op_extract_i32:
3376    case INDEX_op_extract_i64:
3377    case INDEX_op_sextract_i32:
3378    case INDEX_op_ctpop_i32:
3379    case INDEX_op_ctpop_i64:
3380        return C_O1_I1(r, r);
3381
3382    case INDEX_op_extract2_i32:
3383    case INDEX_op_extract2_i64:
3384        return C_O1_I2(r, 0, r);
3385
3386    case INDEX_op_deposit_i32:
3387    case INDEX_op_deposit_i64:
3388        return C_O1_I2(q, 0, qi);
3389
3390    case INDEX_op_setcond_i32:
3391    case INDEX_op_setcond_i64:
3392    case INDEX_op_negsetcond_i32:
3393    case INDEX_op_negsetcond_i64:
3394        return C_O1_I2(q, r, re);
3395
3396    case INDEX_op_movcond_i32:
3397    case INDEX_op_movcond_i64:
3398        return C_O1_I4(r, r, re, r, 0);
3399
3400    case INDEX_op_div2_i32:
3401    case INDEX_op_div2_i64:
3402    case INDEX_op_divu2_i32:
3403    case INDEX_op_divu2_i64:
3404        return C_O2_I3(a, d, 0, 1, r);
3405
3406    case INDEX_op_mulu2_i32:
3407    case INDEX_op_mulu2_i64:
3408    case INDEX_op_muls2_i32:
3409    case INDEX_op_muls2_i64:
3410        return C_O2_I2(a, d, a, r);
3411
3412    case INDEX_op_add2_i32:
3413    case INDEX_op_add2_i64:
3414    case INDEX_op_sub2_i32:
3415    case INDEX_op_sub2_i64:
3416        return C_N1_O1_I4(r, r, 0, 1, re, re);
3417
3418    case INDEX_op_ctz_i32:
3419    case INDEX_op_ctz_i64:
3420        return have_bmi1 ? C_N1_I2(r, r, rW) : C_N1_I2(r, r, r);
3421
3422    case INDEX_op_clz_i32:
3423    case INDEX_op_clz_i64:
3424        return have_lzcnt ? C_N1_I2(r, r, rW) : C_N1_I2(r, r, r);
3425
3426    case INDEX_op_qemu_ld_a32_i32:
3427        return C_O1_I1(r, L);
3428    case INDEX_op_qemu_ld_a64_i32:
3429        return TCG_TARGET_REG_BITS == 64 ? C_O1_I1(r, L) : C_O1_I2(r, L, L);
3430
3431    case INDEX_op_qemu_st_a32_i32:
3432        return C_O0_I2(L, L);
3433    case INDEX_op_qemu_st_a64_i32:
3434        return TCG_TARGET_REG_BITS == 64 ? C_O0_I2(L, L) : C_O0_I3(L, L, L);
3435    case INDEX_op_qemu_st8_a32_i32:
3436        return C_O0_I2(s, L);
3437    case INDEX_op_qemu_st8_a64_i32:
3438        return TCG_TARGET_REG_BITS == 64 ? C_O0_I2(s, L) : C_O0_I3(s, L, L);
3439
3440    case INDEX_op_qemu_ld_a32_i64:
3441        return TCG_TARGET_REG_BITS == 64 ? C_O1_I1(r, L) : C_O2_I1(r, r, L);
3442    case INDEX_op_qemu_ld_a64_i64:
3443        return TCG_TARGET_REG_BITS == 64 ? C_O1_I1(r, L) : C_O2_I2(r, r, L, L);
3444
3445    case INDEX_op_qemu_st_a32_i64:
3446        return TCG_TARGET_REG_BITS == 64 ? C_O0_I2(L, L) : C_O0_I3(L, L, L);
3447    case INDEX_op_qemu_st_a64_i64:
3448        return TCG_TARGET_REG_BITS == 64 ? C_O0_I2(L, L) : C_O0_I4(L, L, L, L);
3449
3450    case INDEX_op_qemu_ld_a32_i128:
3451    case INDEX_op_qemu_ld_a64_i128:
3452        tcg_debug_assert(TCG_TARGET_REG_BITS == 64);
3453        return C_O2_I1(r, r, L);
3454    case INDEX_op_qemu_st_a32_i128:
3455    case INDEX_op_qemu_st_a64_i128:
3456        tcg_debug_assert(TCG_TARGET_REG_BITS == 64);
3457        return C_O0_I3(L, L, L);
3458
3459    case INDEX_op_brcond2_i32:
3460        return C_O0_I4(r, r, ri, ri);
3461
3462    case INDEX_op_setcond2_i32:
3463        return C_O1_I4(r, r, r, ri, ri);
3464
3465    case INDEX_op_ld_vec:
3466    case INDEX_op_dupm_vec:
3467        return C_O1_I1(x, r);
3468
3469    case INDEX_op_st_vec:
3470        return C_O0_I2(x, r);
3471
3472    case INDEX_op_add_vec:
3473    case INDEX_op_sub_vec:
3474    case INDEX_op_mul_vec:
3475    case INDEX_op_and_vec:
3476    case INDEX_op_or_vec:
3477    case INDEX_op_xor_vec:
3478    case INDEX_op_andc_vec:
3479    case INDEX_op_orc_vec:
3480    case INDEX_op_nand_vec:
3481    case INDEX_op_nor_vec:
3482    case INDEX_op_eqv_vec:
3483    case INDEX_op_ssadd_vec:
3484    case INDEX_op_usadd_vec:
3485    case INDEX_op_sssub_vec:
3486    case INDEX_op_ussub_vec:
3487    case INDEX_op_smin_vec:
3488    case INDEX_op_umin_vec:
3489    case INDEX_op_smax_vec:
3490    case INDEX_op_umax_vec:
3491    case INDEX_op_shlv_vec:
3492    case INDEX_op_shrv_vec:
3493    case INDEX_op_sarv_vec:
3494    case INDEX_op_rotlv_vec:
3495    case INDEX_op_rotrv_vec:
3496    case INDEX_op_shls_vec:
3497    case INDEX_op_shrs_vec:
3498    case INDEX_op_sars_vec:
3499    case INDEX_op_cmp_vec:
3500    case INDEX_op_x86_shufps_vec:
3501    case INDEX_op_x86_blend_vec:
3502    case INDEX_op_x86_packss_vec:
3503    case INDEX_op_x86_packus_vec:
3504    case INDEX_op_x86_vperm2i128_vec:
3505    case INDEX_op_x86_punpckl_vec:
3506    case INDEX_op_x86_punpckh_vec:
3507    case INDEX_op_x86_vpshldi_vec:
3508#if TCG_TARGET_REG_BITS == 32
3509    case INDEX_op_dup2_vec:
3510#endif
3511        return C_O1_I2(x, x, x);
3512
3513    case INDEX_op_abs_vec:
3514    case INDEX_op_dup_vec:
3515    case INDEX_op_not_vec:
3516    case INDEX_op_shli_vec:
3517    case INDEX_op_shri_vec:
3518    case INDEX_op_sari_vec:
3519    case INDEX_op_rotli_vec:
3520    case INDEX_op_x86_psrldq_vec:
3521        return C_O1_I1(x, x);
3522
3523    case INDEX_op_x86_vpshldv_vec:
3524    case INDEX_op_x86_vpshrdv_vec:
3525        return C_O1_I3(x, 0, x, x);
3526
3527    case INDEX_op_bitsel_vec:
3528    case INDEX_op_x86_vpblendvb_vec:
3529        return C_O1_I3(x, x, x, x);
3530
3531    default:
3532        g_assert_not_reached();
3533    }
3534}
3535
3536int tcg_can_emit_vec_op(TCGOpcode opc, TCGType type, unsigned vece)
3537{
3538    switch (opc) {
3539    case INDEX_op_add_vec:
3540    case INDEX_op_sub_vec:
3541    case INDEX_op_and_vec:
3542    case INDEX_op_or_vec:
3543    case INDEX_op_xor_vec:
3544    case INDEX_op_andc_vec:
3545    case INDEX_op_orc_vec:
3546    case INDEX_op_nand_vec:
3547    case INDEX_op_nor_vec:
3548    case INDEX_op_eqv_vec:
3549    case INDEX_op_not_vec:
3550    case INDEX_op_bitsel_vec:
3551        return 1;
3552    case INDEX_op_cmp_vec:
3553    case INDEX_op_cmpsel_vec:
3554        return -1;
3555
3556    case INDEX_op_rotli_vec:
3557        return have_avx512vl && vece >= MO_32 ? 1 : -1;
3558
3559    case INDEX_op_shli_vec:
3560    case INDEX_op_shri_vec:
3561        /* We must expand the operation for MO_8.  */
3562        return vece == MO_8 ? -1 : 1;
3563
3564    case INDEX_op_sari_vec:
3565        switch (vece) {
3566        case MO_8:
3567            return -1;
3568        case MO_16:
3569        case MO_32:
3570            return 1;
3571        case MO_64:
3572            if (have_avx512vl) {
3573                return 1;
3574            }
3575            /*
3576             * We can emulate this for MO_64, but it does not pay off
3577             * unless we're producing at least 4 values.
3578             */
3579            return type >= TCG_TYPE_V256 ? -1 : 0;
3580        }
3581        return 0;
3582
3583    case INDEX_op_shls_vec:
3584    case INDEX_op_shrs_vec:
3585        return vece >= MO_16;
3586    case INDEX_op_sars_vec:
3587        switch (vece) {
3588        case MO_16:
3589        case MO_32:
3590            return 1;
3591        case MO_64:
3592            return have_avx512vl;
3593        }
3594        return 0;
3595    case INDEX_op_rotls_vec:
3596        return vece >= MO_16 ? -1 : 0;
3597
3598    case INDEX_op_shlv_vec:
3599    case INDEX_op_shrv_vec:
3600        switch (vece) {
3601        case MO_16:
3602            return have_avx512bw;
3603        case MO_32:
3604        case MO_64:
3605            return have_avx2;
3606        }
3607        return 0;
3608    case INDEX_op_sarv_vec:
3609        switch (vece) {
3610        case MO_16:
3611            return have_avx512bw;
3612        case MO_32:
3613            return have_avx2;
3614        case MO_64:
3615            return have_avx512vl;
3616        }
3617        return 0;
3618    case INDEX_op_rotlv_vec:
3619    case INDEX_op_rotrv_vec:
3620        switch (vece) {
3621        case MO_16:
3622            return have_avx512vbmi2 ? -1 : 0;
3623        case MO_32:
3624        case MO_64:
3625            return have_avx512vl ? 1 : have_avx2 ? -1 : 0;
3626        }
3627        return 0;
3628
3629    case INDEX_op_mul_vec:
3630        switch (vece) {
3631        case MO_8:
3632            return -1;
3633        case MO_64:
3634            return have_avx512dq;
3635        }
3636        return 1;
3637
3638    case INDEX_op_ssadd_vec:
3639    case INDEX_op_usadd_vec:
3640    case INDEX_op_sssub_vec:
3641    case INDEX_op_ussub_vec:
3642        return vece <= MO_16;
3643    case INDEX_op_smin_vec:
3644    case INDEX_op_smax_vec:
3645    case INDEX_op_umin_vec:
3646    case INDEX_op_umax_vec:
3647    case INDEX_op_abs_vec:
3648        return vece <= MO_32 || have_avx512vl;
3649
3650    default:
3651        return 0;
3652    }
3653}
3654
3655static void expand_vec_shi(TCGType type, unsigned vece, TCGOpcode opc,
3656                           TCGv_vec v0, TCGv_vec v1, TCGArg imm)
3657{
3658    TCGv_vec t1, t2;
3659
3660    tcg_debug_assert(vece == MO_8);
3661
3662    t1 = tcg_temp_new_vec(type);
3663    t2 = tcg_temp_new_vec(type);
3664
3665    /*
3666     * Unpack to W, shift, and repack.  Tricky bits:
3667     * (1) Use punpck*bw x,x to produce DDCCBBAA,
3668     *     i.e. duplicate in other half of the 16-bit lane.
3669     * (2) For right-shift, add 8 so that the high half of the lane
3670     *     becomes zero.  For left-shift, and left-rotate, we must
3671     *     shift up and down again.
3672     * (3) Step 2 leaves high half zero such that PACKUSWB
3673     *     (pack with unsigned saturation) does not modify
3674     *     the quantity.
3675     */
3676    vec_gen_3(INDEX_op_x86_punpckl_vec, type, MO_8,
3677              tcgv_vec_arg(t1), tcgv_vec_arg(v1), tcgv_vec_arg(v1));
3678    vec_gen_3(INDEX_op_x86_punpckh_vec, type, MO_8,
3679              tcgv_vec_arg(t2), tcgv_vec_arg(v1), tcgv_vec_arg(v1));
3680
3681    if (opc != INDEX_op_rotli_vec) {
3682        imm += 8;
3683    }
3684    if (opc == INDEX_op_shri_vec) {
3685        tcg_gen_shri_vec(MO_16, t1, t1, imm);
3686        tcg_gen_shri_vec(MO_16, t2, t2, imm);
3687    } else {
3688        tcg_gen_shli_vec(MO_16, t1, t1, imm);
3689        tcg_gen_shli_vec(MO_16, t2, t2, imm);
3690        tcg_gen_shri_vec(MO_16, t1, t1, 8);
3691        tcg_gen_shri_vec(MO_16, t2, t2, 8);
3692    }
3693
3694    vec_gen_3(INDEX_op_x86_packus_vec, type, MO_8,
3695              tcgv_vec_arg(v0), tcgv_vec_arg(t1), tcgv_vec_arg(t2));
3696    tcg_temp_free_vec(t1);
3697    tcg_temp_free_vec(t2);
3698}
3699
3700static void expand_vec_sari(TCGType type, unsigned vece,
3701                            TCGv_vec v0, TCGv_vec v1, TCGArg imm)
3702{
3703    TCGv_vec t1, t2;
3704
3705    switch (vece) {
3706    case MO_8:
3707        /* Unpack to W, shift, and repack, as in expand_vec_shi.  */
3708        t1 = tcg_temp_new_vec(type);
3709        t2 = tcg_temp_new_vec(type);
3710        vec_gen_3(INDEX_op_x86_punpckl_vec, type, MO_8,
3711                  tcgv_vec_arg(t1), tcgv_vec_arg(v1), tcgv_vec_arg(v1));
3712        vec_gen_3(INDEX_op_x86_punpckh_vec, type, MO_8,
3713                  tcgv_vec_arg(t2), tcgv_vec_arg(v1), tcgv_vec_arg(v1));
3714        tcg_gen_sari_vec(MO_16, t1, t1, imm + 8);
3715        tcg_gen_sari_vec(MO_16, t2, t2, imm + 8);
3716        vec_gen_3(INDEX_op_x86_packss_vec, type, MO_8,
3717                  tcgv_vec_arg(v0), tcgv_vec_arg(t1), tcgv_vec_arg(t2));
3718        tcg_temp_free_vec(t1);
3719        tcg_temp_free_vec(t2);
3720        break;
3721
3722    case MO_64:
3723        t1 = tcg_temp_new_vec(type);
3724        if (imm <= 32) {
3725            /*
3726             * We can emulate a small sign extend by performing an arithmetic
3727             * 32-bit shift and overwriting the high half of a 64-bit logical
3728             * shift.  Note that the ISA says shift of 32 is valid, but TCG
3729             * does not, so we have to bound the smaller shift -- we get the
3730             * same result in the high half either way.
3731             */
3732            tcg_gen_sari_vec(MO_32, t1, v1, MIN(imm, 31));
3733            tcg_gen_shri_vec(MO_64, v0, v1, imm);
3734            vec_gen_4(INDEX_op_x86_blend_vec, type, MO_32,
3735                      tcgv_vec_arg(v0), tcgv_vec_arg(v0),
3736                      tcgv_vec_arg(t1), 0xaa);
3737        } else {
3738            /* Otherwise we will need to use a compare vs 0 to produce
3739             * the sign-extend, shift and merge.
3740             */
3741            tcg_gen_cmp_vec(TCG_COND_GT, MO_64, t1,
3742                            tcg_constant_vec(type, MO_64, 0), v1);
3743            tcg_gen_shri_vec(MO_64, v0, v1, imm);
3744            tcg_gen_shli_vec(MO_64, t1, t1, 64 - imm);
3745            tcg_gen_or_vec(MO_64, v0, v0, t1);
3746        }
3747        tcg_temp_free_vec(t1);
3748        break;
3749
3750    default:
3751        g_assert_not_reached();
3752    }
3753}
3754
3755static void expand_vec_rotli(TCGType type, unsigned vece,
3756                             TCGv_vec v0, TCGv_vec v1, TCGArg imm)
3757{
3758    TCGv_vec t;
3759
3760    if (vece == MO_8) {
3761        expand_vec_shi(type, vece, INDEX_op_rotli_vec, v0, v1, imm);
3762        return;
3763    }
3764
3765    if (have_avx512vbmi2) {
3766        vec_gen_4(INDEX_op_x86_vpshldi_vec, type, vece,
3767                  tcgv_vec_arg(v0), tcgv_vec_arg(v1), tcgv_vec_arg(v1), imm);
3768        return;
3769    }
3770
3771    t = tcg_temp_new_vec(type);
3772    tcg_gen_shli_vec(vece, t, v1, imm);
3773    tcg_gen_shri_vec(vece, v0, v1, (8 << vece) - imm);
3774    tcg_gen_or_vec(vece, v0, v0, t);
3775    tcg_temp_free_vec(t);
3776}
3777
3778static void expand_vec_rotv(TCGType type, unsigned vece, TCGv_vec v0,
3779                            TCGv_vec v1, TCGv_vec sh, bool right)
3780{
3781    TCGv_vec t;
3782
3783    if (have_avx512vbmi2) {
3784        vec_gen_4(right ? INDEX_op_x86_vpshrdv_vec : INDEX_op_x86_vpshldv_vec,
3785                  type, vece, tcgv_vec_arg(v0), tcgv_vec_arg(v1),
3786                  tcgv_vec_arg(v1), tcgv_vec_arg(sh));
3787        return;
3788    }
3789
3790    t = tcg_temp_new_vec(type);
3791    tcg_gen_dupi_vec(vece, t, 8 << vece);
3792    tcg_gen_sub_vec(vece, t, t, sh);
3793    if (right) {
3794        tcg_gen_shlv_vec(vece, t, v1, t);
3795        tcg_gen_shrv_vec(vece, v0, v1, sh);
3796    } else {
3797        tcg_gen_shrv_vec(vece, t, v1, t);
3798        tcg_gen_shlv_vec(vece, v0, v1, sh);
3799    }
3800    tcg_gen_or_vec(vece, v0, v0, t);
3801    tcg_temp_free_vec(t);
3802}
3803
3804static void expand_vec_rotls(TCGType type, unsigned vece,
3805                             TCGv_vec v0, TCGv_vec v1, TCGv_i32 lsh)
3806{
3807    TCGv_vec t = tcg_temp_new_vec(type);
3808
3809    tcg_debug_assert(vece != MO_8);
3810
3811    if (vece >= MO_32 ? have_avx512vl : have_avx512vbmi2) {
3812        tcg_gen_dup_i32_vec(vece, t, lsh);
3813        if (vece >= MO_32) {
3814            tcg_gen_rotlv_vec(vece, v0, v1, t);
3815        } else {
3816            expand_vec_rotv(type, vece, v0, v1, t, false);
3817        }
3818    } else {
3819        TCGv_i32 rsh = tcg_temp_new_i32();
3820
3821        tcg_gen_neg_i32(rsh, lsh);
3822        tcg_gen_andi_i32(rsh, rsh, (8 << vece) - 1);
3823        tcg_gen_shls_vec(vece, t, v1, lsh);
3824        tcg_gen_shrs_vec(vece, v0, v1, rsh);
3825        tcg_gen_or_vec(vece, v0, v0, t);
3826
3827        tcg_temp_free_i32(rsh);
3828    }
3829
3830    tcg_temp_free_vec(t);
3831}
3832
3833static void expand_vec_mul(TCGType type, unsigned vece,
3834                           TCGv_vec v0, TCGv_vec v1, TCGv_vec v2)
3835{
3836    TCGv_vec t1, t2, t3, t4, zero;
3837
3838    tcg_debug_assert(vece == MO_8);
3839
3840    /*
3841     * Unpack v1 bytes to words, 0 | x.
3842     * Unpack v2 bytes to words, y | 0.
3843     * This leaves the 8-bit result, x * y, with 8 bits of right padding.
3844     * Shift logical right by 8 bits to clear the high 8 bytes before
3845     * using an unsigned saturated pack.
3846     *
3847     * The difference between the V64, V128 and V256 cases is merely how
3848     * we distribute the expansion between temporaries.
3849     */
3850    switch (type) {
3851    case TCG_TYPE_V64:
3852        t1 = tcg_temp_new_vec(TCG_TYPE_V128);
3853        t2 = tcg_temp_new_vec(TCG_TYPE_V128);
3854        zero = tcg_constant_vec(TCG_TYPE_V128, MO_8, 0);
3855        vec_gen_3(INDEX_op_x86_punpckl_vec, TCG_TYPE_V128, MO_8,
3856                  tcgv_vec_arg(t1), tcgv_vec_arg(v1), tcgv_vec_arg(zero));
3857        vec_gen_3(INDEX_op_x86_punpckl_vec, TCG_TYPE_V128, MO_8,
3858                  tcgv_vec_arg(t2), tcgv_vec_arg(zero), tcgv_vec_arg(v2));
3859        tcg_gen_mul_vec(MO_16, t1, t1, t2);
3860        tcg_gen_shri_vec(MO_16, t1, t1, 8);
3861        vec_gen_3(INDEX_op_x86_packus_vec, TCG_TYPE_V128, MO_8,
3862                  tcgv_vec_arg(v0), tcgv_vec_arg(t1), tcgv_vec_arg(t1));
3863        tcg_temp_free_vec(t1);
3864        tcg_temp_free_vec(t2);
3865        break;
3866
3867    case TCG_TYPE_V128:
3868    case TCG_TYPE_V256:
3869        t1 = tcg_temp_new_vec(type);
3870        t2 = tcg_temp_new_vec(type);
3871        t3 = tcg_temp_new_vec(type);
3872        t4 = tcg_temp_new_vec(type);
3873        zero = tcg_constant_vec(TCG_TYPE_V128, MO_8, 0);
3874        vec_gen_3(INDEX_op_x86_punpckl_vec, type, MO_8,
3875                  tcgv_vec_arg(t1), tcgv_vec_arg(v1), tcgv_vec_arg(zero));
3876        vec_gen_3(INDEX_op_x86_punpckl_vec, type, MO_8,
3877                  tcgv_vec_arg(t2), tcgv_vec_arg(zero), tcgv_vec_arg(v2));
3878        vec_gen_3(INDEX_op_x86_punpckh_vec, type, MO_8,
3879                  tcgv_vec_arg(t3), tcgv_vec_arg(v1), tcgv_vec_arg(zero));
3880        vec_gen_3(INDEX_op_x86_punpckh_vec, type, MO_8,
3881                  tcgv_vec_arg(t4), tcgv_vec_arg(zero), tcgv_vec_arg(v2));
3882        tcg_gen_mul_vec(MO_16, t1, t1, t2);
3883        tcg_gen_mul_vec(MO_16, t3, t3, t4);
3884        tcg_gen_shri_vec(MO_16, t1, t1, 8);
3885        tcg_gen_shri_vec(MO_16, t3, t3, 8);
3886        vec_gen_3(INDEX_op_x86_packus_vec, type, MO_8,
3887                  tcgv_vec_arg(v0), tcgv_vec_arg(t1), tcgv_vec_arg(t3));
3888        tcg_temp_free_vec(t1);
3889        tcg_temp_free_vec(t2);
3890        tcg_temp_free_vec(t3);
3891        tcg_temp_free_vec(t4);
3892        break;
3893
3894    default:
3895        g_assert_not_reached();
3896    }
3897}
3898
3899static bool expand_vec_cmp_noinv(TCGType type, unsigned vece, TCGv_vec v0,
3900                                 TCGv_vec v1, TCGv_vec v2, TCGCond cond)
3901{
3902    enum {
3903        NEED_INV  = 1,
3904        NEED_SWAP = 2,
3905        NEED_BIAS = 4,
3906        NEED_UMIN = 8,
3907        NEED_UMAX = 16,
3908    };
3909    TCGv_vec t1, t2, t3;
3910    uint8_t fixup;
3911
3912    switch (cond) {
3913    case TCG_COND_EQ:
3914    case TCG_COND_GT:
3915        fixup = 0;
3916        break;
3917    case TCG_COND_NE:
3918    case TCG_COND_LE:
3919        fixup = NEED_INV;
3920        break;
3921    case TCG_COND_LT:
3922        fixup = NEED_SWAP;
3923        break;
3924    case TCG_COND_GE:
3925        fixup = NEED_SWAP | NEED_INV;
3926        break;
3927    case TCG_COND_LEU:
3928        if (tcg_can_emit_vec_op(INDEX_op_umin_vec, type, vece)) {
3929            fixup = NEED_UMIN;
3930        } else {
3931            fixup = NEED_BIAS | NEED_INV;
3932        }
3933        break;
3934    case TCG_COND_GTU:
3935        if (tcg_can_emit_vec_op(INDEX_op_umin_vec, type, vece)) {
3936            fixup = NEED_UMIN | NEED_INV;
3937        } else {
3938            fixup = NEED_BIAS;
3939        }
3940        break;
3941    case TCG_COND_GEU:
3942        if (tcg_can_emit_vec_op(INDEX_op_umax_vec, type, vece)) {
3943            fixup = NEED_UMAX;
3944        } else {
3945            fixup = NEED_BIAS | NEED_SWAP | NEED_INV;
3946        }
3947        break;
3948    case TCG_COND_LTU:
3949        if (tcg_can_emit_vec_op(INDEX_op_umax_vec, type, vece)) {
3950            fixup = NEED_UMAX | NEED_INV;
3951        } else {
3952            fixup = NEED_BIAS | NEED_SWAP;
3953        }
3954        break;
3955    default:
3956        g_assert_not_reached();
3957    }
3958
3959    if (fixup & NEED_INV) {
3960        cond = tcg_invert_cond(cond);
3961    }
3962    if (fixup & NEED_SWAP) {
3963        t1 = v1, v1 = v2, v2 = t1;
3964        cond = tcg_swap_cond(cond);
3965    }
3966
3967    t1 = t2 = NULL;
3968    if (fixup & (NEED_UMIN | NEED_UMAX)) {
3969        t1 = tcg_temp_new_vec(type);
3970        if (fixup & NEED_UMIN) {
3971            tcg_gen_umin_vec(vece, t1, v1, v2);
3972        } else {
3973            tcg_gen_umax_vec(vece, t1, v1, v2);
3974        }
3975        v2 = t1;
3976        cond = TCG_COND_EQ;
3977    } else if (fixup & NEED_BIAS) {
3978        t1 = tcg_temp_new_vec(type);
3979        t2 = tcg_temp_new_vec(type);
3980        t3 = tcg_constant_vec(type, vece, 1ull << ((8 << vece) - 1));
3981        tcg_gen_sub_vec(vece, t1, v1, t3);
3982        tcg_gen_sub_vec(vece, t2, v2, t3);
3983        v1 = t1;
3984        v2 = t2;
3985        cond = tcg_signed_cond(cond);
3986    }
3987
3988    tcg_debug_assert(cond == TCG_COND_EQ || cond == TCG_COND_GT);
3989    /* Expand directly; do not recurse.  */
3990    vec_gen_4(INDEX_op_cmp_vec, type, vece,
3991              tcgv_vec_arg(v0), tcgv_vec_arg(v1), tcgv_vec_arg(v2), cond);
3992
3993    if (t1) {
3994        tcg_temp_free_vec(t1);
3995        if (t2) {
3996            tcg_temp_free_vec(t2);
3997        }
3998    }
3999    return fixup & NEED_INV;
4000}
4001
4002static void expand_vec_cmp(TCGType type, unsigned vece, TCGv_vec v0,
4003                           TCGv_vec v1, TCGv_vec v2, TCGCond cond)
4004{
4005    if (expand_vec_cmp_noinv(type, vece, v0, v1, v2, cond)) {
4006        tcg_gen_not_vec(vece, v0, v0);
4007    }
4008}
4009
4010static void expand_vec_cmpsel(TCGType type, unsigned vece, TCGv_vec v0,
4011                              TCGv_vec c1, TCGv_vec c2,
4012                              TCGv_vec v3, TCGv_vec v4, TCGCond cond)
4013{
4014    TCGv_vec t = tcg_temp_new_vec(type);
4015
4016    if (expand_vec_cmp_noinv(type, vece, t, c1, c2, cond)) {
4017        /* Invert the sense of the compare by swapping arguments.  */
4018        TCGv_vec x;
4019        x = v3, v3 = v4, v4 = x;
4020    }
4021    vec_gen_4(INDEX_op_x86_vpblendvb_vec, type, vece,
4022              tcgv_vec_arg(v0), tcgv_vec_arg(v4),
4023              tcgv_vec_arg(v3), tcgv_vec_arg(t));
4024    tcg_temp_free_vec(t);
4025}
4026
4027void tcg_expand_vec_op(TCGOpcode opc, TCGType type, unsigned vece,
4028                       TCGArg a0, ...)
4029{
4030    va_list va;
4031    TCGArg a2;
4032    TCGv_vec v0, v1, v2, v3, v4;
4033
4034    va_start(va, a0);
4035    v0 = temp_tcgv_vec(arg_temp(a0));
4036    v1 = temp_tcgv_vec(arg_temp(va_arg(va, TCGArg)));
4037    a2 = va_arg(va, TCGArg);
4038
4039    switch (opc) {
4040    case INDEX_op_shli_vec:
4041    case INDEX_op_shri_vec:
4042        expand_vec_shi(type, vece, opc, v0, v1, a2);
4043        break;
4044
4045    case INDEX_op_sari_vec:
4046        expand_vec_sari(type, vece, v0, v1, a2);
4047        break;
4048
4049    case INDEX_op_rotli_vec:
4050        expand_vec_rotli(type, vece, v0, v1, a2);
4051        break;
4052
4053    case INDEX_op_rotls_vec:
4054        expand_vec_rotls(type, vece, v0, v1, temp_tcgv_i32(arg_temp(a2)));
4055        break;
4056
4057    case INDEX_op_rotlv_vec:
4058        v2 = temp_tcgv_vec(arg_temp(a2));
4059        expand_vec_rotv(type, vece, v0, v1, v2, false);
4060        break;
4061    case INDEX_op_rotrv_vec:
4062        v2 = temp_tcgv_vec(arg_temp(a2));
4063        expand_vec_rotv(type, vece, v0, v1, v2, true);
4064        break;
4065
4066    case INDEX_op_mul_vec:
4067        v2 = temp_tcgv_vec(arg_temp(a2));
4068        expand_vec_mul(type, vece, v0, v1, v2);
4069        break;
4070
4071    case INDEX_op_cmp_vec:
4072        v2 = temp_tcgv_vec(arg_temp(a2));
4073        expand_vec_cmp(type, vece, v0, v1, v2, va_arg(va, TCGArg));
4074        break;
4075
4076    case INDEX_op_cmpsel_vec:
4077        v2 = temp_tcgv_vec(arg_temp(a2));
4078        v3 = temp_tcgv_vec(arg_temp(va_arg(va, TCGArg)));
4079        v4 = temp_tcgv_vec(arg_temp(va_arg(va, TCGArg)));
4080        expand_vec_cmpsel(type, vece, v0, v1, v2, v3, v4, va_arg(va, TCGArg));
4081        break;
4082
4083    default:
4084        break;
4085    }
4086
4087    va_end(va);
4088}
4089
4090static const int tcg_target_callee_save_regs[] = {
4091#if TCG_TARGET_REG_BITS == 64
4092    TCG_REG_RBP,
4093    TCG_REG_RBX,
4094#if defined(_WIN64)
4095    TCG_REG_RDI,
4096    TCG_REG_RSI,
4097#endif
4098    TCG_REG_R12,
4099    TCG_REG_R13,
4100    TCG_REG_R14, /* Currently used for the global env. */
4101    TCG_REG_R15,
4102#else
4103    TCG_REG_EBP, /* Currently used for the global env. */
4104    TCG_REG_EBX,
4105    TCG_REG_ESI,
4106    TCG_REG_EDI,
4107#endif
4108};
4109
/*
 * The frame size is computed with macros so that it can be shared
 * between tcg_target_qemu_prologue and tcg_register_jit.
 */

/*
 * Bytes taken by the pushed callee-saved registers plus one extra
 * register-sized slot (presumably the return address).
 */
#define PUSH_SIZE \
    ((1 + ARRAY_SIZE(tcg_target_callee_save_regs)) * (TCG_TARGET_REG_BITS / 8))

/*
 * Total frame: pushes, static call-argument area and the TCG temp
 * buffer, rounded up to a multiple of TCG_TARGET_STACK_ALIGN.
 */
#define FRAME_SIZE \
    ((PUSH_SIZE + TCG_STATIC_CALL_ARGS_SIZE \
      + CPU_TEMP_BUF_NLONGS * sizeof(long) \
      + TCG_TARGET_STACK_ALIGN - 1) \
     & ~(TCG_TARGET_STACK_ALIGN - 1))
4123
4124/* Generate global QEMU prologue and epilogue code */
4125static void tcg_target_qemu_prologue(TCGContext *s)
4126{
4127    int i, stack_addend;
4128
4129    /* TB prologue */
4130
4131    /* Reserve some stack space, also for TCG temps.  */
4132    stack_addend = FRAME_SIZE - PUSH_SIZE;
4133    tcg_set_frame(s, TCG_REG_CALL_STACK, TCG_STATIC_CALL_ARGS_SIZE,
4134                  CPU_TEMP_BUF_NLONGS * sizeof(long));
4135
4136    /* Save all callee saved registers.  */
4137    for (i = 0; i < ARRAY_SIZE(tcg_target_callee_save_regs); i++) {
4138        tcg_out_push(s, tcg_target_callee_save_regs[i]);
4139    }
4140
4141    if (!tcg_use_softmmu && guest_base) {
4142        int seg = setup_guest_base_seg();
4143        if (seg != 0) {
4144            x86_guest_base.seg = seg;
4145        } else if (guest_base == (int32_t)guest_base) {
4146            x86_guest_base.ofs = guest_base;
4147        } else {
4148            assert(TCG_TARGET_REG_BITS == 64);
4149            /* Choose R12 because, as a base, it requires a SIB byte. */
4150            x86_guest_base.index = TCG_REG_R12;
4151            tcg_out_movi(s, TCG_TYPE_PTR, x86_guest_base.index, guest_base);
4152            tcg_regset_set_reg(s->reserved_regs, x86_guest_base.index);
4153        }
4154    }
4155
4156    if (TCG_TARGET_REG_BITS == 32) {
4157        tcg_out_ld(s, TCG_TYPE_PTR, TCG_AREG0, TCG_REG_ESP,
4158                   (ARRAY_SIZE(tcg_target_callee_save_regs) + 1) * 4);
4159        tcg_out_addi(s, TCG_REG_ESP, -stack_addend);
4160        /* jmp *tb.  */
4161        tcg_out_modrm_offset(s, OPC_GRP5, EXT5_JMPN_Ev, TCG_REG_ESP,
4162                             (ARRAY_SIZE(tcg_target_callee_save_regs) + 2) * 4
4163                             + stack_addend);
4164    } else {
4165        tcg_out_mov(s, TCG_TYPE_PTR, TCG_AREG0, tcg_target_call_iarg_regs[0]);
4166        tcg_out_addi(s, TCG_REG_ESP, -stack_addend);
4167        /* jmp *tb.  */
4168        tcg_out_modrm(s, OPC_GRP5, EXT5_JMPN_Ev, tcg_target_call_iarg_regs[1]);
4169    }
4170
4171    /*
4172     * Return path for goto_ptr. Set return value to 0, a-la exit_tb,
4173     * and fall through to the rest of the epilogue.
4174     */
4175    tcg_code_gen_epilogue = tcg_splitwx_to_rx(s->code_ptr);
4176    tcg_out_movi(s, TCG_TYPE_REG, TCG_REG_EAX, 0);
4177
4178    /* TB epilogue */
4179    tb_ret_addr = tcg_splitwx_to_rx(s->code_ptr);
4180
4181    tcg_out_addi(s, TCG_REG_CALL_STACK, stack_addend);
4182
4183    if (have_avx2) {
4184        tcg_out_vex_opc(s, OPC_VZEROUPPER, 0, 0, 0, 0);
4185    }
4186    for (i = ARRAY_SIZE(tcg_target_callee_save_regs) - 1; i >= 0; i--) {
4187        tcg_out_pop(s, tcg_target_callee_save_regs[i]);
4188    }
4189    tcg_out_opc(s, OPC_RET, 0, 0, 0);
4190}
4191
4192static void tcg_out_tb_start(TCGContext *s)
4193{
4194    /* nothing to do */
4195}
4196
4197static void tcg_out_nop_fill(tcg_insn_unit *p, int count)
4198{
4199    memset(p, 0x90, count);
4200}
4201
4202static void tcg_target_init(TCGContext *s)
4203{
4204    tcg_target_available_regs[TCG_TYPE_I32] = ALL_GENERAL_REGS;
4205    if (TCG_TARGET_REG_BITS == 64) {
4206        tcg_target_available_regs[TCG_TYPE_I64] = ALL_GENERAL_REGS;
4207    }
4208    if (have_avx1) {
4209        tcg_target_available_regs[TCG_TYPE_V64] = ALL_VECTOR_REGS;
4210        tcg_target_available_regs[TCG_TYPE_V128] = ALL_VECTOR_REGS;
4211    }
4212    if (have_avx2) {
4213        tcg_target_available_regs[TCG_TYPE_V256] = ALL_VECTOR_REGS;
4214    }
4215
4216    tcg_target_call_clobber_regs = ALL_VECTOR_REGS;
4217    tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_EAX);
4218    tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_EDX);
4219    tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_ECX);
4220    if (TCG_TARGET_REG_BITS == 64) {
4221#if !defined(_WIN64)
4222        tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_RDI);
4223        tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_RSI);
4224#endif
4225        tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_R8);
4226        tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_R9);
4227        tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_R10);
4228        tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_R11);
4229    }
4230
4231    s->reserved_regs = 0;
4232    tcg_regset_set_reg(s->reserved_regs, TCG_REG_CALL_STACK);
4233    tcg_regset_set_reg(s->reserved_regs, TCG_TMP_VEC);
4234#ifdef _WIN64
4235    /* These are call saved, and we don't save them, so don't use them. */
4236    tcg_regset_set_reg(s->reserved_regs, TCG_REG_XMM6);
4237    tcg_regset_set_reg(s->reserved_regs, TCG_REG_XMM7);
4238    tcg_regset_set_reg(s->reserved_regs, TCG_REG_XMM8);
4239    tcg_regset_set_reg(s->reserved_regs, TCG_REG_XMM9);
4240    tcg_regset_set_reg(s->reserved_regs, TCG_REG_XMM10);
4241    tcg_regset_set_reg(s->reserved_regs, TCG_REG_XMM11);
4242    tcg_regset_set_reg(s->reserved_regs, TCG_REG_XMM12);
4243    tcg_regset_set_reg(s->reserved_regs, TCG_REG_XMM13);
4244    tcg_regset_set_reg(s->reserved_regs, TCG_REG_XMM14);
4245    tcg_regset_set_reg(s->reserved_regs, TCG_REG_XMM15);
4246#endif
4247}
4248
4249typedef struct {
4250    DebugFrameHeader h;
4251    uint8_t fde_def_cfa[4];
4252    uint8_t fde_reg_ofs[14];
4253} DebugFrame;
4254
4255/* We're expecting a 2 byte uleb128 encoded value.  */
4256QEMU_BUILD_BUG_ON(FRAME_SIZE >= (1 << 14));
4257
4258#if !defined(__ELF__)
4259    /* Host machine without ELF. */
4260#elif TCG_TARGET_REG_BITS == 64
4261#define ELF_HOST_MACHINE EM_X86_64
4262static const DebugFrame debug_frame = {
4263    .h.cie.len = sizeof(DebugFrameCIE)-4, /* length after .len member */
4264    .h.cie.id = -1,
4265    .h.cie.version = 1,
4266    .h.cie.code_align = 1,
4267    .h.cie.data_align = 0x78,             /* sleb128 -8 */
4268    .h.cie.return_column = 16,
4269
4270    /* Total FDE size does not include the "len" member.  */
4271    .h.fde.len = sizeof(DebugFrame) - offsetof(DebugFrame, h.fde.cie_offset),
4272
4273    .fde_def_cfa = {
4274        12, 7,                          /* DW_CFA_def_cfa %rsp, ... */
4275        (FRAME_SIZE & 0x7f) | 0x80,     /* ... uleb128 FRAME_SIZE */
4276        (FRAME_SIZE >> 7)
4277    },
4278    .fde_reg_ofs = {
4279        0x90, 1,                        /* DW_CFA_offset, %rip, -8 */
4280        /* The following ordering must match tcg_target_callee_save_regs.  */
4281        0x86, 2,                        /* DW_CFA_offset, %rbp, -16 */
4282        0x83, 3,                        /* DW_CFA_offset, %rbx, -24 */
4283        0x8c, 4,                        /* DW_CFA_offset, %r12, -32 */
4284        0x8d, 5,                        /* DW_CFA_offset, %r13, -40 */
4285        0x8e, 6,                        /* DW_CFA_offset, %r14, -48 */
4286        0x8f, 7,                        /* DW_CFA_offset, %r15, -56 */
4287    }
4288};
4289#else
4290#define ELF_HOST_MACHINE EM_386
4291static const DebugFrame debug_frame = {
4292    .h.cie.len = sizeof(DebugFrameCIE)-4, /* length after .len member */
4293    .h.cie.id = -1,
4294    .h.cie.version = 1,
4295    .h.cie.code_align = 1,
4296    .h.cie.data_align = 0x7c,             /* sleb128 -4 */
4297    .h.cie.return_column = 8,
4298
4299    /* Total FDE size does not include the "len" member.  */
4300    .h.fde.len = sizeof(DebugFrame) - offsetof(DebugFrame, h.fde.cie_offset),
4301
4302    .fde_def_cfa = {
4303        12, 4,                          /* DW_CFA_def_cfa %esp, ... */
4304        (FRAME_SIZE & 0x7f) | 0x80,     /* ... uleb128 FRAME_SIZE */
4305        (FRAME_SIZE >> 7)
4306    },
4307    .fde_reg_ofs = {
4308        0x88, 1,                        /* DW_CFA_offset, %eip, -4 */
4309        /* The following ordering must match tcg_target_callee_save_regs.  */
4310        0x85, 2,                        /* DW_CFA_offset, %ebp, -8 */
4311        0x83, 3,                        /* DW_CFA_offset, %ebx, -12 */
4312        0x86, 4,                        /* DW_CFA_offset, %esi, -16 */
4313        0x87, 5,                        /* DW_CFA_offset, %edi, -20 */
4314    }
4315};
4316#endif
4317
#ifdef ELF_HOST_MACHINE
/*
 * Register the code buffer [@buf, @buf + @buf_size) together with this
 * backend's static debug frame.  Only built when ELF_HOST_MACHINE is
 * defined.
 */
void tcg_register_jit(const void *buf, size_t buf_size)
{
    tcg_register_jit_int(buf, buf_size, &debug_frame, sizeof(debug_frame));
}
#endif /* ELF_HOST_MACHINE */
4324