xref: /qemu/tcg/i386/tcg-target.c.inc (revision de6cd759)
1/*
2 * Tiny Code Generator for QEMU
3 *
4 * Copyright (c) 2008 Fabrice Bellard
5 *
6 * Permission is hereby granted, free of charge, to any person obtaining a copy
7 * of this software and associated documentation files (the "Software"), to deal
8 * in the Software without restriction, including without limitation the rights
9 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
10 * copies of the Software, and to permit persons to whom the Software is
11 * furnished to do so, subject to the following conditions:
12 *
13 * The above copyright notice and this permission notice shall be included in
14 * all copies or substantial portions of the Software.
15 *
16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
19 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
22 * THE SOFTWARE.
23 */
24
25#include "../tcg-ldst.c.inc"
26#include "../tcg-pool.c.inc"
27
28#ifdef CONFIG_DEBUG_TCG
29static const char * const tcg_target_reg_names[TCG_TARGET_NB_REGS] = {
30#if TCG_TARGET_REG_BITS == 64
31    "%rax", "%rcx", "%rdx", "%rbx", "%rsp", "%rbp", "%rsi", "%rdi",
32#else
33    "%eax", "%ecx", "%edx", "%ebx", "%esp", "%ebp", "%esi", "%edi",
34#endif
35    "%r8",  "%r9",  "%r10", "%r11", "%r12", "%r13", "%r14", "%r15",
36    "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7",
37#if TCG_TARGET_REG_BITS == 64
38    "%xmm8", "%xmm9", "%xmm10", "%xmm11",
39    "%xmm12", "%xmm13", "%xmm14", "%xmm15",
40#endif
41};
42#endif
43
44static const int tcg_target_reg_alloc_order[] = {
45#if TCG_TARGET_REG_BITS == 64
46    TCG_REG_RBP,
47    TCG_REG_RBX,
48    TCG_REG_R12,
49    TCG_REG_R13,
50    TCG_REG_R14,
51    TCG_REG_R15,
52    TCG_REG_R10,
53    TCG_REG_R11,
54    TCG_REG_R9,
55    TCG_REG_R8,
56    TCG_REG_RCX,
57    TCG_REG_RDX,
58    TCG_REG_RSI,
59    TCG_REG_RDI,
60    TCG_REG_RAX,
61#else
62    TCG_REG_EBX,
63    TCG_REG_ESI,
64    TCG_REG_EDI,
65    TCG_REG_EBP,
66    TCG_REG_ECX,
67    TCG_REG_EDX,
68    TCG_REG_EAX,
69#endif
70    TCG_REG_XMM0,
71    TCG_REG_XMM1,
72    TCG_REG_XMM2,
73    TCG_REG_XMM3,
74    TCG_REG_XMM4,
75    TCG_REG_XMM5,
76#ifndef _WIN64
77    /* The Win64 ABI has xmm6-xmm15 as callee-saves, and we do not save
78       any of them.  Therefore only allow xmm0-xmm5 to be allocated.  */
79    TCG_REG_XMM6,
80    TCG_REG_XMM7,
81#if TCG_TARGET_REG_BITS == 64
82    TCG_REG_XMM8,
83    TCG_REG_XMM9,
84    TCG_REG_XMM10,
85    TCG_REG_XMM11,
86    TCG_REG_XMM12,
87    TCG_REG_XMM13,
88    TCG_REG_XMM14,
89    TCG_REG_XMM15,
90#endif
91#endif
92};
93
94#define TCG_TMP_VEC  TCG_REG_XMM5
95
96static const int tcg_target_call_iarg_regs[] = {
97#if TCG_TARGET_REG_BITS == 64
98#if defined(_WIN64)
99    TCG_REG_RCX,
100    TCG_REG_RDX,
101#else
102    TCG_REG_RDI,
103    TCG_REG_RSI,
104    TCG_REG_RDX,
105    TCG_REG_RCX,
106#endif
107    TCG_REG_R8,
108    TCG_REG_R9,
109#else
110    /* 32-bit mode uses a stack-based calling convention (GCC default). */
111#endif
112};
113
114static TCGReg tcg_target_call_oarg_reg(TCGCallReturnKind kind, int slot)
115{
116    switch (kind) {
117    case TCG_CALL_RET_NORMAL:
118        tcg_debug_assert(slot >= 0 && slot <= 1);
119        return slot ? TCG_REG_EDX : TCG_REG_EAX;
120#ifdef _WIN64
121    case TCG_CALL_RET_BY_VEC:
122        tcg_debug_assert(slot == 0);
123        return TCG_REG_XMM0;
124#endif
125    default:
126        g_assert_not_reached();
127    }
128}
129
130/* Constants we accept.  */
131#define TCG_CT_CONST_S32 0x100
132#define TCG_CT_CONST_U32 0x200
133#define TCG_CT_CONST_I32 0x400
134#define TCG_CT_CONST_WSZ 0x800
135
136/* Registers used with L constraint, which are the first argument
137   registers on x86_64, and two random call clobbered registers on
138   i386. */
139#if TCG_TARGET_REG_BITS == 64
140# define TCG_REG_L0 tcg_target_call_iarg_regs[0]
141# define TCG_REG_L1 tcg_target_call_iarg_regs[1]
142#else
143# define TCG_REG_L0 TCG_REG_EAX
144# define TCG_REG_L1 TCG_REG_EDX
145#endif
146
147#define ALL_BYTEH_REGS         0x0000000fu
148#if TCG_TARGET_REG_BITS == 64
149# define ALL_GENERAL_REGS      0x0000ffffu
150# define ALL_VECTOR_REGS       0xffff0000u
151# define ALL_BYTEL_REGS        ALL_GENERAL_REGS
152#else
153# define ALL_GENERAL_REGS      0x000000ffu
154# define ALL_VECTOR_REGS       0x00ff0000u
155# define ALL_BYTEL_REGS        ALL_BYTEH_REGS
156#endif
157#ifdef CONFIG_SOFTMMU
158# define SOFTMMU_RESERVE_REGS  ((1 << TCG_REG_L0) | (1 << TCG_REG_L1))
159#else
160# define SOFTMMU_RESERVE_REGS  0
161#endif
162
163/* For 64-bit, we always know that CMOV is available.  */
164#if TCG_TARGET_REG_BITS == 64
165# define have_cmov      true
166#else
167# define have_cmov      (cpuinfo & CPUINFO_CMOV)
168#endif
169#define have_bmi2       (cpuinfo & CPUINFO_BMI2)
170#define have_lzcnt      (cpuinfo & CPUINFO_LZCNT)
171
172static const tcg_insn_unit *tb_ret_addr;
173
174static bool patch_reloc(tcg_insn_unit *code_ptr, int type,
175                        intptr_t value, intptr_t addend)
176{
177    value += addend;
178    switch(type) {
179    case R_386_PC32:
180        value -= (uintptr_t)tcg_splitwx_to_rx(code_ptr);
181        if (value != (int32_t)value) {
182            return false;
183        }
184        /* FALLTHRU */
185    case R_386_32:
186        tcg_patch32(code_ptr, value);
187        break;
188    case R_386_PC8:
189        value -= (uintptr_t)tcg_splitwx_to_rx(code_ptr);
190        if (value != (int8_t)value) {
191            return false;
192        }
193        tcg_patch8(code_ptr, value);
194        break;
195    default:
196        g_assert_not_reached();
197    }
198    return true;
199}
200
201/* test if a constant matches the constraint */
202static bool tcg_target_const_match(int64_t val, TCGType type, int ct)
203{
204    if (ct & TCG_CT_CONST) {
205        return 1;
206    }
207    if (type == TCG_TYPE_I32) {
208        if (ct & (TCG_CT_CONST_S32 | TCG_CT_CONST_U32 | TCG_CT_CONST_I32)) {
209            return 1;
210        }
211    } else {
212        if ((ct & TCG_CT_CONST_S32) && val == (int32_t)val) {
213            return 1;
214        }
215        if ((ct & TCG_CT_CONST_U32) && val == (uint32_t)val) {
216            return 1;
217        }
218        if ((ct & TCG_CT_CONST_I32) && ~val == (int32_t)~val) {
219            return 1;
220        }
221    }
222    if ((ct & TCG_CT_CONST_WSZ) && val == (type == TCG_TYPE_I32 ? 32 : 64)) {
223        return 1;
224    }
225    return 0;
226}
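/* Worked examples for the 64-bit case (illustrative values only):
     val = -1          satisfies TCG_CT_CONST_S32 (sign-extends from 32 bits)
                       but not TCG_CT_CONST_U32;
     val = 0xffffffff  satisfies TCG_CT_CONST_U32 but not TCG_CT_CONST_S32;
     val = 0x7fffffff  satisfies both.
   TCG_CT_CONST_WSZ accepts only the operation width itself (32 or 64); it is
   used below for clz/ctz, where lzcnt/tzcnt already produce exactly that
   value for a zero input.  */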
227
228# define LOWREGMASK(x)	((x) & 7)
229
230#define P_EXT		0x100		/* 0x0f opcode prefix */
231#define P_EXT38         0x200           /* 0x0f 0x38 opcode prefix */
232#define P_DATA16        0x400           /* 0x66 opcode prefix */
233#define P_VEXW          0x1000          /* Set VEX.W = 1 */
234#if TCG_TARGET_REG_BITS == 64
235# define P_REXW         P_VEXW          /* Set REX.W = 1; match VEXW */
236# define P_REXB_R       0x2000          /* REG field as byte register */
237# define P_REXB_RM      0x4000          /* R/M field as byte register */
238# define P_GS           0x8000          /* gs segment override */
239#else
240# define P_REXW		0
241# define P_REXB_R	0
242# define P_REXB_RM	0
243# define P_GS           0
244#endif
245#define P_EXT3A         0x10000         /* 0x0f 0x3a opcode prefix */
246#define P_SIMDF3        0x20000         /* 0xf3 opcode prefix */
247#define P_SIMDF2        0x40000         /* 0xf2 opcode prefix */
248#define P_VEXL          0x80000         /* Set VEX.L = 1 */
249#define P_EVEX          0x100000        /* Requires EVEX encoding */
250
251#define OPC_ARITH_EvIz	(0x81)
252#define OPC_ARITH_EvIb	(0x83)
253#define OPC_ARITH_GvEv	(0x03)		/* ... plus (ARITH_FOO << 3) */
254#define OPC_ANDN        (0xf2 | P_EXT38)
255#define OPC_ADD_GvEv	(OPC_ARITH_GvEv | (ARITH_ADD << 3))
256#define OPC_AND_GvEv    (OPC_ARITH_GvEv | (ARITH_AND << 3))
257#define OPC_BLENDPS     (0x0c | P_EXT3A | P_DATA16)
258#define OPC_BSF         (0xbc | P_EXT)
259#define OPC_BSR         (0xbd | P_EXT)
260#define OPC_BSWAP	(0xc8 | P_EXT)
261#define OPC_CALL_Jz	(0xe8)
262#define OPC_CMOVCC      (0x40 | P_EXT)  /* ... plus condition code */
263#define OPC_CMP_GvEv	(OPC_ARITH_GvEv | (ARITH_CMP << 3))
264#define OPC_DEC_r32	(0x48)
265#define OPC_IMUL_GvEv	(0xaf | P_EXT)
266#define OPC_IMUL_GvEvIb	(0x6b)
267#define OPC_IMUL_GvEvIz	(0x69)
268#define OPC_INC_r32	(0x40)
269#define OPC_JCC_long	(0x80 | P_EXT)	/* ... plus condition code */
270#define OPC_JCC_short	(0x70)		/* ... plus condition code */
271#define OPC_JMP_long	(0xe9)
272#define OPC_JMP_short	(0xeb)
273#define OPC_LEA         (0x8d)
274#define OPC_LZCNT       (0xbd | P_EXT | P_SIMDF3)
275#define OPC_MOVB_EvGv	(0x88)		/* stores, more or less */
276#define OPC_MOVL_EvGv	(0x89)		/* stores, more or less */
277#define OPC_MOVL_GvEv	(0x8b)		/* loads, more or less */
278#define OPC_MOVB_EvIz   (0xc6)
279#define OPC_MOVL_EvIz	(0xc7)
280#define OPC_MOVL_Iv     (0xb8)
281#define OPC_MOVBE_GyMy  (0xf0 | P_EXT38)
282#define OPC_MOVBE_MyGy  (0xf1 | P_EXT38)
283#define OPC_MOVD_VyEy   (0x6e | P_EXT | P_DATA16)
284#define OPC_MOVD_EyVy   (0x7e | P_EXT | P_DATA16)
285#define OPC_MOVDDUP     (0x12 | P_EXT | P_SIMDF2)
286#define OPC_MOVDQA_VxWx (0x6f | P_EXT | P_DATA16)
287#define OPC_MOVDQA_WxVx (0x7f | P_EXT | P_DATA16)
288#define OPC_MOVDQU_VxWx (0x6f | P_EXT | P_SIMDF3)
289#define OPC_MOVDQU_WxVx (0x7f | P_EXT | P_SIMDF3)
290#define OPC_MOVQ_VqWq   (0x7e | P_EXT | P_SIMDF3)
291#define OPC_MOVQ_WqVq   (0xd6 | P_EXT | P_DATA16)
292#define OPC_MOVSBL	(0xbe | P_EXT)
293#define OPC_MOVSWL	(0xbf | P_EXT)
294#define OPC_MOVSLQ	(0x63 | P_REXW)
295#define OPC_MOVZBL	(0xb6 | P_EXT)
296#define OPC_MOVZWL	(0xb7 | P_EXT)
297#define OPC_PABSB       (0x1c | P_EXT38 | P_DATA16)
298#define OPC_PABSW       (0x1d | P_EXT38 | P_DATA16)
299#define OPC_PABSD       (0x1e | P_EXT38 | P_DATA16)
300#define OPC_VPABSQ      (0x1f | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX)
301#define OPC_PACKSSDW    (0x6b | P_EXT | P_DATA16)
302#define OPC_PACKSSWB    (0x63 | P_EXT | P_DATA16)
303#define OPC_PACKUSDW    (0x2b | P_EXT38 | P_DATA16)
304#define OPC_PACKUSWB    (0x67 | P_EXT | P_DATA16)
305#define OPC_PADDB       (0xfc | P_EXT | P_DATA16)
306#define OPC_PADDW       (0xfd | P_EXT | P_DATA16)
307#define OPC_PADDD       (0xfe | P_EXT | P_DATA16)
308#define OPC_PADDQ       (0xd4 | P_EXT | P_DATA16)
309#define OPC_PADDSB      (0xec | P_EXT | P_DATA16)
310#define OPC_PADDSW      (0xed | P_EXT | P_DATA16)
311#define OPC_PADDUB      (0xdc | P_EXT | P_DATA16)
312#define OPC_PADDUW      (0xdd | P_EXT | P_DATA16)
313#define OPC_PAND        (0xdb | P_EXT | P_DATA16)
314#define OPC_PANDN       (0xdf | P_EXT | P_DATA16)
315#define OPC_PBLENDW     (0x0e | P_EXT3A | P_DATA16)
316#define OPC_PCMPEQB     (0x74 | P_EXT | P_DATA16)
317#define OPC_PCMPEQW     (0x75 | P_EXT | P_DATA16)
318#define OPC_PCMPEQD     (0x76 | P_EXT | P_DATA16)
319#define OPC_PCMPEQQ     (0x29 | P_EXT38 | P_DATA16)
320#define OPC_PCMPGTB     (0x64 | P_EXT | P_DATA16)
321#define OPC_PCMPGTW     (0x65 | P_EXT | P_DATA16)
322#define OPC_PCMPGTD     (0x66 | P_EXT | P_DATA16)
323#define OPC_PCMPGTQ     (0x37 | P_EXT38 | P_DATA16)
324#define OPC_PEXTRD      (0x16 | P_EXT3A | P_DATA16)
325#define OPC_PINSRD      (0x22 | P_EXT3A | P_DATA16)
326#define OPC_PMAXSB      (0x3c | P_EXT38 | P_DATA16)
327#define OPC_PMAXSW      (0xee | P_EXT | P_DATA16)
328#define OPC_PMAXSD      (0x3d | P_EXT38 | P_DATA16)
329#define OPC_VPMAXSQ     (0x3d | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX)
330#define OPC_PMAXUB      (0xde | P_EXT | P_DATA16)
331#define OPC_PMAXUW      (0x3e | P_EXT38 | P_DATA16)
332#define OPC_PMAXUD      (0x3f | P_EXT38 | P_DATA16)
333#define OPC_VPMAXUQ     (0x3f | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX)
334#define OPC_PMINSB      (0x38 | P_EXT38 | P_DATA16)
335#define OPC_PMINSW      (0xea | P_EXT | P_DATA16)
336#define OPC_PMINSD      (0x39 | P_EXT38 | P_DATA16)
337#define OPC_VPMINSQ     (0x39 | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX)
338#define OPC_PMINUB      (0xda | P_EXT | P_DATA16)
339#define OPC_PMINUW      (0x3a | P_EXT38 | P_DATA16)
340#define OPC_PMINUD      (0x3b | P_EXT38 | P_DATA16)
341#define OPC_VPMINUQ     (0x3b | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX)
342#define OPC_PMOVSXBW    (0x20 | P_EXT38 | P_DATA16)
343#define OPC_PMOVSXWD    (0x23 | P_EXT38 | P_DATA16)
344#define OPC_PMOVSXDQ    (0x25 | P_EXT38 | P_DATA16)
345#define OPC_PMOVZXBW    (0x30 | P_EXT38 | P_DATA16)
346#define OPC_PMOVZXWD    (0x33 | P_EXT38 | P_DATA16)
347#define OPC_PMOVZXDQ    (0x35 | P_EXT38 | P_DATA16)
348#define OPC_PMULLW      (0xd5 | P_EXT | P_DATA16)
349#define OPC_PMULLD      (0x40 | P_EXT38 | P_DATA16)
350#define OPC_VPMULLQ     (0x40 | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX)
351#define OPC_POR         (0xeb | P_EXT | P_DATA16)
352#define OPC_PSHUFB      (0x00 | P_EXT38 | P_DATA16)
353#define OPC_PSHUFD      (0x70 | P_EXT | P_DATA16)
354#define OPC_PSHUFLW     (0x70 | P_EXT | P_SIMDF2)
355#define OPC_PSHUFHW     (0x70 | P_EXT | P_SIMDF3)
356#define OPC_PSHIFTW_Ib  (0x71 | P_EXT | P_DATA16) /* /2 /6 /4 */
357#define OPC_PSHIFTD_Ib  (0x72 | P_EXT | P_DATA16) /* /1 /2 /6 /4 */
358#define OPC_PSHIFTQ_Ib  (0x73 | P_EXT | P_DATA16) /* /2 /6 /4 */
359#define OPC_PSLLW       (0xf1 | P_EXT | P_DATA16)
360#define OPC_PSLLD       (0xf2 | P_EXT | P_DATA16)
361#define OPC_PSLLQ       (0xf3 | P_EXT | P_DATA16)
362#define OPC_PSRAW       (0xe1 | P_EXT | P_DATA16)
363#define OPC_PSRAD       (0xe2 | P_EXT | P_DATA16)
364#define OPC_VPSRAQ      (0xe2 | P_EXT | P_DATA16 | P_VEXW | P_EVEX)
365#define OPC_PSRLW       (0xd1 | P_EXT | P_DATA16)
366#define OPC_PSRLD       (0xd2 | P_EXT | P_DATA16)
367#define OPC_PSRLQ       (0xd3 | P_EXT | P_DATA16)
368#define OPC_PSUBB       (0xf8 | P_EXT | P_DATA16)
369#define OPC_PSUBW       (0xf9 | P_EXT | P_DATA16)
370#define OPC_PSUBD       (0xfa | P_EXT | P_DATA16)
371#define OPC_PSUBQ       (0xfb | P_EXT | P_DATA16)
372#define OPC_PSUBSB      (0xe8 | P_EXT | P_DATA16)
373#define OPC_PSUBSW      (0xe9 | P_EXT | P_DATA16)
374#define OPC_PSUBUB      (0xd8 | P_EXT | P_DATA16)
375#define OPC_PSUBUW      (0xd9 | P_EXT | P_DATA16)
376#define OPC_PUNPCKLBW   (0x60 | P_EXT | P_DATA16)
377#define OPC_PUNPCKLWD   (0x61 | P_EXT | P_DATA16)
378#define OPC_PUNPCKLDQ   (0x62 | P_EXT | P_DATA16)
379#define OPC_PUNPCKLQDQ  (0x6c | P_EXT | P_DATA16)
380#define OPC_PUNPCKHBW   (0x68 | P_EXT | P_DATA16)
381#define OPC_PUNPCKHWD   (0x69 | P_EXT | P_DATA16)
382#define OPC_PUNPCKHDQ   (0x6a | P_EXT | P_DATA16)
383#define OPC_PUNPCKHQDQ  (0x6d | P_EXT | P_DATA16)
384#define OPC_PXOR        (0xef | P_EXT | P_DATA16)
385#define OPC_POP_r32	(0x58)
386#define OPC_POPCNT      (0xb8 | P_EXT | P_SIMDF3)
387#define OPC_PUSH_r32	(0x50)
388#define OPC_PUSH_Iv	(0x68)
389#define OPC_PUSH_Ib	(0x6a)
390#define OPC_RET		(0xc3)
391#define OPC_SETCC	(0x90 | P_EXT | P_REXB_RM) /* ... plus cc */
392#define OPC_SHIFT_1	(0xd1)
393#define OPC_SHIFT_Ib	(0xc1)
394#define OPC_SHIFT_cl	(0xd3)
395#define OPC_SARX        (0xf7 | P_EXT38 | P_SIMDF3)
396#define OPC_SHUFPS      (0xc6 | P_EXT)
397#define OPC_SHLX        (0xf7 | P_EXT38 | P_DATA16)
398#define OPC_SHRX        (0xf7 | P_EXT38 | P_SIMDF2)
399#define OPC_SHRD_Ib     (0xac | P_EXT)
400#define OPC_TESTL	(0x85)
401#define OPC_TZCNT       (0xbc | P_EXT | P_SIMDF3)
402#define OPC_UD2         (0x0b | P_EXT)
403#define OPC_VPBLENDD    (0x02 | P_EXT3A | P_DATA16)
404#define OPC_VPBLENDVB   (0x4c | P_EXT3A | P_DATA16)
405#define OPC_VPINSRB     (0x20 | P_EXT3A | P_DATA16)
406#define OPC_VPINSRW     (0xc4 | P_EXT | P_DATA16)
407#define OPC_VBROADCASTSS (0x18 | P_EXT38 | P_DATA16)
408#define OPC_VBROADCASTSD (0x19 | P_EXT38 | P_DATA16)
409#define OPC_VPBROADCASTB (0x78 | P_EXT38 | P_DATA16)
410#define OPC_VPBROADCASTW (0x79 | P_EXT38 | P_DATA16)
411#define OPC_VPBROADCASTD (0x58 | P_EXT38 | P_DATA16)
412#define OPC_VPBROADCASTQ (0x59 | P_EXT38 | P_DATA16)
413#define OPC_VPERMQ      (0x00 | P_EXT3A | P_DATA16 | P_VEXW)
414#define OPC_VPERM2I128  (0x46 | P_EXT3A | P_DATA16 | P_VEXL)
415#define OPC_VPROLVD     (0x15 | P_EXT38 | P_DATA16 | P_EVEX)
416#define OPC_VPROLVQ     (0x15 | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX)
417#define OPC_VPRORVD     (0x14 | P_EXT38 | P_DATA16 | P_EVEX)
418#define OPC_VPRORVQ     (0x14 | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX)
419#define OPC_VPSHLDW     (0x70 | P_EXT3A | P_DATA16 | P_VEXW | P_EVEX)
420#define OPC_VPSHLDD     (0x71 | P_EXT3A | P_DATA16 | P_EVEX)
421#define OPC_VPSHLDQ     (0x71 | P_EXT3A | P_DATA16 | P_VEXW | P_EVEX)
422#define OPC_VPSHLDVW    (0x70 | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX)
423#define OPC_VPSHLDVD    (0x71 | P_EXT38 | P_DATA16 | P_EVEX)
424#define OPC_VPSHLDVQ    (0x71 | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX)
425#define OPC_VPSHRDVW    (0x72 | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX)
426#define OPC_VPSHRDVD    (0x73 | P_EXT38 | P_DATA16 | P_EVEX)
427#define OPC_VPSHRDVQ    (0x73 | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX)
428#define OPC_VPSLLVW     (0x12 | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX)
429#define OPC_VPSLLVD     (0x47 | P_EXT38 | P_DATA16)
430#define OPC_VPSLLVQ     (0x47 | P_EXT38 | P_DATA16 | P_VEXW)
431#define OPC_VPSRAVW     (0x11 | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX)
432#define OPC_VPSRAVD     (0x46 | P_EXT38 | P_DATA16)
433#define OPC_VPSRAVQ     (0x46 | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX)
434#define OPC_VPSRLVW     (0x10 | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX)
435#define OPC_VPSRLVD     (0x45 | P_EXT38 | P_DATA16)
436#define OPC_VPSRLVQ     (0x45 | P_EXT38 | P_DATA16 | P_VEXW)
437#define OPC_VPTERNLOGQ  (0x25 | P_EXT3A | P_DATA16 | P_VEXW | P_EVEX)
438#define OPC_VZEROUPPER  (0x77 | P_EXT)
439#define OPC_XCHG_ax_r32	(0x90)
440#define OPC_XCHG_EvGv   (0x87)
441
442#define OPC_GRP3_Eb     (0xf6)
443#define OPC_GRP3_Ev     (0xf7)
444#define OPC_GRP5        (0xff)
445#define OPC_GRP14       (0x73 | P_EXT | P_DATA16)
446
447/* Group 1 opcode extensions for 0x80-0x83.
448   These are also used as modifiers for OPC_ARITH.  */
449#define ARITH_ADD 0
450#define ARITH_OR  1
451#define ARITH_ADC 2
452#define ARITH_SBB 3
453#define ARITH_AND 4
454#define ARITH_SUB 5
455#define ARITH_XOR 6
456#define ARITH_CMP 7
457
458/* Group 2 opcode extensions for 0xc0, 0xc1, 0xd0-0xd3.  */
459#define SHIFT_ROL 0
460#define SHIFT_ROR 1
461#define SHIFT_SHL 4
462#define SHIFT_SHR 5
463#define SHIFT_SAR 7
464
465/* Group 3 opcode extensions for 0xf6, 0xf7.  To be used with OPC_GRP3.  */
466#define EXT3_TESTi 0
467#define EXT3_NOT   2
468#define EXT3_NEG   3
469#define EXT3_MUL   4
470#define EXT3_IMUL  5
471#define EXT3_DIV   6
472#define EXT3_IDIV  7
473
474/* Group 5 opcode extensions for 0xff.  To be used with OPC_GRP5.  */
475#define EXT5_INC_Ev	0
476#define EXT5_DEC_Ev	1
477#define EXT5_CALLN_Ev	2
478#define EXT5_JMPN_Ev	4
479
480/* Condition codes to be added to OPC_JCC_{long,short}.  */
481#define JCC_JMP (-1)
482#define JCC_JO  0x0
483#define JCC_JNO 0x1
484#define JCC_JB  0x2
485#define JCC_JAE 0x3
486#define JCC_JE  0x4
487#define JCC_JNE 0x5
488#define JCC_JBE 0x6
489#define JCC_JA  0x7
490#define JCC_JS  0x8
491#define JCC_JNS 0x9
492#define JCC_JP  0xa
493#define JCC_JNP 0xb
494#define JCC_JL  0xc
495#define JCC_JGE 0xd
496#define JCC_JLE 0xe
497#define JCC_JG  0xf
498
499static const uint8_t tcg_cond_to_jcc[] = {
500    [TCG_COND_EQ] = JCC_JE,
501    [TCG_COND_NE] = JCC_JNE,
502    [TCG_COND_LT] = JCC_JL,
503    [TCG_COND_GE] = JCC_JGE,
504    [TCG_COND_LE] = JCC_JLE,
505    [TCG_COND_GT] = JCC_JG,
506    [TCG_COND_LTU] = JCC_JB,
507    [TCG_COND_GEU] = JCC_JAE,
508    [TCG_COND_LEU] = JCC_JBE,
509    [TCG_COND_GTU] = JCC_JA,
510};
511
512#if TCG_TARGET_REG_BITS == 64
513static void tcg_out_opc(TCGContext *s, int opc, int r, int rm, int x)
514{
515    int rex;
516
517    if (opc & P_GS) {
518        tcg_out8(s, 0x65);
519    }
520    if (opc & P_DATA16) {
521        /* We should never be asking for both 16 and 64-bit operation.  */
522        tcg_debug_assert((opc & P_REXW) == 0);
523        tcg_out8(s, 0x66);
524    }
525    if (opc & P_SIMDF3) {
526        tcg_out8(s, 0xf3);
527    } else if (opc & P_SIMDF2) {
528        tcg_out8(s, 0xf2);
529    }
530
531    rex = 0;
532    rex |= (opc & P_REXW) ? 0x8 : 0x0;  /* REX.W */
533    rex |= (r & 8) >> 1;                /* REX.R */
534    rex |= (x & 8) >> 2;                /* REX.X */
535    rex |= (rm & 8) >> 3;               /* REX.B */
536
537    /* P_REXB_{R,RM} indicates that the given register is the low byte.
538       For %[abcd]l we need no REX prefix, but for %{si,di,bp,sp}l we do,
539       as otherwise the encoding indicates %[abcd]h.  Note that the values
540       that are ORed in merely indicate that the REX byte must be present;
541       those bits get discarded in output.  */
542    rex |= opc & (r >= 4 ? P_REXB_R : 0);
543    rex |= opc & (rm >= 4 ? P_REXB_RM : 0);
544
545    if (rex) {
546        tcg_out8(s, (uint8_t)(rex | 0x40));
547    }
548
549    if (opc & (P_EXT | P_EXT38 | P_EXT3A)) {
550        tcg_out8(s, 0x0f);
551        if (opc & P_EXT38) {
552            tcg_out8(s, 0x38);
553        } else if (opc & P_EXT3A) {
554            tcg_out8(s, 0x3a);
555        }
556    }
557
558    tcg_out8(s, opc);
559}
560#else
561static void tcg_out_opc(TCGContext *s, int opc)
562{
563    if (opc & P_DATA16) {
564        tcg_out8(s, 0x66);
565    }
566    if (opc & P_SIMDF3) {
567        tcg_out8(s, 0xf3);
568    } else if (opc & P_SIMDF2) {
569        tcg_out8(s, 0xf2);
570    }
571    if (opc & (P_EXT | P_EXT38 | P_EXT3A)) {
572        tcg_out8(s, 0x0f);
573        if (opc & P_EXT38) {
574            tcg_out8(s, 0x38);
575        } else if (opc & P_EXT3A) {
576            tcg_out8(s, 0x3a);
577        }
578    }
579    tcg_out8(s, opc);
580}
581/* Discard the register arguments to tcg_out_opc early, so as not to penalize
582   the 32-bit compilation paths.  This method works with all versions of gcc,
583   whereas relying on optimization may not be able to exclude them.  */
584#define tcg_out_opc(s, opc, r, rm, x)  (tcg_out_opc)(s, opc)
585#endif
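/* Worked example (illustrative register choices): emitting a 64-bit
   register-to-register move via tcg_out_modrm(s, OPC_MOVL_GvEv + P_REXW,
   TCG_REG_R8, TCG_REG_RSI) produces

       0x4c    REX: W=1, R=1 (%r8 needs bit 3), X=0, B=0
       0x8b    MOVL_GvEv opcode
       0xc6    ModRM: mod=11, reg=000 (%r8 & 7), r/m=110 (%rsi)

   i.e. "movq %rsi, %r8".  The P_REXB_RM case works the same way: in
   tcg_out_ext8u with src = %esi (>= 4), a bare 0x40 REX byte is emitted
   so that r/m = 6 selects %sil rather than %dh.  */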
586
587static void tcg_out_modrm(TCGContext *s, int opc, int r, int rm)
588{
589    tcg_out_opc(s, opc, r, rm, 0);
590    tcg_out8(s, 0xc0 | (LOWREGMASK(r) << 3) | LOWREGMASK(rm));
591}
592
593static void tcg_out_vex_opc(TCGContext *s, int opc, int r, int v,
594                            int rm, int index)
595{
596    int tmp;
597
598    /* Use the two byte form if possible, which cannot encode
599       VEX.W, VEX.B, VEX.X, or an m-mmmm field other than P_EXT.  */
600    if ((opc & (P_EXT | P_EXT38 | P_EXT3A | P_VEXW)) == P_EXT
601        && ((rm | index) & 8) == 0) {
602        /* Two byte VEX prefix.  */
603        tcg_out8(s, 0xc5);
604
605        tmp = (r & 8 ? 0 : 0x80);              /* VEX.R */
606    } else {
607        /* Three byte VEX prefix.  */
608        tcg_out8(s, 0xc4);
609
610        /* VEX.m-mmmm */
611        if (opc & P_EXT3A) {
612            tmp = 3;
613        } else if (opc & P_EXT38) {
614            tmp = 2;
615        } else if (opc & P_EXT) {
616            tmp = 1;
617        } else {
618            g_assert_not_reached();
619        }
620        tmp |= (r & 8 ? 0 : 0x80);             /* VEX.R */
621        tmp |= (index & 8 ? 0 : 0x40);         /* VEX.X */
622        tmp |= (rm & 8 ? 0 : 0x20);            /* VEX.B */
623        tcg_out8(s, tmp);
624
625        tmp = (opc & P_VEXW ? 0x80 : 0);       /* VEX.W */
626    }
627
628    tmp |= (opc & P_VEXL ? 0x04 : 0);      /* VEX.L */
629    /* VEX.pp */
630    if (opc & P_DATA16) {
631        tmp |= 1;                          /* 0x66 */
632    } else if (opc & P_SIMDF3) {
633        tmp |= 2;                          /* 0xf3 */
634    } else if (opc & P_SIMDF2) {
635        tmp |= 3;                          /* 0xf2 */
636    }
637    tmp |= (~v & 15) << 3;                 /* VEX.vvvv */
638    tcg_out8(s, tmp);
639    tcg_out8(s, opc);
640}
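/* Worked example (illustrative): tcg_out_vex_modrm(s, OPC_PXOR,
   TCG_REG_XMM1, TCG_REG_XMM2, TCG_REG_XMM3) takes the two-byte path,
   since PXOR needs only the 0x0f escape and no register needs bit 3:

       0xc5    two-byte VEX escape
       0xe9    ~R=1, vvvv=1101 (~2), L=0, pp=01 (0x66)
       0xef    PXOR opcode
       0xcb    ModRM: mod=11, reg=001, r/m=011

   i.e. "vpxor %xmm3, %xmm2, %xmm1".  The three-byte 0xc4 form is needed
   whenever VEX.W must be set, rm or index uses bit 3, or the opcode lives
   in the 0f38/0f3a maps.  */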
641
642static void tcg_out_evex_opc(TCGContext *s, int opc, int r, int v,
643                             int rm, int index)
644{
645    /* The entire 4-byte evex prefix; with R' and V' set. */
646    uint32_t p = 0x08041062;
647    int mm, pp;
648
649    tcg_debug_assert(have_avx512vl);
650
651    /* EVEX.mm */
652    if (opc & P_EXT3A) {
653        mm = 3;
654    } else if (opc & P_EXT38) {
655        mm = 2;
656    } else if (opc & P_EXT) {
657        mm = 1;
658    } else {
659        g_assert_not_reached();
660    }
661
662    /* EVEX.pp */
663    if (opc & P_DATA16) {
664        pp = 1;                          /* 0x66 */
665    } else if (opc & P_SIMDF3) {
666        pp = 2;                          /* 0xf3 */
667    } else if (opc & P_SIMDF2) {
668        pp = 3;                          /* 0xf2 */
669    } else {
670        pp = 0;
671    }
672
673    p = deposit32(p, 8, 2, mm);
674    p = deposit32(p, 13, 1, (rm & 8) == 0);             /* EVEX.RXB.B */
675    p = deposit32(p, 14, 1, (index & 8) == 0);          /* EVEX.RXB.X */
676    p = deposit32(p, 15, 1, (r & 8) == 0);              /* EVEX.RXB.R */
677    p = deposit32(p, 16, 2, pp);
678    p = deposit32(p, 19, 4, ~v);
679    p = deposit32(p, 23, 1, (opc & P_VEXW) != 0);
680    p = deposit32(p, 29, 2, (opc & P_VEXL) != 0);
681
682    tcg_out32(s, p);
683    tcg_out8(s, opc);
684}
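/* The constant 0x08041062 above is the 4-byte EVEX prefix in little-endian
   order: 0x62 (escape), P0 = 0x10 (R' set), P1 = 0x04 (the mandatory 1 bit),
   P2 = 0x08 (V' set); the deposit32() calls fill in the rest.  Worked
   example (illustrative): tcg_out_vex_modrm(s, OPC_VPSRAQ, TCG_REG_XMM1,
   TCG_REG_XMM2, TCG_REG_XMM3) emits

       0x62 0xf1 0xed 0x08    EVEX: mm=01, pp=01, vvvv=~2, W=1
       0xe2                   PSRAQ opcode
       0xcb                   ModRM: mod=11, reg=001, r/m=011

   i.e. "vpsraq %xmm3, %xmm2, %xmm1".  */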
685
686static void tcg_out_vex_modrm(TCGContext *s, int opc, int r, int v, int rm)
687{
688    if (opc & P_EVEX) {
689        tcg_out_evex_opc(s, opc, r, v, rm, 0);
690    } else {
691        tcg_out_vex_opc(s, opc, r, v, rm, 0);
692    }
693    tcg_out8(s, 0xc0 | (LOWREGMASK(r) << 3) | LOWREGMASK(rm));
694}
695
696/* Output an opcode with a full "rm + (index<<shift) + offset" address mode.
697   We handle either RM or INDEX missing with a negative value.  In 64-bit
698   mode for absolute addresses, ~RM is the size of the immediate operand
699   that will follow the instruction.  */
700
701static void tcg_out_sib_offset(TCGContext *s, int r, int rm, int index,
702                               int shift, intptr_t offset)
703{
704    int mod, len;
705
706    if (index < 0 && rm < 0) {
707        if (TCG_TARGET_REG_BITS == 64) {
708            /* Try for a rip-relative addressing mode.  This has replaced
709               the 32-bit-mode absolute addressing encoding.  */
710            intptr_t pc = (intptr_t)s->code_ptr + 5 + ~rm;
711            intptr_t disp = offset - pc;
712            if (disp == (int32_t)disp) {
713                tcg_out8(s, (LOWREGMASK(r) << 3) | 5);
714                tcg_out32(s, disp);
715                return;
716            }
717
718            /* Try for an absolute address encoding.  This requires the
719               use of the MODRM+SIB encoding and is therefore larger than
720               rip-relative addressing.  */
721            if (offset == (int32_t)offset) {
722                tcg_out8(s, (LOWREGMASK(r) << 3) | 4);
723                tcg_out8(s, (4 << 3) | 5);
724                tcg_out32(s, offset);
725                return;
726            }
727
728            /* ??? The memory isn't directly addressable.  */
729            g_assert_not_reached();
730        } else {
731            /* Absolute address.  */
732            tcg_out8(s, (r << 3) | 5);
733            tcg_out32(s, offset);
734            return;
735        }
736    }
737
738    /* Find the length of the immediate addend.  Note that the encoding
739       that would be used for (%ebp) indicates absolute addressing.  */
740    if (rm < 0) {
741        mod = 0, len = 4, rm = 5;
742    } else if (offset == 0 && LOWREGMASK(rm) != TCG_REG_EBP) {
743        mod = 0, len = 0;
744    } else if (offset == (int8_t)offset) {
745        mod = 0x40, len = 1;
746    } else {
747        mod = 0x80, len = 4;
748    }
749
750    /* Use a single byte MODRM format if possible.  Note that the encoding
751       that would be used for %esp is the escape to the two byte form.  */
752    if (index < 0 && LOWREGMASK(rm) != TCG_REG_ESP) {
753        /* Single byte MODRM format.  */
754        tcg_out8(s, mod | (LOWREGMASK(r) << 3) | LOWREGMASK(rm));
755    } else {
756        /* Two byte MODRM+SIB format.  */
757
758        /* Note that the encoding that would place %esp into the index
759           field indicates no index register.  In 64-bit mode, the REX.X
760           bit counts, so %r12 can be used as the index.  */
761        if (index < 0) {
762            index = 4;
763        } else {
764            tcg_debug_assert(index != TCG_REG_ESP);
765        }
766
767        tcg_out8(s, mod | (LOWREGMASK(r) << 3) | 4);
768        tcg_out8(s, (shift << 6) | (LOWREGMASK(index) << 3) | LOWREGMASK(rm));
769    }
770
771    if (len == 1) {
772        tcg_out8(s, offset);
773    } else if (len == 4) {
774        tcg_out32(s, offset);
775    }
776}
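/* Worked examples of the resulting addressing modes (illustrative):

       tcg_out_modrm_offset(s, OPC_MOVL_GvEv + P_REXW, TCG_REG_RAX,
                            TCG_REG_RBP, 16)
           -> 48 8b 45 10     movq 16(%rbp), %rax    (mod=01, disp8)

       tcg_out_modrm_offset(s, OPC_MOVL_GvEv + P_REXW, TCG_REG_RAX,
                            TCG_REG_RSP, 0)
           -> 48 8b 04 24     movq (%rsp), %rax      (SIB escape for %esp)

   Note that (%rbp) with a zero displacement still gets a disp8 of zero,
   because mod=00 with r/m=101 is the rip-relative / absolute encoding.  */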
777
778static void tcg_out_modrm_sib_offset(TCGContext *s, int opc, int r, int rm,
779                                     int index, int shift, intptr_t offset)
780{
781    tcg_out_opc(s, opc, r, rm < 0 ? 0 : rm, index < 0 ? 0 : index);
782    tcg_out_sib_offset(s, r, rm, index, shift, offset);
783}
784
785static void tcg_out_vex_modrm_sib_offset(TCGContext *s, int opc, int r, int v,
786                                         int rm, int index, int shift,
787                                         intptr_t offset)
788{
789    tcg_out_vex_opc(s, opc, r, v, rm < 0 ? 0 : rm, index < 0 ? 0 : index);
790    tcg_out_sib_offset(s, r, rm, index, shift, offset);
791}
792
793/* A simplification of the above with no index or shift.  */
794static inline void tcg_out_modrm_offset(TCGContext *s, int opc, int r,
795                                        int rm, intptr_t offset)
796{
797    tcg_out_modrm_sib_offset(s, opc, r, rm, -1, 0, offset);
798}
799
800static inline void tcg_out_vex_modrm_offset(TCGContext *s, int opc, int r,
801                                            int v, int rm, intptr_t offset)
802{
803    tcg_out_vex_modrm_sib_offset(s, opc, r, v, rm, -1, 0, offset);
804}
805
806/* Output an opcode with an expected reference to the constant pool.  */
807static inline void tcg_out_modrm_pool(TCGContext *s, int opc, int r)
808{
809    tcg_out_opc(s, opc, r, 0, 0);
810    /* Absolute for 32-bit, pc-relative for 64-bit.  */
811    tcg_out8(s, LOWREGMASK(r) << 3 | 5);
812    tcg_out32(s, 0);
813}
814
815/* Output an opcode with an expected reference to the constant pool.  */
816static inline void tcg_out_vex_modrm_pool(TCGContext *s, int opc, int r)
817{
818    tcg_out_vex_opc(s, opc, r, 0, 0, 0);
819    /* Absolute for 32-bit, pc-relative for 64-bit.  */
820    tcg_out8(s, LOWREGMASK(r) << 3 | 5);
821    tcg_out32(s, 0);
822}
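/* A minimal sketch of how the two pool helpers above are used together
   with the constant pool (see tcg_out_dupi_vec below):

       tcg_out_vex_modrm_pool(s, OPC_MOVQ_VqWq, ret);
       new_pool_label(s, arg, R_386_PC32, s->code_ptr - 4, -4);

   The helper leaves a 4-byte zero where the displacement belongs; once the
   pool is laid out, patch_reloc() rewrites it with the pc-relative
   displacement (R_386_PC32, 64-bit) or the absolute address of the pool
   entry (R_386_32, 32-bit).  */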
823
824/* Generate dest op= src.  Uses the same ARITH_* codes as tgen_arithi.  */
825static inline void tgen_arithr(TCGContext *s, int subop, int dest, int src)
826{
827    /* Propagate an opcode prefix, such as P_REXW.  */
828    int ext = subop & ~0x7;
829    subop &= 0x7;
830
831    tcg_out_modrm(s, OPC_ARITH_GvEv + (subop << 3) + ext, dest, src);
832}
833
834static bool tcg_out_mov(TCGContext *s, TCGType type, TCGReg ret, TCGReg arg)
835{
836    int rexw = 0;
837
838    if (arg == ret) {
839        return true;
840    }
841    switch (type) {
842    case TCG_TYPE_I64:
843        rexw = P_REXW;
844        /* fallthru */
845    case TCG_TYPE_I32:
846        if (ret < 16) {
847            if (arg < 16) {
848                tcg_out_modrm(s, OPC_MOVL_GvEv + rexw, ret, arg);
849            } else {
850                tcg_out_vex_modrm(s, OPC_MOVD_EyVy + rexw, arg, 0, ret);
851            }
852        } else {
853            if (arg < 16) {
854                tcg_out_vex_modrm(s, OPC_MOVD_VyEy + rexw, ret, 0, arg);
855            } else {
856                tcg_out_vex_modrm(s, OPC_MOVQ_VqWq, ret, 0, arg);
857            }
858        }
859        break;
860
861    case TCG_TYPE_V64:
862        tcg_debug_assert(ret >= 16 && arg >= 16);
863        tcg_out_vex_modrm(s, OPC_MOVQ_VqWq, ret, 0, arg);
864        break;
865    case TCG_TYPE_V128:
866        tcg_debug_assert(ret >= 16 && arg >= 16);
867        tcg_out_vex_modrm(s, OPC_MOVDQA_VxWx, ret, 0, arg);
868        break;
869    case TCG_TYPE_V256:
870        tcg_debug_assert(ret >= 16 && arg >= 16);
871        tcg_out_vex_modrm(s, OPC_MOVDQA_VxWx | P_VEXL, ret, 0, arg);
872        break;
873
874    default:
875        g_assert_not_reached();
876    }
877    return true;
878}
879
880static const int avx2_dup_insn[4] = {
881    OPC_VPBROADCASTB, OPC_VPBROADCASTW,
882    OPC_VPBROADCASTD, OPC_VPBROADCASTQ,
883};
884
885static bool tcg_out_dup_vec(TCGContext *s, TCGType type, unsigned vece,
886                            TCGReg r, TCGReg a)
887{
888    if (have_avx2) {
889        int vex_l = (type == TCG_TYPE_V256 ? P_VEXL : 0);
890        tcg_out_vex_modrm(s, avx2_dup_insn[vece] + vex_l, r, 0, a);
891    } else {
892        switch (vece) {
893        case MO_8:
894            /* ??? With zero in a register, use PSHUFB.  */
895            tcg_out_vex_modrm(s, OPC_PUNPCKLBW, r, a, a);
896            a = r;
897            /* FALLTHRU */
898        case MO_16:
899            tcg_out_vex_modrm(s, OPC_PUNPCKLWD, r, a, a);
900            a = r;
901            /* FALLTHRU */
902        case MO_32:
903            tcg_out_vex_modrm(s, OPC_PSHUFD, r, 0, a);
904            /* imm8 operand: all output lanes selected from input lane 0.  */
905            tcg_out8(s, 0);
906            break;
907        case MO_64:
908            tcg_out_vex_modrm(s, OPC_PUNPCKLQDQ, r, a, a);
909            break;
910        default:
911            g_assert_not_reached();
912        }
913    }
914    return true;
915}
916
917static bool tcg_out_dupm_vec(TCGContext *s, TCGType type, unsigned vece,
918                             TCGReg r, TCGReg base, intptr_t offset)
919{
920    if (have_avx2) {
921        int vex_l = (type == TCG_TYPE_V256 ? P_VEXL : 0);
922        tcg_out_vex_modrm_offset(s, avx2_dup_insn[vece] + vex_l,
923                                 r, 0, base, offset);
924    } else {
925        switch (vece) {
926        case MO_64:
927            tcg_out_vex_modrm_offset(s, OPC_MOVDDUP, r, 0, base, offset);
928            break;
929        case MO_32:
930            tcg_out_vex_modrm_offset(s, OPC_VBROADCASTSS, r, 0, base, offset);
931            break;
932        case MO_16:
933            tcg_out_vex_modrm_offset(s, OPC_VPINSRW, r, r, base, offset);
934            tcg_out8(s, 0); /* imm8 */
935            tcg_out_dup_vec(s, type, vece, r, r);
936            break;
937        case MO_8:
938            tcg_out_vex_modrm_offset(s, OPC_VPINSRB, r, r, base, offset);
939            tcg_out8(s, 0); /* imm8 */
940            tcg_out_dup_vec(s, type, vece, r, r);
941            break;
942        default:
943            g_assert_not_reached();
944        }
945    }
946    return true;
947}
948
949static void tcg_out_dupi_vec(TCGContext *s, TCGType type, unsigned vece,
950                             TCGReg ret, int64_t arg)
951{
952    int vex_l = (type == TCG_TYPE_V256 ? P_VEXL : 0);
953
954    if (arg == 0) {
955        tcg_out_vex_modrm(s, OPC_PXOR, ret, ret, ret);
956        return;
957    }
958    if (arg == -1) {
959        tcg_out_vex_modrm(s, OPC_PCMPEQB + vex_l, ret, ret, ret);
960        return;
961    }
962
963    if (TCG_TARGET_REG_BITS == 32 && vece < MO_64) {
964        if (have_avx2) {
965            tcg_out_vex_modrm_pool(s, OPC_VPBROADCASTD + vex_l, ret);
966        } else {
967            tcg_out_vex_modrm_pool(s, OPC_VBROADCASTSS, ret);
968        }
969        new_pool_label(s, arg, R_386_32, s->code_ptr - 4, 0);
970    } else {
971        if (type == TCG_TYPE_V64) {
972            tcg_out_vex_modrm_pool(s, OPC_MOVQ_VqWq, ret);
973        } else if (have_avx2) {
974            tcg_out_vex_modrm_pool(s, OPC_VPBROADCASTQ + vex_l, ret);
975        } else {
976            tcg_out_vex_modrm_pool(s, OPC_MOVDDUP, ret);
977        }
978        if (TCG_TARGET_REG_BITS == 64) {
979            new_pool_label(s, arg, R_386_PC32, s->code_ptr - 4, -4);
980        } else {
981            new_pool_l2(s, R_386_32, s->code_ptr - 4, 0, arg, arg >> 32);
982        }
983    }
984}
985
986static void tcg_out_movi_vec(TCGContext *s, TCGType type,
987                             TCGReg ret, tcg_target_long arg)
988{
989    if (arg == 0) {
990        tcg_out_vex_modrm(s, OPC_PXOR, ret, ret, ret);
991        return;
992    }
993    if (arg == -1) {
994        tcg_out_vex_modrm(s, OPC_PCMPEQB, ret, ret, ret);
995        return;
996    }
997
998    int rexw = (type == TCG_TYPE_I32 ? 0 : P_REXW);
999    tcg_out_vex_modrm_pool(s, OPC_MOVD_VyEy + rexw, ret);
1000    if (TCG_TARGET_REG_BITS == 64) {
1001        new_pool_label(s, arg, R_386_PC32, s->code_ptr - 4, -4);
1002    } else {
1003        new_pool_label(s, arg, R_386_32, s->code_ptr - 4, 0);
1004    }
1005}
1006
1007static void tcg_out_movi_int(TCGContext *s, TCGType type,
1008                             TCGReg ret, tcg_target_long arg)
1009{
1010    tcg_target_long diff;
1011
1012    if (arg == 0) {
1013        tgen_arithr(s, ARITH_XOR, ret, ret);
1014        return;
1015    }
1016    if (arg == (uint32_t)arg || type == TCG_TYPE_I32) {
1017        tcg_out_opc(s, OPC_MOVL_Iv + LOWREGMASK(ret), 0, ret, 0);
1018        tcg_out32(s, arg);
1019        return;
1020    }
1021    if (arg == (int32_t)arg) {
1022        tcg_out_modrm(s, OPC_MOVL_EvIz + P_REXW, 0, ret);
1023        tcg_out32(s, arg);
1024        return;
1025    }
1026
1027    /* Try a 7 byte pc-relative lea before the 10 byte movq.  */
1028    diff = tcg_pcrel_diff(s, (const void *)arg) - 7;
1029    if (diff == (int32_t)diff) {
1030        tcg_out_opc(s, OPC_LEA | P_REXW, ret, 0, 0);
1031        tcg_out8(s, (LOWREGMASK(ret) << 3) | 5);
1032        tcg_out32(s, diff);
1033        return;
1034    }
1035
1036    tcg_out_opc(s, OPC_MOVL_Iv + P_REXW + LOWREGMASK(ret), 0, ret, 0);
1037    tcg_out64(s, arg);
1038}
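/* The cases above are ordered by encoding size; illustrative byte
   sequences for ret = %rax:

       arg = 0                 -> 33 c0                  xorl %eax, %eax  (2)
       arg = 0x12345678        -> b8 78 56 34 12         movl, zero-extends (5)
       arg = -1                -> 48 c7 c0 ff ff ff ff   movq, sign-extends (7)
       arg within 2GB of code  -> 48 8d 05 xx xx xx xx   leaq off(%rip)   (7)
       anything else           -> 48 b8 + imm64          movabsq          (10)  */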
1039
1040static void tcg_out_movi(TCGContext *s, TCGType type,
1041                         TCGReg ret, tcg_target_long arg)
1042{
1043    switch (type) {
1044    case TCG_TYPE_I32:
1045#if TCG_TARGET_REG_BITS == 64
1046    case TCG_TYPE_I64:
1047#endif
1048        if (ret < 16) {
1049            tcg_out_movi_int(s, type, ret, arg);
1050        } else {
1051            tcg_out_movi_vec(s, type, ret, arg);
1052        }
1053        break;
1054    default:
1055        g_assert_not_reached();
1056    }
1057}
1058
1059static bool tcg_out_xchg(TCGContext *s, TCGType type, TCGReg r1, TCGReg r2)
1060{
1061    int rexw = type == TCG_TYPE_I32 ? 0 : P_REXW;
1062    tcg_out_modrm(s, OPC_XCHG_EvGv + rexw, r1, r2);
1063    return true;
1064}
1065
1066static void tcg_out_addi_ptr(TCGContext *s, TCGReg rd, TCGReg rs,
1067                             tcg_target_long imm)
1068{
1069    /* This function is only used for passing structs by reference. */
1070    tcg_debug_assert(imm == (int32_t)imm);
1071    tcg_out_modrm_offset(s, OPC_LEA | P_REXW, rd, rs, imm);
1072}
1073
1074static inline void tcg_out_pushi(TCGContext *s, tcg_target_long val)
1075{
1076    if (val == (int8_t)val) {
1077        tcg_out_opc(s, OPC_PUSH_Ib, 0, 0, 0);
1078        tcg_out8(s, val);
1079    } else if (val == (int32_t)val) {
1080        tcg_out_opc(s, OPC_PUSH_Iv, 0, 0, 0);
1081        tcg_out32(s, val);
1082    } else {
1083        g_assert_not_reached();
1084    }
1085}
1086
1087static inline void tcg_out_mb(TCGContext *s, TCGArg a0)
1088{
1089    /* Given the strength of x86 memory ordering, we need only care about
1090       store-load ordering.  Experimentally, "lock orl $0,0(%esp)" is
1091       faster than "mfence", so don't bother with the sse insn.  */
1092    if (a0 & TCG_MO_ST_LD) {
1093        tcg_out8(s, 0xf0);
1094        tcg_out_modrm_offset(s, OPC_ARITH_EvIb, ARITH_OR, TCG_REG_ESP, 0);
1095        tcg_out8(s, 0);
1096    }
1097}
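/* For reference, the barrier emitted above is

       f0 83 0c 24 00        lock orl $0, (%esp)

   which, as the comment notes, is experimentally faster than the
   equivalent mfence.  */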
1098
1099static inline void tcg_out_push(TCGContext *s, int reg)
1100{
1101    tcg_out_opc(s, OPC_PUSH_r32 + LOWREGMASK(reg), 0, reg, 0);
1102}
1103
1104static inline void tcg_out_pop(TCGContext *s, int reg)
1105{
1106    tcg_out_opc(s, OPC_POP_r32 + LOWREGMASK(reg), 0, reg, 0);
1107}
1108
1109static void tcg_out_ld(TCGContext *s, TCGType type, TCGReg ret,
1110                       TCGReg arg1, intptr_t arg2)
1111{
1112    switch (type) {
1113    case TCG_TYPE_I32:
1114        if (ret < 16) {
1115            tcg_out_modrm_offset(s, OPC_MOVL_GvEv, ret, arg1, arg2);
1116        } else {
1117            tcg_out_vex_modrm_offset(s, OPC_MOVD_VyEy, ret, 0, arg1, arg2);
1118        }
1119        break;
1120    case TCG_TYPE_I64:
1121        if (ret < 16) {
1122            tcg_out_modrm_offset(s, OPC_MOVL_GvEv | P_REXW, ret, arg1, arg2);
1123            break;
1124        }
1125        /* FALLTHRU */
1126    case TCG_TYPE_V64:
1127        /* There is no instruction that can validate 8-byte alignment.  */
1128        tcg_debug_assert(ret >= 16);
1129        tcg_out_vex_modrm_offset(s, OPC_MOVQ_VqWq, ret, 0, arg1, arg2);
1130        break;
1131    case TCG_TYPE_V128:
1132        /*
1133         * The gvec infrastructure asserts that v128 vector loads
1134         * and stores use a 16-byte aligned offset.  Validate that the
1135         * final pointer is aligned by using an insn that will SIGSEGV.
1136         */
1137        tcg_debug_assert(ret >= 16);
1138        tcg_out_vex_modrm_offset(s, OPC_MOVDQA_VxWx, ret, 0, arg1, arg2);
1139        break;
1140    case TCG_TYPE_V256:
1141        /*
1142         * The gvec infrastructure only requires 16-byte alignment,
1143         * so here we must use an unaligned load.
1144         */
1145        tcg_debug_assert(ret >= 16);
1146        tcg_out_vex_modrm_offset(s, OPC_MOVDQU_VxWx | P_VEXL,
1147                                 ret, 0, arg1, arg2);
1148        break;
1149    default:
1150        g_assert_not_reached();
1151    }
1152}
1153
1154static void tcg_out_st(TCGContext *s, TCGType type, TCGReg arg,
1155                       TCGReg arg1, intptr_t arg2)
1156{
1157    switch (type) {
1158    case TCG_TYPE_I32:
1159        if (arg < 16) {
1160            tcg_out_modrm_offset(s, OPC_MOVL_EvGv, arg, arg1, arg2);
1161        } else {
1162            tcg_out_vex_modrm_offset(s, OPC_MOVD_EyVy, arg, 0, arg1, arg2);
1163        }
1164        break;
1165    case TCG_TYPE_I64:
1166        if (arg < 16) {
1167            tcg_out_modrm_offset(s, OPC_MOVL_EvGv | P_REXW, arg, arg1, arg2);
1168            break;
1169        }
1170        /* FALLTHRU */
1171    case TCG_TYPE_V64:
1172        /* There is no instruction that can validate 8-byte alignment.  */
1173        tcg_debug_assert(arg >= 16);
1174        tcg_out_vex_modrm_offset(s, OPC_MOVQ_WqVq, arg, 0, arg1, arg2);
1175        break;
1176    case TCG_TYPE_V128:
1177        /*
1178         * The gvec infrastructure asserts that v128 vector loads
1179         * and stores use a 16-byte aligned offset.  Validate that the
1180         * final pointer is aligned by using an insn that will SIGSEGV.
1181         *
1182         * This specific instance is also used by TCG_CALL_RET_BY_VEC,
1183         * for _WIN64, which must have SSE2 but may not have AVX.
1184         */
1185        tcg_debug_assert(arg >= 16);
1186        if (have_avx1) {
1187            tcg_out_vex_modrm_offset(s, OPC_MOVDQA_WxVx, arg, 0, arg1, arg2);
1188        } else {
1189            tcg_out_modrm_offset(s, OPC_MOVDQA_WxVx, arg, arg1, arg2);
1190        }
1191        break;
1192    case TCG_TYPE_V256:
1193        /*
1194         * The gvec infrastructure only requires 16-byte alignment,
1195         * so here we must use an unaligned store.
1196         */
1197        tcg_debug_assert(arg >= 16);
1198        tcg_out_vex_modrm_offset(s, OPC_MOVDQU_WxVx | P_VEXL,
1199                                 arg, 0, arg1, arg2);
1200        break;
1201    default:
1202        g_assert_not_reached();
1203    }
1204}
1205
1206static bool tcg_out_sti(TCGContext *s, TCGType type, TCGArg val,
1207                        TCGReg base, intptr_t ofs)
1208{
1209    int rexw = 0;
1210    if (TCG_TARGET_REG_BITS == 64 && type == TCG_TYPE_I64) {
1211        if (val != (int32_t)val) {
1212            return false;
1213        }
1214        rexw = P_REXW;
1215    } else if (type != TCG_TYPE_I32) {
1216        return false;
1217    }
1218    tcg_out_modrm_offset(s, OPC_MOVL_EvIz | rexw, 0, base, ofs);
1219    tcg_out32(s, val);
1220    return true;
1221}
1222
1223static void tcg_out_shifti(TCGContext *s, int subopc, int reg, int count)
1224{
1225    /* Propagate an opcode prefix, such as P_DATA16.  */
1226    int ext = subopc & ~0x7;
1227    subopc &= 0x7;
1228
1229    if (count == 1) {
1230        tcg_out_modrm(s, OPC_SHIFT_1 + ext, subopc, reg);
1231    } else {
1232        tcg_out_modrm(s, OPC_SHIFT_Ib + ext, subopc, reg);
1233        tcg_out8(s, count);
1234    }
1235}
1236
1237static inline void tcg_out_bswap32(TCGContext *s, int reg)
1238{
1239    tcg_out_opc(s, OPC_BSWAP + LOWREGMASK(reg), 0, reg, 0);
1240}
1241
1242static inline void tcg_out_rolw_8(TCGContext *s, int reg)
1243{
1244    tcg_out_shifti(s, SHIFT_ROL + P_DATA16, reg, 8);
1245}
1246
1247static void tcg_out_ext8u(TCGContext *s, TCGReg dest, TCGReg src)
1248{
1249    /* movzbl */
1250    tcg_debug_assert(src < 4 || TCG_TARGET_REG_BITS == 64);
1251    tcg_out_modrm(s, OPC_MOVZBL + P_REXB_RM, dest, src);
1252}
1253
1254static void tcg_out_ext8s(TCGContext *s, TCGType type, TCGReg dest, TCGReg src)
1255{
1256    int rexw = type == TCG_TYPE_I32 ? 0 : P_REXW;
1257    /* movsbl */
1258    tcg_debug_assert(src < 4 || TCG_TARGET_REG_BITS == 64);
1259    tcg_out_modrm(s, OPC_MOVSBL + P_REXB_RM + rexw, dest, src);
1260}
1261
1262static void tcg_out_ext16u(TCGContext *s, TCGReg dest, TCGReg src)
1263{
1264    /* movzwl */
1265    tcg_out_modrm(s, OPC_MOVZWL, dest, src);
1266}
1267
1268static void tcg_out_ext16s(TCGContext *s, TCGType type, TCGReg dest, TCGReg src)
1269{
1270    int rexw = type == TCG_TYPE_I32 ? 0 : P_REXW;
1271    /* movsw[lq] */
1272    tcg_out_modrm(s, OPC_MOVSWL + rexw, dest, src);
1273}
1274
1275static void tcg_out_ext32u(TCGContext *s, TCGReg dest, TCGReg src)
1276{
1277    /* 32-bit mov zero extends.  */
1278    tcg_out_modrm(s, OPC_MOVL_GvEv, dest, src);
1279}
1280
1281static void tcg_out_ext32s(TCGContext *s, TCGReg dest, TCGReg src)
1282{
1283    tcg_debug_assert(TCG_TARGET_REG_BITS == 64);
1284    tcg_out_modrm(s, OPC_MOVSLQ, dest, src);
1285}
1286
1287static void tcg_out_exts_i32_i64(TCGContext *s, TCGReg dest, TCGReg src)
1288{
1289    tcg_out_ext32s(s, dest, src);
1290}
1291
1292static void tcg_out_extu_i32_i64(TCGContext *s, TCGReg dest, TCGReg src)
1293{
1294    if (dest != src) {
1295        tcg_out_ext32u(s, dest, src);
1296    }
1297}
1298
1299static void tcg_out_extrl_i64_i32(TCGContext *s, TCGReg dest, TCGReg src)
1300{
1301    tcg_out_ext32u(s, dest, src);
1302}
1303
1304static inline void tcg_out_bswap64(TCGContext *s, int reg)
1305{
1306    tcg_out_opc(s, OPC_BSWAP + P_REXW + LOWREGMASK(reg), 0, reg, 0);
1307}
1308
1309static void tgen_arithi(TCGContext *s, int c, int r0,
1310                        tcg_target_long val, int cf)
1311{
1312    int rexw = 0;
1313
1314    if (TCG_TARGET_REG_BITS == 64) {
1315        rexw = c & -8;
1316        c &= 7;
1317    }
1318
1319    /* ??? While INC is 2 bytes shorter than ADDL $1, it also induces
1320       partial-flags-update stalls on Pentium4 and is not recommended
1321       by current Intel optimization manuals.  */
1322    if (!cf && (c == ARITH_ADD || c == ARITH_SUB) && (val == 1 || val == -1)) {
1323        int is_inc = (c == ARITH_ADD) ^ (val < 0);
1324        if (TCG_TARGET_REG_BITS == 64) {
1325            /* The single-byte increment encodings are re-tasked as the
1326               REX prefixes.  Use the MODRM encoding.  */
1327            tcg_out_modrm(s, OPC_GRP5 + rexw,
1328                          (is_inc ? EXT5_INC_Ev : EXT5_DEC_Ev), r0);
1329        } else {
1330            tcg_out8(s, (is_inc ? OPC_INC_r32 : OPC_DEC_r32) + r0);
1331        }
1332        return;
1333    }
1334
1335    if (c == ARITH_AND) {
1336        if (TCG_TARGET_REG_BITS == 64) {
1337            if (val == 0xffffffffu) {
1338                tcg_out_ext32u(s, r0, r0);
1339                return;
1340            }
1341            if (val == (uint32_t)val) {
1342                /* AND with no high bits set can use a 32-bit operation.  */
1343                rexw = 0;
1344            }
1345        }
1346        if (val == 0xffu && (r0 < 4 || TCG_TARGET_REG_BITS == 64)) {
1347            tcg_out_ext8u(s, r0, r0);
1348            return;
1349        }
1350        if (val == 0xffffu) {
1351            tcg_out_ext16u(s, r0, r0);
1352            return;
1353        }
1354    }
1355
1356    if (val == (int8_t)val) {
1357        tcg_out_modrm(s, OPC_ARITH_EvIb + rexw, c, r0);
1358        tcg_out8(s, val);
1359        return;
1360    }
1361    if (rexw == 0 || val == (int32_t)val) {
1362        tcg_out_modrm(s, OPC_ARITH_EvIz + rexw, c, r0);
1363        tcg_out32(s, val);
1364        return;
1365    }
1366
1367    g_assert_not_reached();
1368}
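/* Worked examples (illustrative, 64-bit):

       tgen_arithi(s, ARITH_ADD + P_REXW, TCG_REG_RSP, 8, 0)
           -> 48 83 c4 08    addq $8, %rsp       (sign-extended imm8 form)
       tgen_arithi(s, ARITH_AND + P_REXW, TCG_REG_RAX, 0xff, 0)
           -> 0f b6 c0       movzbl %al, %eax    (special-cased zero-extend)
       tgen_arithi(s, ARITH_ADD + P_REXW, TCG_REG_RAX, 1, 0)
           -> 48 ff c0       incq %rax           (GRP5 form, cf == 0)  */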
1369
1370static void tcg_out_addi(TCGContext *s, int reg, tcg_target_long val)
1371{
1372    if (val != 0) {
1373        tgen_arithi(s, ARITH_ADD + P_REXW, reg, val, 0);
1374    }
1375}
1376
1377/* Set SMALL to force a short forward branch.  */
1378static void tcg_out_jxx(TCGContext *s, int opc, TCGLabel *l, bool small)
1379{
1380    int32_t val, val1;
1381
1382    if (l->has_value) {
1383        val = tcg_pcrel_diff(s, l->u.value_ptr);
1384        val1 = val - 2;
1385        if ((int8_t)val1 == val1) {
1386            if (opc == -1) {
1387                tcg_out8(s, OPC_JMP_short);
1388            } else {
1389                tcg_out8(s, OPC_JCC_short + opc);
1390            }
1391            tcg_out8(s, val1);
1392        } else {
1393            tcg_debug_assert(!small);
1394            if (opc == -1) {
1395                tcg_out8(s, OPC_JMP_long);
1396                tcg_out32(s, val - 5);
1397            } else {
1398                tcg_out_opc(s, OPC_JCC_long + opc, 0, 0, 0);
1399                tcg_out32(s, val - 6);
1400            }
1401        }
1402    } else if (small) {
1403        if (opc == -1) {
1404            tcg_out8(s, OPC_JMP_short);
1405        } else {
1406            tcg_out8(s, OPC_JCC_short + opc);
1407        }
1408        tcg_out_reloc(s, s->code_ptr, R_386_PC8, l, -1);
1409        s->code_ptr += 1;
1410    } else {
1411        if (opc == -1) {
1412            tcg_out8(s, OPC_JMP_long);
1413        } else {
1414            tcg_out_opc(s, OPC_JCC_long + opc, 0, 0, 0);
1415        }
1416        tcg_out_reloc(s, s->code_ptr, R_386_PC32, l, -4);
1417        s->code_ptr += 4;
1418    }
1419}
1420
1421static void tcg_out_cmp(TCGContext *s, TCGArg arg1, TCGArg arg2,
1422                        int const_arg2, int rexw)
1423{
1424    if (const_arg2) {
1425        if (arg2 == 0) {
1426            /* test r, r */
1427            tcg_out_modrm(s, OPC_TESTL + rexw, arg1, arg1);
1428        } else {
1429            tgen_arithi(s, ARITH_CMP + rexw, arg1, arg2, 0);
1430        }
1431    } else {
1432        tgen_arithr(s, ARITH_CMP + rexw, arg1, arg2);
1433    }
1434}
1435
1436static void tcg_out_brcond32(TCGContext *s, TCGCond cond,
1437                             TCGArg arg1, TCGArg arg2, int const_arg2,
1438                             TCGLabel *label, int small)
1439{
1440    tcg_out_cmp(s, arg1, arg2, const_arg2, 0);
1441    tcg_out_jxx(s, tcg_cond_to_jcc[cond], label, small);
1442}
1443
1444#if TCG_TARGET_REG_BITS == 64
1445static void tcg_out_brcond64(TCGContext *s, TCGCond cond,
1446                             TCGArg arg1, TCGArg arg2, int const_arg2,
1447                             TCGLabel *label, int small)
1448{
1449    tcg_out_cmp(s, arg1, arg2, const_arg2, P_REXW);
1450    tcg_out_jxx(s, tcg_cond_to_jcc[cond], label, small);
1451}
1452#else
1453/* XXX: we implement it at the target level to avoid having to
1454   handle cross-basic-block temporaries.  */
1455static void tcg_out_brcond2(TCGContext *s, const TCGArg *args,
1456                            const int *const_args, int small)
1457{
1458    TCGLabel *label_next = gen_new_label();
1459    TCGLabel *label_this = arg_label(args[5]);
1460
1461    switch(args[4]) {
1462    case TCG_COND_EQ:
1463        tcg_out_brcond32(s, TCG_COND_NE, args[0], args[2], const_args[2],
1464                         label_next, 1);
1465        tcg_out_brcond32(s, TCG_COND_EQ, args[1], args[3], const_args[3],
1466                         label_this, small);
1467        break;
1468    case TCG_COND_NE:
1469        tcg_out_brcond32(s, TCG_COND_NE, args[0], args[2], const_args[2],
1470                         label_this, small);
1471        tcg_out_brcond32(s, TCG_COND_NE, args[1], args[3], const_args[3],
1472                         label_this, small);
1473        break;
1474    case TCG_COND_LT:
1475        tcg_out_brcond32(s, TCG_COND_LT, args[1], args[3], const_args[3],
1476                         label_this, small);
1477        tcg_out_jxx(s, JCC_JNE, label_next, 1);
1478        tcg_out_brcond32(s, TCG_COND_LTU, args[0], args[2], const_args[2],
1479                         label_this, small);
1480        break;
1481    case TCG_COND_LE:
1482        tcg_out_brcond32(s, TCG_COND_LT, args[1], args[3], const_args[3],
1483                         label_this, small);
1484        tcg_out_jxx(s, JCC_JNE, label_next, 1);
1485        tcg_out_brcond32(s, TCG_COND_LEU, args[0], args[2], const_args[2],
1486                         label_this, small);
1487        break;
1488    case TCG_COND_GT:
1489        tcg_out_brcond32(s, TCG_COND_GT, args[1], args[3], const_args[3],
1490                         label_this, small);
1491        tcg_out_jxx(s, JCC_JNE, label_next, 1);
1492        tcg_out_brcond32(s, TCG_COND_GTU, args[0], args[2], const_args[2],
1493                         label_this, small);
1494        break;
1495    case TCG_COND_GE:
1496        tcg_out_brcond32(s, TCG_COND_GT, args[1], args[3], const_args[3],
1497                         label_this, small);
1498        tcg_out_jxx(s, JCC_JNE, label_next, 1);
1499        tcg_out_brcond32(s, TCG_COND_GEU, args[0], args[2], const_args[2],
1500                         label_this, small);
1501        break;
1502    case TCG_COND_LTU:
1503        tcg_out_brcond32(s, TCG_COND_LTU, args[1], args[3], const_args[3],
1504                         label_this, small);
1505        tcg_out_jxx(s, JCC_JNE, label_next, 1);
1506        tcg_out_brcond32(s, TCG_COND_LTU, args[0], args[2], const_args[2],
1507                         label_this, small);
1508        break;
1509    case TCG_COND_LEU:
1510        tcg_out_brcond32(s, TCG_COND_LTU, args[1], args[3], const_args[3],
1511                         label_this, small);
1512        tcg_out_jxx(s, JCC_JNE, label_next, 1);
1513        tcg_out_brcond32(s, TCG_COND_LEU, args[0], args[2], const_args[2],
1514                         label_this, small);
1515        break;
1516    case TCG_COND_GTU:
1517        tcg_out_brcond32(s, TCG_COND_GTU, args[1], args[3], const_args[3],
1518                         label_this, small);
1519        tcg_out_jxx(s, JCC_JNE, label_next, 1);
1520        tcg_out_brcond32(s, TCG_COND_GTU, args[0], args[2], const_args[2],
1521                         label_this, small);
1522        break;
1523    case TCG_COND_GEU:
1524        tcg_out_brcond32(s, TCG_COND_GTU, args[1], args[3], const_args[3],
1525                         label_this, small);
1526        tcg_out_jxx(s, JCC_JNE, label_next, 1);
1527        tcg_out_brcond32(s, TCG_COND_GEU, args[0], args[2], const_args[2],
1528                         label_this, small);
1529        break;
1530    default:
1531        g_assert_not_reached();
1532    }
1533    tcg_out_label(s, label_next);
1534}
1535#endif
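/* Illustrative expansion of the double-word branch above, for
   brcond2 lo1,hi1,lo2,hi2 with TCG_COND_LT:

       cmpl  hi2, hi1
       jl    label          ; signed compare decides on the high words
       jne   next           ; high words differ but not less: done
       cmpl  lo2, lo1
       jb    label          ; equal high words: low words compare unsigned
     next:

   EQ and NE are simpler, decomposing into two independent word compares.  */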
1536
1537static void tcg_out_setcond32(TCGContext *s, TCGCond cond, TCGArg dest,
1538                              TCGArg arg1, TCGArg arg2, int const_arg2)
1539{
1540    tcg_out_cmp(s, arg1, arg2, const_arg2, 0);
1541    tcg_out_modrm(s, OPC_SETCC | tcg_cond_to_jcc[cond], 0, dest);
1542    tcg_out_ext8u(s, dest, dest);
1543}
1544
1545#if TCG_TARGET_REG_BITS == 64
1546static void tcg_out_setcond64(TCGContext *s, TCGCond cond, TCGArg dest,
1547                              TCGArg arg1, TCGArg arg2, int const_arg2)
1548{
1549    tcg_out_cmp(s, arg1, arg2, const_arg2, P_REXW);
1550    tcg_out_modrm(s, OPC_SETCC | tcg_cond_to_jcc[cond], 0, dest);
1551    tcg_out_ext8u(s, dest, dest);
1552}
1553#else
1554static void tcg_out_setcond2(TCGContext *s, const TCGArg *args,
1555                             const int *const_args)
1556{
1557    TCGArg new_args[6];
1558    TCGLabel *label_true, *label_over;
1559
1560    memcpy(new_args, args+1, 5*sizeof(TCGArg));
1561
1562    if (args[0] == args[1] || args[0] == args[2]
1563        || (!const_args[3] && args[0] == args[3])
1564        || (!const_args[4] && args[0] == args[4])) {
1565        /* When the destination overlaps with one of the argument
1566           registers, don't do anything tricky.  */
1567        label_true = gen_new_label();
1568        label_over = gen_new_label();
1569
1570        new_args[5] = label_arg(label_true);
1571        tcg_out_brcond2(s, new_args, const_args+1, 1);
1572
1573        tcg_out_movi(s, TCG_TYPE_I32, args[0], 0);
1574        tcg_out_jxx(s, JCC_JMP, label_over, 1);
1575        tcg_out_label(s, label_true);
1576
1577        tcg_out_movi(s, TCG_TYPE_I32, args[0], 1);
1578        tcg_out_label(s, label_over);
1579    } else {
1580        /* When the destination does not overlap one of the arguments,
1581           clear the destination first, jump if cond false, and emit an
1582           increment in the true case.  This results in smaller code.  */
1583
1584        tcg_out_movi(s, TCG_TYPE_I32, args[0], 0);
1585
1586        label_over = gen_new_label();
1587        new_args[4] = tcg_invert_cond(new_args[4]);
1588        new_args[5] = label_arg(label_over);
1589        tcg_out_brcond2(s, new_args, const_args+1, 1);
1590
1591        tgen_arithi(s, ARITH_ADD, args[0], 1, 0);
1592        tcg_out_label(s, label_over);
1593    }
1594}
1595#endif
1596
1597static void tcg_out_cmov(TCGContext *s, TCGCond cond, int rexw,
1598                         TCGReg dest, TCGReg v1)
1599{
1600    if (have_cmov) {
1601        tcg_out_modrm(s, OPC_CMOVCC | tcg_cond_to_jcc[cond] | rexw, dest, v1);
1602    } else {
1603        TCGLabel *over = gen_new_label();
1604        tcg_out_jxx(s, tcg_cond_to_jcc[tcg_invert_cond(cond)], over, 1);
1605        tcg_out_mov(s, TCG_TYPE_I32, dest, v1);
1606        tcg_out_label(s, over);
1607    }
1608}
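
/*
 * Sketch of the two strategies, e.g. for TCG_COND_EQ (hypothetical regs):
 * with CMOV available this is a single "cmove %v1, %dest"; without it we
 * branch around a plain move using the inverted condition:
 *     jne  1f
 *     movl %v1, %dest
 * 1:
 */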
1609
1610static void tcg_out_movcond32(TCGContext *s, TCGCond cond, TCGReg dest,
1611                              TCGReg c1, TCGArg c2, int const_c2,
1612                              TCGReg v1)
1613{
1614    tcg_out_cmp(s, c1, c2, const_c2, 0);
1615    tcg_out_cmov(s, cond, 0, dest, v1);
1616}
1617
1618#if TCG_TARGET_REG_BITS == 64
1619static void tcg_out_movcond64(TCGContext *s, TCGCond cond, TCGReg dest,
1620                              TCGReg c1, TCGArg c2, int const_c2,
1621                              TCGReg v1)
1622{
1623    tcg_out_cmp(s, c1, c2, const_c2, P_REXW);
1624    tcg_out_cmov(s, cond, P_REXW, dest, v1);
1625}
1626#endif
1627
1628static void tcg_out_ctz(TCGContext *s, int rexw, TCGReg dest, TCGReg arg1,
1629                        TCGArg arg2, bool const_a2)
1630{
1631    if (have_bmi1) {
1632        tcg_out_modrm(s, OPC_TZCNT + rexw, dest, arg1);
1633        if (const_a2) {
1634            tcg_debug_assert(arg2 == (rexw ? 64 : 32));
1635        } else {
1636            tcg_debug_assert(dest != arg2);
1637            tcg_out_cmov(s, TCG_COND_LTU, rexw, dest, arg2);
1638        }
1639    } else {
1640        tcg_debug_assert(dest != arg2);
1641        tcg_out_modrm(s, OPC_BSF + rexw, dest, arg1);
1642        tcg_out_cmov(s, TCG_COND_EQ, rexw, dest, arg2);
1643    }
1644}
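
/*
 * Rough shape of the two ctz paths (illustrative register names):
 *     tzcnt %arg1, %dest;  cmovb %arg2, %dest   (TZCNT sets CF when arg1 == 0)
 * versus the BSF fallback:
 *     bsf   %arg1, %dest;  cmove %arg2, %dest   (BSF sets ZF when arg1 == 0)
 * With a constant arg2 equal to the operand size, TZCNT already produces
 * that value for a zero input and the cmov is omitted.
 */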
1645
1646static void tcg_out_clz(TCGContext *s, int rexw, TCGReg dest, TCGReg arg1,
1647                        TCGArg arg2, bool const_a2)
1648{
1649    if (have_lzcnt) {
1650        tcg_out_modrm(s, OPC_LZCNT + rexw, dest, arg1);
1651        if (const_a2) {
1652            tcg_debug_assert(arg2 == (rexw ? 64 : 32));
1653        } else {
1654            tcg_debug_assert(dest != arg2);
1655            tcg_out_cmov(s, TCG_COND_LTU, rexw, dest, arg2);
1656        }
1657    } else {
1658        tcg_debug_assert(!const_a2);
1659        tcg_debug_assert(dest != arg1);
1660        tcg_debug_assert(dest != arg2);
1661
1662        /* Recall that the output of BSR is the index, not the count.  */
1663        tcg_out_modrm(s, OPC_BSR + rexw, dest, arg1);
1664        tgen_arithi(s, ARITH_XOR + rexw, dest, rexw ? 63 : 31, 0);
1665
1666        /* Since we have destroyed the flags from BSR, we have to re-test.  */
1667        tcg_out_cmp(s, arg1, 0, 1, rexw);
1668        tcg_out_cmov(s, TCG_COND_EQ, rexw, dest, arg2);
1669    }
1670}
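
/*
 * Example of the BSR fallback for a 32-bit clz (illustrative register names):
 *     bsrl %arg1, %dest      index of the highest set bit; ZF=1 if arg1 == 0
 *     xorl $31, %dest        31 - index == number of leading zeros
 *     (re-compare arg1 against zero, since the flags were clobbered)
 *     cmove %arg2, %dest     fall back to arg2 when the input was zero
 */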
1671
1672static void tcg_out_branch(TCGContext *s, int call, const tcg_insn_unit *dest)
1673{
1674    intptr_t disp = tcg_pcrel_diff(s, dest) - 5;
1675
1676    if (disp == (int32_t)disp) {
1677        tcg_out_opc(s, call ? OPC_CALL_Jz : OPC_JMP_long, 0, 0, 0);
1678        tcg_out32(s, disp);
1679    } else {
1680        /* rip-relative addressing into the constant pool.
1681           This is 6 + 8 = 14 bytes, as compared to using an
1682           immediate load 10 + 6 = 16 bytes, plus we may
1683           be able to re-use the pool constant for more calls.  */
1684        tcg_out_opc(s, OPC_GRP5, 0, 0, 0);
1685        tcg_out8(s, (call ? EXT5_CALLN_Ev : EXT5_JMPN_Ev) << 3 | 5);
1686        new_pool_label(s, (uintptr_t)dest, R_386_PC32, s->code_ptr, -4);
1687        tcg_out32(s, 0);
1688    }
1689}
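
/*
 * Rough shape of the two encodings on x86_64 (pool_slot is hypothetical):
 *     call rel32               e8 xx xx xx xx, when dest is within +/-2GB
 *     call *pool_slot(%rip)    ff 15 xx xx xx xx, reading the 8-byte dest
 *                              from the constant pool otherwise
 * (likewise e9 / ff 25 for jmp).
 */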
1690
1691static void tcg_out_call(TCGContext *s, const tcg_insn_unit *dest,
1692                         const TCGHelperInfo *info)
1693{
1694    tcg_out_branch(s, 1, dest);
1695
1696#ifndef _WIN32
1697    if (TCG_TARGET_REG_BITS == 32 && info->out_kind == TCG_CALL_RET_BY_REF) {
1698        /*
1699         * The sysv i386 abi for struct return places a reference as the
1700         * first argument on the stack, and pops that argument with the
1701         * return statement.  Since we want to retain the aligned stack
1702         * pointer for the callee, we do not want to actually push that
1703         * argument before the call but rely on the normal store to the
1704         * stack slot.  But we do need to compensate for the pop in order
1705         * to reset our correct stack pointer value.
1706         * Pushing a garbage value back onto the stack is quickest.
1707         */
1708        tcg_out_push(s, TCG_REG_EAX);
1709    }
1710#endif
1711}
1712
1713static void tcg_out_jmp(TCGContext *s, const tcg_insn_unit *dest)
1714{
1715    tcg_out_branch(s, 0, dest);
1716}
1717
1718static void tcg_out_nopn(TCGContext *s, int n)
1719{
1720    int i;
1721    /* Emit 1 or 2 operand size prefixes for the standard one byte nop,
1722     * "xchg %eax,%eax", forming "xchg %ax,%ax". All cores accept the
1723     * duplicate prefix, and all of the interesting recent cores can
1724     * decode and discard the duplicates in a single cycle.
1725     */
1726    tcg_debug_assert(n >= 1);
1727    for (i = 1; i < n; ++i) {
1728        tcg_out8(s, 0x66);
1729    }
1730    tcg_out8(s, 0x90);
1731}
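
/* For example, tcg_out_nopn(s, 3) emits 66 66 90, a single three-byte
   "xchg %ax,%ax" rather than three separate one-byte nops. */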
1732
1733/* Test register R vs immediate bits I, setting Z flag for EQ/NE. */
1734static void __attribute__((unused))
1735tcg_out_testi(TCGContext *s, TCGReg r, uint32_t i)
1736{
1737    /*
1738     * This is used for testing alignment, so we can usually use testb.
1739     * For i686, we have to use testl for %esi/%edi.
1740     */
1741    if (i <= 0xff && (TCG_TARGET_REG_BITS == 64 || r < 4)) {
1742        tcg_out_modrm(s, OPC_GRP3_Eb | P_REXB_RM, EXT3_TESTi, r);
1743        tcg_out8(s, i);
1744    } else {
1745        tcg_out_modrm(s, OPC_GRP3_Ev, EXT3_TESTi, r);
1746        tcg_out32(s, i);
1747    }
1748}
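
/* E.g. tcg_out_testi(s, TCG_REG_EAX, 15) emits "testb $15, %al", while %esi
   or %edi on i686 have no byte encoding and get "testl $15, %esi" instead. */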
1749
1750typedef struct {
1751    TCGReg base;
1752    int index;
1753    int ofs;
1754    int seg;
1755    TCGAtomAlign aa;
1756} HostAddress;
1757
1758bool tcg_target_has_memory_bswap(MemOp memop)
1759{
1760    TCGAtomAlign aa;
1761
1762    if (!have_movbe) {
1763        return false;
1764    }
1765    if ((memop & MO_SIZE) < MO_128) {
1766        return true;
1767    }
1768
1769    /*
1770     * Reject 16-byte memop with 16-byte atomicity, i.e. VMOVDQA,
1771     * but do allow a pair of 64-bit operations, i.e. MOVBEQ.
1772     */
1773    aa = atom_and_align_for_opc(tcg_ctx, memop, MO_ATOM_IFALIGN, true);
1774    return aa.atom < MO_128;
1775}
1776
1777/*
1778 * Because i686 has no register parameters and because x86_64 has xchg
1779 * to handle addr/data register overlap, we have placed all input arguments
1780 * before we might need a scratch reg.
1781 *
1782 * Even then, a scratch is only needed for l->raddr.  Rather than expose
1783 * a general-purpose scratch when we don't actually know it's available,
1784 * use the ra_gen hook to load into RAX if needed.
1785 */
1786#if TCG_TARGET_REG_BITS == 64
1787static TCGReg ldst_ra_gen(TCGContext *s, const TCGLabelQemuLdst *l, int arg)
1788{
1789    if (arg < 0) {
1790        arg = TCG_REG_RAX;
1791    }
1792    tcg_out_movi(s, TCG_TYPE_PTR, arg, (uintptr_t)l->raddr);
1793    return arg;
1794}
1795static const TCGLdstHelperParam ldst_helper_param = {
1796    .ra_gen = ldst_ra_gen
1797};
1798#else
1799static const TCGLdstHelperParam ldst_helper_param = { };
1800#endif
1801
1802static void tcg_out_vec_to_pair(TCGContext *s, TCGType type,
1803                                TCGReg l, TCGReg h, TCGReg v)
1804{
1805    int rexw = type == TCG_TYPE_I32 ? 0 : P_REXW;
1806
1807    /* vpmov{d,q} %v, %l */
1808    tcg_out_vex_modrm(s, OPC_MOVD_EyVy + rexw, v, 0, l);
1809    /* vpextr{d,q} $1, %v, %h */
1810    tcg_out_vex_modrm(s, OPC_PEXTRD + rexw, v, 0, h);
1811    tcg_out8(s, 1);
1812}
1813
1814static void tcg_out_pair_to_vec(TCGContext *s, TCGType type,
1815                                TCGReg v, TCGReg l, TCGReg h)
1816{
1817    int rexw = type == TCG_TYPE_I32 ? 0 : P_REXW;
1818
1819    /* vmov{d,q} %l, %v */
1820    tcg_out_vex_modrm(s, OPC_MOVD_VyEy + rexw, v, 0, l);
1821    /* vpinsr{d,q} $1, %h, %v, %v */
1822    tcg_out_vex_modrm(s, OPC_PINSRD + rexw, v, v, h);
1823    tcg_out8(s, 1);
1824}
1825
1826/*
1827 * Generate code for the slow path for a load at the end of block
1828 */
1829static bool tcg_out_qemu_ld_slow_path(TCGContext *s, TCGLabelQemuLdst *l)
1830{
1831    MemOp opc = get_memop(l->oi);
1832    tcg_insn_unit **label_ptr = &l->label_ptr[0];
1833
1834    /* resolve label address */
1835    tcg_patch32(label_ptr[0], s->code_ptr - label_ptr[0] - 4);
1836    if (label_ptr[1]) {
1837        tcg_patch32(label_ptr[1], s->code_ptr - label_ptr[1] - 4);
1838    }
1839
1840    tcg_out_ld_helper_args(s, l, &ldst_helper_param);
1841    tcg_out_branch(s, 1, qemu_ld_helpers[opc & MO_SIZE]);
1842    tcg_out_ld_helper_ret(s, l, false, &ldst_helper_param);
1843
1844    tcg_out_jmp(s, l->raddr);
1845    return true;
1846}
1847
1848/*
1849 * Generate code for the slow path for a store at the end of block
1850 */
1851static bool tcg_out_qemu_st_slow_path(TCGContext *s, TCGLabelQemuLdst *l)
1852{
1853    MemOp opc = get_memop(l->oi);
1854    tcg_insn_unit **label_ptr = &l->label_ptr[0];
1855
1856    /* resolve label address */
1857    tcg_patch32(label_ptr[0], s->code_ptr - label_ptr[0] - 4);
1858    if (label_ptr[1]) {
1859        tcg_patch32(label_ptr[1], s->code_ptr - label_ptr[1] - 4);
1860    }
1861
1862    tcg_out_st_helper_args(s, l, &ldst_helper_param);
1863    tcg_out_branch(s, 1, qemu_st_helpers[opc & MO_SIZE]);
1864
1865    tcg_out_jmp(s, l->raddr);
1866    return true;
1867}
1868
1869#ifndef CONFIG_SOFTMMU
1870static HostAddress x86_guest_base = {
1871    .index = -1
1872};
1873
1874#if defined(__x86_64__) && defined(__linux__)
1875# include <asm/prctl.h>
1876# include <sys/prctl.h>
1877int arch_prctl(int code, unsigned long addr);
1878static inline int setup_guest_base_seg(void)
1879{
1880    if (arch_prctl(ARCH_SET_GS, guest_base) == 0) {
1881        return P_GS;
1882    }
1883    return 0;
1884}
1885#elif defined(__x86_64__) && \
1886      (defined (__FreeBSD__) || defined (__FreeBSD_kernel__))
1887# include <machine/sysarch.h>
1888static inline int setup_guest_base_seg(void)
1889{
1890    if (sysarch(AMD64_SET_GSBASE, &guest_base) == 0) {
1891        return P_GS;
1892    }
1893    return 0;
1894}
1895#else
1896static inline int setup_guest_base_seg(void)
1897{
1898    return 0;
1899}
1900#endif /* setup_guest_base_seg */
1901#endif /* !SOFTMMU */
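
/*
 * When one of the variants above succeeds, the returned P_GS flag (stored
 * into x86_guest_base.seg elsewhere in this file) becomes a %gs segment
 * override on the emitted guest accesses, so adding guest_base costs no
 * extra instructions on the fast path.
 */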
1902
1903#define MIN_TLB_MASK_TABLE_OFS  INT_MIN
1904
1905/*
1906 * For softmmu, perform the TLB load and compare.
1907 * For useronly, perform any required alignment tests.
1908 * In both cases, return a TCGLabelQemuLdst structure if the slow path
1909 * is required and fill in @h with the host address for the fast path.
1910 */
1911static TCGLabelQemuLdst *prepare_host_addr(TCGContext *s, HostAddress *h,
1912                                           TCGReg addrlo, TCGReg addrhi,
1913                                           MemOpIdx oi, bool is_ld)
1914{
1915    TCGLabelQemuLdst *ldst = NULL;
1916    MemOp opc = get_memop(oi);
1917    MemOp s_bits = opc & MO_SIZE;
1918    unsigned a_mask;
1919
1920#ifdef CONFIG_SOFTMMU
1921    h->index = TCG_REG_L0;
1922    h->ofs = 0;
1923    h->seg = 0;
1924#else
1925    *h = x86_guest_base;
1926#endif
1927    h->base = addrlo;
1928    h->aa = atom_and_align_for_opc(s, opc, MO_ATOM_IFALIGN, s_bits == MO_128);
1929    a_mask = (1 << h->aa.align) - 1;
1930
1931#ifdef CONFIG_SOFTMMU
1932    int cmp_ofs = is_ld ? offsetof(CPUTLBEntry, addr_read)
1933                        : offsetof(CPUTLBEntry, addr_write);
1934    TCGType ttype = TCG_TYPE_I32;
1935    TCGType tlbtype = TCG_TYPE_I32;
1936    int trexw = 0, hrexw = 0, tlbrexw = 0;
1937    unsigned mem_index = get_mmuidx(oi);
1938    unsigned s_mask = (1 << s_bits) - 1;
1939    int fast_ofs = tlb_mask_table_ofs(s, mem_index);
1940    int tlb_mask;
1941
1942    ldst = new_ldst_label(s);
1943    ldst->is_ld = is_ld;
1944    ldst->oi = oi;
1945    ldst->addrlo_reg = addrlo;
1946    ldst->addrhi_reg = addrhi;
1947
1948    if (TCG_TARGET_REG_BITS == 64) {
1949        ttype = s->addr_type;
1950        trexw = (ttype == TCG_TYPE_I32 ? 0 : P_REXW);
1951        if (TCG_TYPE_PTR == TCG_TYPE_I64) {
1952            hrexw = P_REXW;
1953            if (s->page_bits + s->tlb_dyn_max_bits > 32) {
1954                tlbtype = TCG_TYPE_I64;
1955                tlbrexw = P_REXW;
1956            }
1957        }
1958    }
1959
1960    tcg_out_mov(s, tlbtype, TCG_REG_L0, addrlo);
1961    tcg_out_shifti(s, SHIFT_SHR + tlbrexw, TCG_REG_L0,
1962                   s->page_bits - CPU_TLB_ENTRY_BITS);
1963
1964    tcg_out_modrm_offset(s, OPC_AND_GvEv + trexw, TCG_REG_L0, TCG_AREG0,
1965                         fast_ofs + offsetof(CPUTLBDescFast, mask));
1966
1967    tcg_out_modrm_offset(s, OPC_ADD_GvEv + hrexw, TCG_REG_L0, TCG_AREG0,
1968                         fast_ofs + offsetof(CPUTLBDescFast, table));
1969
1970    /*
1971     * If the required alignment is at least as large as the access, simply
1972     * copy the address and mask.  For lesser alignments, check that we don't
1973     * cross pages for the complete access.
1974     */
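    /*
     * Worked example: a 4-byte access with no alignment requirement has
     * s_mask = 3 and a_mask = 0, so TCG_REG_L1 = (addrlo + 3) & page_mask;
     * the compare below then only matches when addrlo + 3 is still on the
     * same page, pushing page-crossing accesses onto the slow path.
     */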
1975    if (a_mask >= s_mask) {
1976        tcg_out_mov(s, ttype, TCG_REG_L1, addrlo);
1977    } else {
1978        tcg_out_modrm_offset(s, OPC_LEA + trexw, TCG_REG_L1,
1979                             addrlo, s_mask - a_mask);
1980    }
1981    tlb_mask = s->page_mask | a_mask;
1982    tgen_arithi(s, ARITH_AND + trexw, TCG_REG_L1, tlb_mask, 0);
1983
1984    /* cmp 0(TCG_REG_L0), TCG_REG_L1 */
1985    tcg_out_modrm_offset(s, OPC_CMP_GvEv + trexw,
1986                         TCG_REG_L1, TCG_REG_L0, cmp_ofs);
1987
1988    /* jne slow_path */
1989    tcg_out_opc(s, OPC_JCC_long + JCC_JNE, 0, 0, 0);
1990    ldst->label_ptr[0] = s->code_ptr;
1991    s->code_ptr += 4;
1992
1993    if (TCG_TARGET_REG_BITS == 32 && s->addr_type == TCG_TYPE_I64) {
1994        /* cmp 4(TCG_REG_L0), addrhi */
1995        tcg_out_modrm_offset(s, OPC_CMP_GvEv, addrhi, TCG_REG_L0, cmp_ofs + 4);
1996
1997        /* jne slow_path */
1998        tcg_out_opc(s, OPC_JCC_long + JCC_JNE, 0, 0, 0);
1999        ldst->label_ptr[1] = s->code_ptr;
2000        s->code_ptr += 4;
2001    }
2002
2003    /* TLB Hit.  */
2004    tcg_out_ld(s, TCG_TYPE_PTR, TCG_REG_L0, TCG_REG_L0,
2005               offsetof(CPUTLBEntry, addend));
2006#else
2007    if (a_mask) {
2008        ldst = new_ldst_label(s);
2009
2010        ldst->is_ld = is_ld;
2011        ldst->oi = oi;
2012        ldst->addrlo_reg = addrlo;
2013        ldst->addrhi_reg = addrhi;
2014
2015        tcg_out_testi(s, addrlo, a_mask);
2016        /* jne slow_path */
2017        tcg_out_opc(s, OPC_JCC_long + JCC_JNE, 0, 0, 0);
2018        ldst->label_ptr[0] = s->code_ptr;
2019        s->code_ptr += 4;
2020    }
2021#endif
2022
2023    return ldst;
2024}
2025
2026static void tcg_out_qemu_ld_direct(TCGContext *s, TCGReg datalo, TCGReg datahi,
2027                                   HostAddress h, TCGType type, MemOp memop)
2028{
2029    bool use_movbe = false;
2030    int rexw = (type == TCG_TYPE_I32 ? 0 : P_REXW);
2031    int movop = OPC_MOVL_GvEv;
2032
2033    /* Do big-endian loads with movbe.  */
2034    if (memop & MO_BSWAP) {
2035        tcg_debug_assert(have_movbe);
2036        use_movbe = true;
2037        movop = OPC_MOVBE_GyMy;
2038    }
2039
2040    switch (memop & MO_SSIZE) {
2041    case MO_UB:
2042        tcg_out_modrm_sib_offset(s, OPC_MOVZBL + h.seg, datalo,
2043                                 h.base, h.index, 0, h.ofs);
2044        break;
2045    case MO_SB:
2046        tcg_out_modrm_sib_offset(s, OPC_MOVSBL + rexw + h.seg, datalo,
2047                                 h.base, h.index, 0, h.ofs);
2048        break;
2049    case MO_UW:
2050        if (use_movbe) {
2051            /* There is no extending movbe; only low 16-bits are modified.  */
2052            if (datalo != h.base && datalo != h.index) {
2053                /* XOR breaks dependency chains.  */
2054                tgen_arithr(s, ARITH_XOR, datalo, datalo);
2055                tcg_out_modrm_sib_offset(s, OPC_MOVBE_GyMy + P_DATA16 + h.seg,
2056                                         datalo, h.base, h.index, 0, h.ofs);
2057            } else {
2058                tcg_out_modrm_sib_offset(s, OPC_MOVBE_GyMy + P_DATA16 + h.seg,
2059                                         datalo, h.base, h.index, 0, h.ofs);
2060                tcg_out_ext16u(s, datalo, datalo);
2061            }
2062        } else {
2063            tcg_out_modrm_sib_offset(s, OPC_MOVZWL + h.seg, datalo,
2064                                     h.base, h.index, 0, h.ofs);
2065        }
2066        break;
2067    case MO_SW:
2068        if (use_movbe) {
2069            tcg_out_modrm_sib_offset(s, OPC_MOVBE_GyMy + P_DATA16 + h.seg,
2070                                     datalo, h.base, h.index, 0, h.ofs);
2071            tcg_out_ext16s(s, type, datalo, datalo);
2072        } else {
2073            tcg_out_modrm_sib_offset(s, OPC_MOVSWL + rexw + h.seg,
2074                                     datalo, h.base, h.index, 0, h.ofs);
2075        }
2076        break;
2077    case MO_UL:
2078        tcg_out_modrm_sib_offset(s, movop + h.seg, datalo,
2079                                 h.base, h.index, 0, h.ofs);
2080        break;
2081#if TCG_TARGET_REG_BITS == 64
2082    case MO_SL:
2083        if (use_movbe) {
2084            tcg_out_modrm_sib_offset(s, OPC_MOVBE_GyMy + h.seg, datalo,
2085                                     h.base, h.index, 0, h.ofs);
2086            tcg_out_ext32s(s, datalo, datalo);
2087        } else {
2088            tcg_out_modrm_sib_offset(s, OPC_MOVSLQ + h.seg, datalo,
2089                                     h.base, h.index, 0, h.ofs);
2090        }
2091        break;
2092#endif
2093    case MO_UQ:
2094        if (TCG_TARGET_REG_BITS == 64) {
2095            tcg_out_modrm_sib_offset(s, movop + P_REXW + h.seg, datalo,
2096                                     h.base, h.index, 0, h.ofs);
2097            break;
2098        }
2099        if (use_movbe) {
2100            TCGReg t = datalo;
2101            datalo = datahi;
2102            datahi = t;
2103        }
2104        if (h.base == datalo || h.index == datalo) {
2105            tcg_out_modrm_sib_offset(s, OPC_LEA, datahi,
2106                                     h.base, h.index, 0, h.ofs);
2107            tcg_out_modrm_offset(s, movop + h.seg, datalo, datahi, 0);
2108            tcg_out_modrm_offset(s, movop + h.seg, datahi, datahi, 4);
2109        } else {
2110            tcg_out_modrm_sib_offset(s, movop + h.seg, datalo,
2111                                     h.base, h.index, 0, h.ofs);
2112            tcg_out_modrm_sib_offset(s, movop + h.seg, datahi,
2113                                     h.base, h.index, 0, h.ofs + 4);
2114        }
2115        break;
2116
2117    case MO_128:
2118        tcg_debug_assert(TCG_TARGET_REG_BITS == 64);
2119
2120        /*
2121         * Without 16-byte atomicity, use integer regs.
2122         * That is where we want the data, and it allows bswaps.
2123         */
2124        if (h.aa.atom < MO_128) {
2125            if (use_movbe) {
2126                TCGReg t = datalo;
2127                datalo = datahi;
2128                datahi = t;
2129            }
2130            if (h.base == datalo || h.index == datalo) {
2131                tcg_out_modrm_sib_offset(s, OPC_LEA + P_REXW, datahi,
2132                                         h.base, h.index, 0, h.ofs);
2133                tcg_out_modrm_offset(s, movop + P_REXW + h.seg,
2134                                     datalo, datahi, 0);
2135                tcg_out_modrm_offset(s, movop + P_REXW + h.seg,
2136                                     datahi, datahi, 8);
2137            } else {
2138                tcg_out_modrm_sib_offset(s, movop + P_REXW + h.seg, datalo,
2139                                         h.base, h.index, 0, h.ofs);
2140                tcg_out_modrm_sib_offset(s, movop + P_REXW + h.seg, datahi,
2141                                         h.base, h.index, 0, h.ofs + 8);
2142            }
2143            break;
2144        }
2145
2146        /*
2147         * With 16-byte atomicity, a vector load is required.
2148         * If we already have 16-byte alignment, then VMOVDQA always works.
2149         * Else if VMOVDQU has atomicity with dynamic alignment, use that.
2150         * Else we require a runtime test for alignment for VMOVDQA;
2151         * use VMOVDQU on the unaligned nonatomic path for simplicity.
2152         */
2153        if (h.aa.align >= MO_128) {
2154            tcg_out_vex_modrm_sib_offset(s, OPC_MOVDQA_VxWx + h.seg,
2155                                         TCG_TMP_VEC, 0,
2156                                         h.base, h.index, 0, h.ofs);
2157        } else if (cpuinfo & CPUINFO_ATOMIC_VMOVDQU) {
2158            tcg_out_vex_modrm_sib_offset(s, OPC_MOVDQU_VxWx + h.seg,
2159                                         TCG_TMP_VEC, 0,
2160                                         h.base, h.index, 0, h.ofs);
2161        } else {
2162            TCGLabel *l1 = gen_new_label();
2163            TCGLabel *l2 = gen_new_label();
2164
2165            tcg_out_testi(s, h.base, 15);
2166            tcg_out_jxx(s, JCC_JNE, l1, true);
2167
2168            tcg_out_vex_modrm_sib_offset(s, OPC_MOVDQA_VxWx + h.seg,
2169                                         TCG_TMP_VEC, 0,
2170                                         h.base, h.index, 0, h.ofs);
2171            tcg_out_jxx(s, JCC_JMP, l2, true);
2172
2173            tcg_out_label(s, l1);
2174            tcg_out_vex_modrm_sib_offset(s, OPC_MOVDQU_VxWx + h.seg,
2175                                         TCG_TMP_VEC, 0,
2176                                         h.base, h.index, 0, h.ofs);
2177            tcg_out_label(s, l2);
2178        }
2179        tcg_out_vec_to_pair(s, TCG_TYPE_I64, datalo, datahi, TCG_TMP_VEC);
2180        break;
2181
2182    default:
2183        g_assert_not_reached();
2184    }
2185}
2186
2187static void tcg_out_qemu_ld(TCGContext *s, TCGReg datalo, TCGReg datahi,
2188                            TCGReg addrlo, TCGReg addrhi,
2189                            MemOpIdx oi, TCGType data_type)
2190{
2191    TCGLabelQemuLdst *ldst;
2192    HostAddress h;
2193
2194    ldst = prepare_host_addr(s, &h, addrlo, addrhi, oi, true);
2195    tcg_out_qemu_ld_direct(s, datalo, datahi, h, data_type, get_memop(oi));
2196
2197    if (ldst) {
2198        ldst->type = data_type;
2199        ldst->datalo_reg = datalo;
2200        ldst->datahi_reg = datahi;
2201        ldst->raddr = tcg_splitwx_to_rx(s->code_ptr);
2202    }
2203}
2204
2205static void tcg_out_qemu_st_direct(TCGContext *s, TCGReg datalo, TCGReg datahi,
2206                                   HostAddress h, MemOp memop)
2207{
2208    bool use_movbe = false;
2209    int movop = OPC_MOVL_EvGv;
2210
2211    /*
2212     * Do big-endian stores with movbe or softmmu.
2213     * User-only without movbe will have its swapping done generically.
2214     */
2215    if (memop & MO_BSWAP) {
2216        tcg_debug_assert(have_movbe);
2217        use_movbe = true;
2218        movop = OPC_MOVBE_MyGy;
2219    }
2220
2221    switch (memop & MO_SIZE) {
2222    case MO_8:
2223        /* This is handled with constraints on INDEX_op_qemu_st8_i32. */
2224        tcg_debug_assert(TCG_TARGET_REG_BITS == 64 || datalo < 4);
2225        tcg_out_modrm_sib_offset(s, OPC_MOVB_EvGv + P_REXB_R + h.seg,
2226                                 datalo, h.base, h.index, 0, h.ofs);
2227        break;
2228    case MO_16:
2229        tcg_out_modrm_sib_offset(s, movop + P_DATA16 + h.seg, datalo,
2230                                 h.base, h.index, 0, h.ofs);
2231        break;
2232    case MO_32:
2233        tcg_out_modrm_sib_offset(s, movop + h.seg, datalo,
2234                                 h.base, h.index, 0, h.ofs);
2235        break;
2236    case MO_64:
2237        if (TCG_TARGET_REG_BITS == 64) {
2238            tcg_out_modrm_sib_offset(s, movop + P_REXW + h.seg, datalo,
2239                                     h.base, h.index, 0, h.ofs);
2240        } else {
2241            if (use_movbe) {
2242                TCGReg t = datalo;
2243                datalo = datahi;
2244                datahi = t;
2245            }
2246            tcg_out_modrm_sib_offset(s, movop + h.seg, datalo,
2247                                     h.base, h.index, 0, h.ofs);
2248            tcg_out_modrm_sib_offset(s, movop + h.seg, datahi,
2249                                     h.base, h.index, 0, h.ofs + 4);
2250        }
2251        break;
2252
2253    case MO_128:
2254        tcg_debug_assert(TCG_TARGET_REG_BITS == 64);
2255
2256        /*
2257         * Without 16-byte atomicity, use integer regs.
2258         * That is where we have the data, and it allows bswaps.
2259         */
2260        if (h.aa.atom < MO_128) {
2261            if (use_movbe) {
2262                TCGReg t = datalo;
2263                datalo = datahi;
2264                datahi = t;
2265            }
2266            tcg_out_modrm_sib_offset(s, movop + P_REXW + h.seg, datalo,
2267                                     h.base, h.index, 0, h.ofs);
2268            tcg_out_modrm_sib_offset(s, movop + P_REXW + h.seg, datahi,
2269                                     h.base, h.index, 0, h.ofs + 8);
2270            break;
2271        }
2272
2273        /*
2274         * With 16-byte atomicity, a vector store is required.
2275         * If we already have 16-byte alignment, then VMOVDQA always works.
2276         * Else if VMOVDQU has atomicity with dynamic alignment, use that.
2277         * Else we require a runtime test for alignment for VMOVDQA;
2278         * use VMOVDQU on the unaligned nonatomic path for simplicity.
2279         */
2280        tcg_out_pair_to_vec(s, TCG_TYPE_I64, TCG_TMP_VEC, datalo, datahi);
2281        if (h.aa.align >= MO_128) {
2282            tcg_out_vex_modrm_sib_offset(s, OPC_MOVDQA_WxVx + h.seg,
2283                                         TCG_TMP_VEC, 0,
2284                                         h.base, h.index, 0, h.ofs);
2285        } else if (cpuinfo & CPUINFO_ATOMIC_VMOVDQU) {
2286            tcg_out_vex_modrm_sib_offset(s, OPC_MOVDQU_WxVx + h.seg,
2287                                         TCG_TMP_VEC, 0,
2288                                         h.base, h.index, 0, h.ofs);
2289        } else {
2290            TCGLabel *l1 = gen_new_label();
2291            TCGLabel *l2 = gen_new_label();
2292
2293            tcg_out_testi(s, h.base, 15);
2294            tcg_out_jxx(s, JCC_JNE, l1, true);
2295
2296            tcg_out_vex_modrm_sib_offset(s, OPC_MOVDQA_WxVx + h.seg,
2297                                         TCG_TMP_VEC, 0,
2298                                         h.base, h.index, 0, h.ofs);
2299            tcg_out_jxx(s, JCC_JMP, l2, true);
2300
2301            tcg_out_label(s, l1);
2302            tcg_out_vex_modrm_sib_offset(s, OPC_MOVDQU_WxVx + h.seg,
2303                                         TCG_TMP_VEC, 0,
2304                                         h.base, h.index, 0, h.ofs);
2305            tcg_out_label(s, l2);
2306        }
2307        break;
2308
2309    default:
2310        g_assert_not_reached();
2311    }
2312}
2313
2314static void tcg_out_qemu_st(TCGContext *s, TCGReg datalo, TCGReg datahi,
2315                            TCGReg addrlo, TCGReg addrhi,
2316                            MemOpIdx oi, TCGType data_type)
2317{
2318    TCGLabelQemuLdst *ldst;
2319    HostAddress h;
2320
2321    ldst = prepare_host_addr(s, &h, addrlo, addrhi, oi, false);
2322    tcg_out_qemu_st_direct(s, datalo, datahi, h, get_memop(oi));
2323
2324    if (ldst) {
2325        ldst->type = data_type;
2326        ldst->datalo_reg = datalo;
2327        ldst->datahi_reg = datahi;
2328        ldst->raddr = tcg_splitwx_to_rx(s->code_ptr);
2329    }
2330}
2331
2332static void tcg_out_exit_tb(TCGContext *s, uintptr_t a0)
2333{
2334    /* Reuse the zeroing that exists for goto_ptr.  */
2335    if (a0 == 0) {
2336        tcg_out_jmp(s, tcg_code_gen_epilogue);
2337    } else {
2338        tcg_out_movi(s, TCG_TYPE_PTR, TCG_REG_EAX, a0);
2339        tcg_out_jmp(s, tb_ret_addr);
2340    }
2341}
2342
2343static void tcg_out_goto_tb(TCGContext *s, int which)
2344{
2345    /*
2346     * Jump displacement must be aligned for atomic patching;
2347     * see if we need to add extra nops before the jump
2348     */
2349    int gap = QEMU_ALIGN_PTR_UP(s->code_ptr + 1, 4) - s->code_ptr;
2350    if (gap != 1) {
2351        tcg_out_nopn(s, gap - 1);
2352    }
2353    tcg_out8(s, OPC_JMP_long); /* jmp im */
2354    set_jmp_insn_offset(s, which);
2355    tcg_out32(s, 0);
2356    set_jmp_reset_offset(s, which);
2357}
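
/*
 * Alignment example (hypothetical addresses): if the JMP opcode would land
 * at an address ending in ...2, its rel32 would start at ...3, so one nop
 * byte is emitted first; the opcode then sits at ...3 and the displacement
 * at ...4, where an aligned 32-bit atomic store can patch it.
 */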
2358
2359void tb_target_set_jmp_target(const TranslationBlock *tb, int n,
2360                              uintptr_t jmp_rx, uintptr_t jmp_rw)
2361{
2362    /* patch the branch destination */
2363    uintptr_t addr = tb->jmp_target_addr[n];
2364    qatomic_set((int32_t *)jmp_rw, addr - (jmp_rx + 4));
2365    /* no need to flush icache explicitly */
2366}
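
/*
 * The value stored is a rel32: the target minus the address of the byte
 * following the displacement (jmp_rx + 4), matching how the CPU resolves
 * the JMP emitted by tcg_out_goto_tb() above.
 */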
2367
2368static inline void tcg_out_op(TCGContext *s, TCGOpcode opc,
2369                              const TCGArg args[TCG_MAX_OP_ARGS],
2370                              const int const_args[TCG_MAX_OP_ARGS])
2371{
2372    TCGArg a0, a1, a2;
2373    int c, const_a2, vexop, rexw = 0;
2374
2375#if TCG_TARGET_REG_BITS == 64
2376# define OP_32_64(x) \
2377        case glue(glue(INDEX_op_, x), _i64): \
2378            rexw = P_REXW; /* FALLTHRU */    \
2379        case glue(glue(INDEX_op_, x), _i32)
2380#else
2381# define OP_32_64(x) \
2382        case glue(glue(INDEX_op_, x), _i32)
2383#endif
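    /*
     * E.g. on a 64-bit host, OP_32_64(add) expands to the two case labels
     * INDEX_op_add_i64 (which sets rexw = P_REXW and falls through) and
     * INDEX_op_add_i32, so one arm below handles both operand widths.
     */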
2384
2385    /* Hoist the loads of the most common arguments.  */
2386    a0 = args[0];
2387    a1 = args[1];
2388    a2 = args[2];
2389    const_a2 = const_args[2];
2390
2391    switch (opc) {
2392    case INDEX_op_goto_ptr:
2393        /* jmp to the given host address (could be epilogue) */
2394        tcg_out_modrm(s, OPC_GRP5, EXT5_JMPN_Ev, a0);
2395        break;
2396    case INDEX_op_br:
2397        tcg_out_jxx(s, JCC_JMP, arg_label(a0), 0);
2398        break;
2399    OP_32_64(ld8u):
2400        /* Note that we can ignore REXW for the zero-extend to 64-bit.  */
2401        tcg_out_modrm_offset(s, OPC_MOVZBL, a0, a1, a2);
2402        break;
2403    OP_32_64(ld8s):
2404        tcg_out_modrm_offset(s, OPC_MOVSBL + rexw, a0, a1, a2);
2405        break;
2406    OP_32_64(ld16u):
2407        /* Note that we can ignore REXW for the zero-extend to 64-bit.  */
2408        tcg_out_modrm_offset(s, OPC_MOVZWL, a0, a1, a2);
2409        break;
2410    OP_32_64(ld16s):
2411        tcg_out_modrm_offset(s, OPC_MOVSWL + rexw, a0, a1, a2);
2412        break;
2413#if TCG_TARGET_REG_BITS == 64
2414    case INDEX_op_ld32u_i64:
2415#endif
2416    case INDEX_op_ld_i32:
2417        tcg_out_ld(s, TCG_TYPE_I32, a0, a1, a2);
2418        break;
2419
2420    OP_32_64(st8):
2421        if (const_args[0]) {
2422            tcg_out_modrm_offset(s, OPC_MOVB_EvIz, 0, a1, a2);
2423            tcg_out8(s, a0);
2424        } else {
2425            tcg_out_modrm_offset(s, OPC_MOVB_EvGv | P_REXB_R, a0, a1, a2);
2426        }
2427        break;
2428    OP_32_64(st16):
2429        if (const_args[0]) {
2430            tcg_out_modrm_offset(s, OPC_MOVL_EvIz | P_DATA16, 0, a1, a2);
2431            tcg_out16(s, a0);
2432        } else {
2433            tcg_out_modrm_offset(s, OPC_MOVL_EvGv | P_DATA16, a0, a1, a2);
2434        }
2435        break;
2436#if TCG_TARGET_REG_BITS == 64
2437    case INDEX_op_st32_i64:
2438#endif
2439    case INDEX_op_st_i32:
2440        if (const_args[0]) {
2441            tcg_out_modrm_offset(s, OPC_MOVL_EvIz, 0, a1, a2);
2442            tcg_out32(s, a0);
2443        } else {
2444            tcg_out_st(s, TCG_TYPE_I32, a0, a1, a2);
2445        }
2446        break;
2447
2448    OP_32_64(add):
2449        /* For 3-operand addition, use LEA.  */
2450        if (a0 != a1) {
2451            TCGArg c3 = 0;
2452            if (const_a2) {
2453                c3 = a2, a2 = -1;
2454            } else if (a0 == a2) {
2455                /* Watch out for dest = src + dest, since we've removed
2456                   the matching constraint on the add.  */
2457                tgen_arithr(s, ARITH_ADD + rexw, a0, a1);
2458                break;
2459            }
2460
2461            tcg_out_modrm_sib_offset(s, OPC_LEA + rexw, a0, a1, a2, 0, c3);
2462            break;
2463        }
2464        c = ARITH_ADD;
2465        goto gen_arith;
2466    OP_32_64(sub):
2467        c = ARITH_SUB;
2468        goto gen_arith;
2469    OP_32_64(and):
2470        c = ARITH_AND;
2471        goto gen_arith;
2472    OP_32_64(or):
2473        c = ARITH_OR;
2474        goto gen_arith;
2475    OP_32_64(xor):
2476        c = ARITH_XOR;
2477        goto gen_arith;
2478    gen_arith:
2479        if (const_a2) {
2480            tgen_arithi(s, c + rexw, a0, a2, 0);
2481        } else {
2482            tgen_arithr(s, c + rexw, a0, a2);
2483        }
2484        break;
2485
2486    OP_32_64(andc):
2487        if (const_a2) {
2488            tcg_out_mov(s, rexw ? TCG_TYPE_I64 : TCG_TYPE_I32, a0, a1);
2489            tgen_arithi(s, ARITH_AND + rexw, a0, ~a2, 0);
2490        } else {
2491            tcg_out_vex_modrm(s, OPC_ANDN + rexw, a0, a2, a1);
2492        }
2493        break;
2494
2495    OP_32_64(mul):
2496        if (const_a2) {
2497            int32_t val;
2498            val = a2;
2499            if (val == (int8_t)val) {
2500                tcg_out_modrm(s, OPC_IMUL_GvEvIb + rexw, a0, a0);
2501                tcg_out8(s, val);
2502            } else {
2503                tcg_out_modrm(s, OPC_IMUL_GvEvIz + rexw, a0, a0);
2504                tcg_out32(s, val);
2505            }
2506        } else {
2507            tcg_out_modrm(s, OPC_IMUL_GvEv + rexw, a0, a2);
2508        }
2509        break;
2510
2511    OP_32_64(div2):
2512        tcg_out_modrm(s, OPC_GRP3_Ev + rexw, EXT3_IDIV, args[4]);
2513        break;
2514    OP_32_64(divu2):
2515        tcg_out_modrm(s, OPC_GRP3_Ev + rexw, EXT3_DIV, args[4]);
2516        break;
2517
2518    OP_32_64(shl):
2519        /* For small constant 3-operand shift, use LEA.  */
2520        if (const_a2 && a0 != a1 && (a2 - 1) < 3) {
2521            if (a2 - 1 == 0) {
2522                /* shl $1,a1,a0 -> lea (a1,a1),a0 */
2523                tcg_out_modrm_sib_offset(s, OPC_LEA + rexw, a0, a1, a1, 0, 0);
2524            } else {
2525                /* shl $n,a1,a0 -> lea 0(,a1,n),a0 */
2526                tcg_out_modrm_sib_offset(s, OPC_LEA + rexw, a0, -1, a1, a2, 0);
2527            }
2528            break;
2529        }
2530        c = SHIFT_SHL;
2531        vexop = OPC_SHLX;
2532        goto gen_shift_maybe_vex;
2533    OP_32_64(shr):
2534        c = SHIFT_SHR;
2535        vexop = OPC_SHRX;
2536        goto gen_shift_maybe_vex;
2537    OP_32_64(sar):
2538        c = SHIFT_SAR;
2539        vexop = OPC_SARX;
2540        goto gen_shift_maybe_vex;
2541    OP_32_64(rotl):
2542        c = SHIFT_ROL;
2543        goto gen_shift;
2544    OP_32_64(rotr):
2545        c = SHIFT_ROR;
2546        goto gen_shift;
2547    gen_shift_maybe_vex:
2548        if (have_bmi2) {
2549            if (!const_a2) {
2550                tcg_out_vex_modrm(s, vexop + rexw, a0, a2, a1);
2551                break;
2552            }
2553            tcg_out_mov(s, rexw ? TCG_TYPE_I64 : TCG_TYPE_I32, a0, a1);
2554        }
2555        /* FALLTHRU */
2556    gen_shift:
2557        if (const_a2) {
2558            tcg_out_shifti(s, c + rexw, a0, a2);
2559        } else {
2560            tcg_out_modrm(s, OPC_SHIFT_cl + rexw, c, a0);
2561        }
2562        break;
2563
2564    OP_32_64(ctz):
2565        tcg_out_ctz(s, rexw, args[0], args[1], args[2], const_args[2]);
2566        break;
2567    OP_32_64(clz):
2568        tcg_out_clz(s, rexw, args[0], args[1], args[2], const_args[2]);
2569        break;
2570    OP_32_64(ctpop):
2571        tcg_out_modrm(s, OPC_POPCNT + rexw, a0, a1);
2572        break;
2573
2574    case INDEX_op_brcond_i32:
2575        tcg_out_brcond32(s, a2, a0, a1, const_args[1], arg_label(args[3]), 0);
2576        break;
2577    case INDEX_op_setcond_i32:
2578        tcg_out_setcond32(s, args[3], a0, a1, a2, const_a2);
2579        break;
2580    case INDEX_op_movcond_i32:
2581        tcg_out_movcond32(s, args[5], a0, a1, a2, const_a2, args[3]);
2582        break;
2583
2584    OP_32_64(bswap16):
2585        if (a2 & TCG_BSWAP_OS) {
2586            /* Output must be sign-extended. */
2587            if (rexw) {
2588                tcg_out_bswap64(s, a0);
2589                tcg_out_shifti(s, SHIFT_SAR + rexw, a0, 48);
2590            } else {
2591                tcg_out_bswap32(s, a0);
2592                tcg_out_shifti(s, SHIFT_SAR, a0, 16);
2593            }
2594        } else if ((a2 & (TCG_BSWAP_IZ | TCG_BSWAP_OZ)) == TCG_BSWAP_OZ) {
2595            /* Output must be zero-extended, but input isn't. */
2596            tcg_out_bswap32(s, a0);
2597            tcg_out_shifti(s, SHIFT_SHR, a0, 16);
2598        } else {
2599            tcg_out_rolw_8(s, a0);
2600        }
2601        break;
2602    OP_32_64(bswap32):
2603        tcg_out_bswap32(s, a0);
2604        if (rexw && (a2 & TCG_BSWAP_OS)) {
2605            tcg_out_ext32s(s, a0, a0);
2606        }
2607        break;
2608
2609    OP_32_64(neg):
2610        tcg_out_modrm(s, OPC_GRP3_Ev + rexw, EXT3_NEG, a0);
2611        break;
2612    OP_32_64(not):
2613        tcg_out_modrm(s, OPC_GRP3_Ev + rexw, EXT3_NOT, a0);
2614        break;
2615
2616    case INDEX_op_qemu_ld_a64_i32:
2617        if (TCG_TARGET_REG_BITS == 32) {
2618            tcg_out_qemu_ld(s, a0, -1, a1, a2, args[3], TCG_TYPE_I32);
2619            break;
2620        }
2621        /* fall through */
2622    case INDEX_op_qemu_ld_a32_i32:
2623        tcg_out_qemu_ld(s, a0, -1, a1, -1, a2, TCG_TYPE_I32);
2624        break;
2625    case INDEX_op_qemu_ld_a32_i64:
2626        if (TCG_TARGET_REG_BITS == 64) {
2627            tcg_out_qemu_ld(s, a0, -1, a1, -1, a2, TCG_TYPE_I64);
2628        } else {
2629            tcg_out_qemu_ld(s, a0, a1, a2, -1, args[3], TCG_TYPE_I64);
2630        }
2631        break;
2632    case INDEX_op_qemu_ld_a64_i64:
2633        if (TCG_TARGET_REG_BITS == 64) {
2634            tcg_out_qemu_ld(s, a0, -1, a1, -1, a2, TCG_TYPE_I64);
2635        } else {
2636            tcg_out_qemu_ld(s, a0, a1, a2, args[3], args[4], TCG_TYPE_I64);
2637        }
2638        break;
2639    case INDEX_op_qemu_ld_a32_i128:
2640    case INDEX_op_qemu_ld_a64_i128:
2641        tcg_debug_assert(TCG_TARGET_REG_BITS == 64);
2642        tcg_out_qemu_ld(s, a0, a1, a2, -1, args[3], TCG_TYPE_I128);
2643        break;
2644
2645    case INDEX_op_qemu_st_a64_i32:
2646    case INDEX_op_qemu_st8_a64_i32:
2647        if (TCG_TARGET_REG_BITS == 32) {
2648            tcg_out_qemu_st(s, a0, -1, a1, a2, args[3], TCG_TYPE_I32);
2649            break;
2650        }
2651        /* fall through */
2652    case INDEX_op_qemu_st_a32_i32:
2653    case INDEX_op_qemu_st8_a32_i32:
2654        tcg_out_qemu_st(s, a0, -1, a1, -1, a2, TCG_TYPE_I32);
2655        break;
2656    case INDEX_op_qemu_st_a32_i64:
2657        if (TCG_TARGET_REG_BITS == 64) {
2658            tcg_out_qemu_st(s, a0, -1, a1, -1, a2, TCG_TYPE_I64);
2659        } else {
2660            tcg_out_qemu_st(s, a0, a1, a2, -1, args[3], TCG_TYPE_I64);
2661        }
2662        break;
2663    case INDEX_op_qemu_st_a64_i64:
2664        if (TCG_TARGET_REG_BITS == 64) {
2665            tcg_out_qemu_st(s, a0, -1, a1, -1, a2, TCG_TYPE_I64);
2666        } else {
2667            tcg_out_qemu_st(s, a0, a1, a2, args[3], args[4], TCG_TYPE_I64);
2668        }
2669        break;
2670    case INDEX_op_qemu_st_a32_i128:
2671    case INDEX_op_qemu_st_a64_i128:
2672        tcg_debug_assert(TCG_TARGET_REG_BITS == 64);
2673        tcg_out_qemu_st(s, a0, a1, a2, -1, args[3], TCG_TYPE_I128);
2674        break;
2675
2676    OP_32_64(mulu2):
2677        tcg_out_modrm(s, OPC_GRP3_Ev + rexw, EXT3_MUL, args[3]);
2678        break;
2679    OP_32_64(muls2):
2680        tcg_out_modrm(s, OPC_GRP3_Ev + rexw, EXT3_IMUL, args[3]);
2681        break;
2682    OP_32_64(add2):
2683        if (const_args[4]) {
2684            tgen_arithi(s, ARITH_ADD + rexw, a0, args[4], 1);
2685        } else {
2686            tgen_arithr(s, ARITH_ADD + rexw, a0, args[4]);
2687        }
2688        if (const_args[5]) {
2689            tgen_arithi(s, ARITH_ADC + rexw, a1, args[5], 1);
2690        } else {
2691            tgen_arithr(s, ARITH_ADC + rexw, a1, args[5]);
2692        }
2693        break;
2694    OP_32_64(sub2):
2695        if (const_args[4]) {
2696            tgen_arithi(s, ARITH_SUB + rexw, a0, args[4], 1);
2697        } else {
2698            tgen_arithr(s, ARITH_SUB + rexw, a0, args[4]);
2699        }
2700        if (const_args[5]) {
2701            tgen_arithi(s, ARITH_SBB + rexw, a1, args[5], 1);
2702        } else {
2703            tgen_arithr(s, ARITH_SBB + rexw, a1, args[5]);
2704        }
2705        break;
2706
2707#if TCG_TARGET_REG_BITS == 32
2708    case INDEX_op_brcond2_i32:
2709        tcg_out_brcond2(s, args, const_args, 0);
2710        break;
2711    case INDEX_op_setcond2_i32:
2712        tcg_out_setcond2(s, args, const_args);
2713        break;
2714#else /* TCG_TARGET_REG_BITS == 64 */
2715    case INDEX_op_ld32s_i64:
2716        tcg_out_modrm_offset(s, OPC_MOVSLQ, a0, a1, a2);
2717        break;
2718    case INDEX_op_ld_i64:
2719        tcg_out_ld(s, TCG_TYPE_I64, a0, a1, a2);
2720        break;
2721    case INDEX_op_st_i64:
2722        if (const_args[0]) {
2723            tcg_out_modrm_offset(s, OPC_MOVL_EvIz | P_REXW, 0, a1, a2);
2724            tcg_out32(s, a0);
2725        } else {
2726            tcg_out_st(s, TCG_TYPE_I64, a0, a1, a2);
2727        }
2728        break;
2729
2730    case INDEX_op_brcond_i64:
2731        tcg_out_brcond64(s, a2, a0, a1, const_args[1], arg_label(args[3]), 0);
2732        break;
2733    case INDEX_op_setcond_i64:
2734        tcg_out_setcond64(s, args[3], a0, a1, a2, const_a2);
2735        break;
2736    case INDEX_op_movcond_i64:
2737        tcg_out_movcond64(s, args[5], a0, a1, a2, const_a2, args[3]);
2738        break;
2739
2740    case INDEX_op_bswap64_i64:
2741        tcg_out_bswap64(s, a0);
2742        break;
2743    case INDEX_op_extrh_i64_i32:
2744        tcg_out_shifti(s, SHIFT_SHR + P_REXW, a0, 32);
2745        break;
2746#endif
2747
2748    OP_32_64(deposit):
2749        if (args[3] == 0 && args[4] == 8) {
2750            /* load bits 0..7 */
2751            tcg_out_modrm(s, OPC_MOVB_EvGv | P_REXB_R | P_REXB_RM, a2, a0);
2752        } else if (args[3] == 8 && args[4] == 8) {
2753            /* load bits 8..15 */
2754            tcg_out_modrm(s, OPC_MOVB_EvGv, a2, a0 + 4);
2755        } else if (args[3] == 0 && args[4] == 16) {
2756            /* load bits 0..15 */
2757            tcg_out_modrm(s, OPC_MOVL_EvGv | P_DATA16, a2, a0);
2758        } else {
2759            g_assert_not_reached();
2760        }
2761        break;
2762
2763    case INDEX_op_extract_i64:
2764        if (a2 + args[3] == 32) {
2765            /* This is a 32-bit zero-extending right shift.  */
2766            tcg_out_mov(s, TCG_TYPE_I32, a0, a1);
2767            tcg_out_shifti(s, SHIFT_SHR, a0, a2);
2768            break;
2769        }
2770        /* FALLTHRU */
2771    case INDEX_op_extract_i32:
2772        /* On the off-chance that we can use the high-byte registers, do so.
2773           Otherwise we emit the same ext16 + shift pattern that we
2774           would have gotten from the normal tcg-op.c expansion.  */
2775        tcg_debug_assert(a2 == 8 && args[3] == 8);
2776        if (a1 < 4 && a0 < 8) {
2777            tcg_out_modrm(s, OPC_MOVZBL, a0, a1 + 4);
2778        } else {
2779            tcg_out_ext16u(s, a0, a1);
2780            tcg_out_shifti(s, SHIFT_SHR, a0, 8);
2781        }
2782        break;
2783
2784    case INDEX_op_sextract_i32:
2785        /* We don't implement sextract_i64, as we cannot sign-extend to
2786           64-bits without using the REX prefix that explicitly excludes
2787           access to the high-byte registers.  */
2788        tcg_debug_assert(a2 == 8 && args[3] == 8);
2789        if (a1 < 4 && a0 < 8) {
2790            tcg_out_modrm(s, OPC_MOVSBL, a0, a1 + 4);
2791        } else {
2792            tcg_out_ext16s(s, TCG_TYPE_I32, a0, a1);
2793            tcg_out_shifti(s, SHIFT_SAR, a0, 8);
2794        }
2795        break;
2796
2797    OP_32_64(extract2):
2798        /* Note that SHRD outputs to the r/m operand.  */
2799        tcg_out_modrm(s, OPC_SHRD_Ib + rexw, a2, a0);
2800        tcg_out8(s, args[3]);
2801        break;
2802
2803    case INDEX_op_mb:
2804        tcg_out_mb(s, a0);
2805        break;
2806    case INDEX_op_mov_i32:  /* Always emitted via tcg_out_mov.  */
2807    case INDEX_op_mov_i64:
2808    case INDEX_op_call:     /* Always emitted via tcg_out_call.  */
2809    case INDEX_op_exit_tb:  /* Always emitted via tcg_out_exit_tb.  */
2810    case INDEX_op_goto_tb:  /* Always emitted via tcg_out_goto_tb.  */
2811    case INDEX_op_ext8s_i32:  /* Always emitted via tcg_reg_alloc_op.  */
2812    case INDEX_op_ext8s_i64:
2813    case INDEX_op_ext8u_i32:
2814    case INDEX_op_ext8u_i64:
2815    case INDEX_op_ext16s_i32:
2816    case INDEX_op_ext16s_i64:
2817    case INDEX_op_ext16u_i32:
2818    case INDEX_op_ext16u_i64:
2819    case INDEX_op_ext32s_i64:
2820    case INDEX_op_ext32u_i64:
2821    case INDEX_op_ext_i32_i64:
2822    case INDEX_op_extu_i32_i64:
2823    case INDEX_op_extrl_i64_i32:
2824    default:
2825        g_assert_not_reached();
2826    }
2827
2828#undef OP_32_64
2829}
2830
2831static void tcg_out_vec_op(TCGContext *s, TCGOpcode opc,
2832                           unsigned vecl, unsigned vece,
2833                           const TCGArg args[TCG_MAX_OP_ARGS],
2834                           const int const_args[TCG_MAX_OP_ARGS])
2835{
2836    static int const add_insn[4] = {
2837        OPC_PADDB, OPC_PADDW, OPC_PADDD, OPC_PADDQ
2838    };
2839    static int const ssadd_insn[4] = {
2840        OPC_PADDSB, OPC_PADDSW, OPC_UD2, OPC_UD2
2841    };
2842    static int const usadd_insn[4] = {
2843        OPC_PADDUB, OPC_PADDUW, OPC_UD2, OPC_UD2
2844    };
2845    static int const sub_insn[4] = {
2846        OPC_PSUBB, OPC_PSUBW, OPC_PSUBD, OPC_PSUBQ
2847    };
2848    static int const sssub_insn[4] = {
2849        OPC_PSUBSB, OPC_PSUBSW, OPC_UD2, OPC_UD2
2850    };
2851    static int const ussub_insn[4] = {
2852        OPC_PSUBUB, OPC_PSUBUW, OPC_UD2, OPC_UD2
2853    };
2854    static int const mul_insn[4] = {
2855        OPC_UD2, OPC_PMULLW, OPC_PMULLD, OPC_VPMULLQ
2856    };
2857    static int const shift_imm_insn[4] = {
2858        OPC_UD2, OPC_PSHIFTW_Ib, OPC_PSHIFTD_Ib, OPC_PSHIFTQ_Ib
2859    };
2860    static int const cmpeq_insn[4] = {
2861        OPC_PCMPEQB, OPC_PCMPEQW, OPC_PCMPEQD, OPC_PCMPEQQ
2862    };
2863    static int const cmpgt_insn[4] = {
2864        OPC_PCMPGTB, OPC_PCMPGTW, OPC_PCMPGTD, OPC_PCMPGTQ
2865    };
2866    static int const punpckl_insn[4] = {
2867        OPC_PUNPCKLBW, OPC_PUNPCKLWD, OPC_PUNPCKLDQ, OPC_PUNPCKLQDQ
2868    };
2869    static int const punpckh_insn[4] = {
2870        OPC_PUNPCKHBW, OPC_PUNPCKHWD, OPC_PUNPCKHDQ, OPC_PUNPCKHQDQ
2871    };
2872    static int const packss_insn[4] = {
2873        OPC_PACKSSWB, OPC_PACKSSDW, OPC_UD2, OPC_UD2
2874    };
2875    static int const packus_insn[4] = {
2876        OPC_PACKUSWB, OPC_PACKUSDW, OPC_UD2, OPC_UD2
2877    };
2878    static int const smin_insn[4] = {
2879        OPC_PMINSB, OPC_PMINSW, OPC_PMINSD, OPC_VPMINSQ
2880    };
2881    static int const smax_insn[4] = {
2882        OPC_PMAXSB, OPC_PMAXSW, OPC_PMAXSD, OPC_VPMAXSQ
2883    };
2884    static int const umin_insn[4] = {
2885        OPC_PMINUB, OPC_PMINUW, OPC_PMINUD, OPC_VPMINUQ
2886    };
2887    static int const umax_insn[4] = {
2888        OPC_PMAXUB, OPC_PMAXUW, OPC_PMAXUD, OPC_VPMAXUQ
2889    };
2890    static int const rotlv_insn[4] = {
2891        OPC_UD2, OPC_UD2, OPC_VPROLVD, OPC_VPROLVQ
2892    };
2893    static int const rotrv_insn[4] = {
2894        OPC_UD2, OPC_UD2, OPC_VPRORVD, OPC_VPRORVQ
2895    };
2896    static int const shlv_insn[4] = {
2897        OPC_UD2, OPC_VPSLLVW, OPC_VPSLLVD, OPC_VPSLLVQ
2898    };
2899    static int const shrv_insn[4] = {
2900        OPC_UD2, OPC_VPSRLVW, OPC_VPSRLVD, OPC_VPSRLVQ
2901    };
2902    static int const sarv_insn[4] = {
2903        OPC_UD2, OPC_VPSRAVW, OPC_VPSRAVD, OPC_VPSRAVQ
2904    };
2905    static int const shls_insn[4] = {
2906        OPC_UD2, OPC_PSLLW, OPC_PSLLD, OPC_PSLLQ
2907    };
2908    static int const shrs_insn[4] = {
2909        OPC_UD2, OPC_PSRLW, OPC_PSRLD, OPC_PSRLQ
2910    };
2911    static int const sars_insn[4] = {
2912        OPC_UD2, OPC_PSRAW, OPC_PSRAD, OPC_VPSRAQ
2913    };
2914    static int const vpshldi_insn[4] = {
2915        OPC_UD2, OPC_VPSHLDW, OPC_VPSHLDD, OPC_VPSHLDQ
2916    };
2917    static int const vpshldv_insn[4] = {
2918        OPC_UD2, OPC_VPSHLDVW, OPC_VPSHLDVD, OPC_VPSHLDVQ
2919    };
2920    static int const vpshrdv_insn[4] = {
2921        OPC_UD2, OPC_VPSHRDVW, OPC_VPSHRDVD, OPC_VPSHRDVQ
2922    };
2923    static int const abs_insn[4] = {
2924        OPC_PABSB, OPC_PABSW, OPC_PABSD, OPC_VPABSQ
2925    };
2926
2927    TCGType type = vecl + TCG_TYPE_V64;
2928    int insn, sub;
2929    TCGArg a0, a1, a2, a3;
2930
2931    a0 = args[0];
2932    a1 = args[1];
2933    a2 = args[2];
2934
2935    switch (opc) {
2936    case INDEX_op_add_vec:
2937        insn = add_insn[vece];
2938        goto gen_simd;
2939    case INDEX_op_ssadd_vec:
2940        insn = ssadd_insn[vece];
2941        goto gen_simd;
2942    case INDEX_op_usadd_vec:
2943        insn = usadd_insn[vece];
2944        goto gen_simd;
2945    case INDEX_op_sub_vec:
2946        insn = sub_insn[vece];
2947        goto gen_simd;
2948    case INDEX_op_sssub_vec:
2949        insn = sssub_insn[vece];
2950        goto gen_simd;
2951    case INDEX_op_ussub_vec:
2952        insn = ussub_insn[vece];
2953        goto gen_simd;
2954    case INDEX_op_mul_vec:
2955        insn = mul_insn[vece];
2956        goto gen_simd;
2957    case INDEX_op_and_vec:
2958        insn = OPC_PAND;
2959        goto gen_simd;
2960    case INDEX_op_or_vec:
2961        insn = OPC_POR;
2962        goto gen_simd;
2963    case INDEX_op_xor_vec:
2964        insn = OPC_PXOR;
2965        goto gen_simd;
2966    case INDEX_op_smin_vec:
2967        insn = smin_insn[vece];
2968        goto gen_simd;
2969    case INDEX_op_umin_vec:
2970        insn = umin_insn[vece];
2971        goto gen_simd;
2972    case INDEX_op_smax_vec:
2973        insn = smax_insn[vece];
2974        goto gen_simd;
2975    case INDEX_op_umax_vec:
2976        insn = umax_insn[vece];
2977        goto gen_simd;
2978    case INDEX_op_shlv_vec:
2979        insn = shlv_insn[vece];
2980        goto gen_simd;
2981    case INDEX_op_shrv_vec:
2982        insn = shrv_insn[vece];
2983        goto gen_simd;
2984    case INDEX_op_sarv_vec:
2985        insn = sarv_insn[vece];
2986        goto gen_simd;
2987    case INDEX_op_rotlv_vec:
2988        insn = rotlv_insn[vece];
2989        goto gen_simd;
2990    case INDEX_op_rotrv_vec:
2991        insn = rotrv_insn[vece];
2992        goto gen_simd;
2993    case INDEX_op_shls_vec:
2994        insn = shls_insn[vece];
2995        goto gen_simd;
2996    case INDEX_op_shrs_vec:
2997        insn = shrs_insn[vece];
2998        goto gen_simd;
2999    case INDEX_op_sars_vec:
3000        insn = sars_insn[vece];
3001        goto gen_simd;
3002    case INDEX_op_x86_punpckl_vec:
3003        insn = punpckl_insn[vece];
3004        goto gen_simd;
3005    case INDEX_op_x86_punpckh_vec:
3006        insn = punpckh_insn[vece];
3007        goto gen_simd;
3008    case INDEX_op_x86_packss_vec:
3009        insn = packss_insn[vece];
3010        goto gen_simd;
3011    case INDEX_op_x86_packus_vec:
3012        insn = packus_insn[vece];
3013        goto gen_simd;
3014    case INDEX_op_x86_vpshldv_vec:
3015        insn = vpshldv_insn[vece];
3016        a1 = a2;
3017        a2 = args[3];
3018        goto gen_simd;
3019    case INDEX_op_x86_vpshrdv_vec:
3020        insn = vpshrdv_insn[vece];
3021        a1 = a2;
3022        a2 = args[3];
3023        goto gen_simd;
3024#if TCG_TARGET_REG_BITS == 32
3025    case INDEX_op_dup2_vec:
3026        /* First merge the two 32-bit inputs to a single 64-bit element. */
3027        tcg_out_vex_modrm(s, OPC_PUNPCKLDQ, a0, a1, a2);
3028        /* Then replicate the 64-bit elements across the rest of the vector. */
3029        if (type != TCG_TYPE_V64) {
3030            tcg_out_dup_vec(s, type, MO_64, a0, a0);
3031        }
3032        break;
3033#endif
3034    case INDEX_op_abs_vec:
3035        insn = abs_insn[vece];
3036        a2 = a1;
3037        a1 = 0;
3038        goto gen_simd;
3039    gen_simd:
3040        tcg_debug_assert(insn != OPC_UD2);
3041        if (type == TCG_TYPE_V256) {
3042            insn |= P_VEXL;
3043        }
3044        tcg_out_vex_modrm(s, insn, a0, a1, a2);
3045        break;
3046
3047    case INDEX_op_cmp_vec:
3048        sub = args[3];
3049        if (sub == TCG_COND_EQ) {
3050            insn = cmpeq_insn[vece];
3051        } else if (sub == TCG_COND_GT) {
3052            insn = cmpgt_insn[vece];
3053        } else {
3054            g_assert_not_reached();
3055        }
3056        goto gen_simd;
3057
3058    case INDEX_op_andc_vec:
3059        insn = OPC_PANDN;
3060        if (type == TCG_TYPE_V256) {
3061            insn |= P_VEXL;
3062        }
3063        tcg_out_vex_modrm(s, insn, a0, a2, a1);
3064        break;
3065
3066    case INDEX_op_shli_vec:
3067        insn = shift_imm_insn[vece];
3068        sub = 6;
3069        goto gen_shift;
3070    case INDEX_op_shri_vec:
3071        insn = shift_imm_insn[vece];
3072        sub = 2;
3073        goto gen_shift;
3074    case INDEX_op_sari_vec:
3075        if (vece == MO_64) {
3076            insn = OPC_PSHIFTD_Ib | P_VEXW | P_EVEX;
3077        } else {
3078            insn = shift_imm_insn[vece];
3079        }
3080        sub = 4;
3081        goto gen_shift;
3082    case INDEX_op_rotli_vec:
3083        insn = OPC_PSHIFTD_Ib | P_EVEX;  /* VPROL[DQ] */
3084        if (vece == MO_64) {
3085            insn |= P_VEXW;
3086        }
3087        sub = 1;
3088        goto gen_shift;
3089    gen_shift:
3090        tcg_debug_assert(vece != MO_8);
3091        if (type == TCG_TYPE_V256) {
3092            insn |= P_VEXL;
3093        }
3094        tcg_out_vex_modrm(s, insn, sub, a0, a1);
3095        tcg_out8(s, a2);
3096        break;
3097
3098    case INDEX_op_ld_vec:
3099        tcg_out_ld(s, type, a0, a1, a2);
3100        break;
3101    case INDEX_op_st_vec:
3102        tcg_out_st(s, type, a0, a1, a2);
3103        break;
3104    case INDEX_op_dupm_vec:
3105        tcg_out_dupm_vec(s, type, vece, a0, a1, a2);
3106        break;
3107
3108    case INDEX_op_x86_shufps_vec:
3109        insn = OPC_SHUFPS;
3110        sub = args[3];
3111        goto gen_simd_imm8;
3112    case INDEX_op_x86_blend_vec:
3113        if (vece == MO_16) {
3114            insn = OPC_PBLENDW;
3115        } else if (vece == MO_32) {
3116            insn = (have_avx2 ? OPC_VPBLENDD : OPC_BLENDPS);
3117        } else {
3118            g_assert_not_reached();
3119        }
3120        sub = args[3];
3121        goto gen_simd_imm8;
3122    case INDEX_op_x86_vperm2i128_vec:
3123        insn = OPC_VPERM2I128;
3124        sub = args[3];
3125        goto gen_simd_imm8;
3126    case INDEX_op_x86_vpshldi_vec:
3127        insn = vpshldi_insn[vece];
3128        sub = args[3];
3129        goto gen_simd_imm8;
3130
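    /*
     * The VPTERNLOGQ immediate is an 8-entry truth table indexed by the
     * bit triple (dest, vvvv, rm): with A = 0xf0, B = 0xcc and C = 0xaa
     * as the identity patterns, ~B = 0x33, ~(B|C) = 0x11, ~(B&C) = 0x77,
     * ~(B^C) = 0x99 and B|~C = 0xdd.  The bitsel constants below follow
     * from the same table once the operands have been shuffled into place.
     */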
3131    case INDEX_op_not_vec:
3132        insn = OPC_VPTERNLOGQ;
3133        a2 = a1;
3134        sub = 0x33; /* !B */
3135        goto gen_simd_imm8;
3136    case INDEX_op_nor_vec:
3137        insn = OPC_VPTERNLOGQ;
3138        sub = 0x11; /* norCB */
3139        goto gen_simd_imm8;
3140    case INDEX_op_nand_vec:
3141        insn = OPC_VPTERNLOGQ;
3142        sub = 0x77; /* nandCB */
3143        goto gen_simd_imm8;
3144    case INDEX_op_eqv_vec:
3145        insn = OPC_VPTERNLOGQ;
3146        sub = 0x99; /* xnorCB */
3147        goto gen_simd_imm8;
3148    case INDEX_op_orc_vec:
3149        insn = OPC_VPTERNLOGQ;
3150        sub = 0xdd; /* orB!C */
3151        goto gen_simd_imm8;
3152
3153    case INDEX_op_bitsel_vec:
3154        insn = OPC_VPTERNLOGQ;
3155        a3 = args[3];
3156        if (a0 == a1) {
3157            a1 = a2;
3158            a2 = a3;
3159            sub = 0xca; /* A?B:C */
3160        } else if (a0 == a2) {
3161            a2 = a3;
3162            sub = 0xe2; /* B?A:C */
3163        } else {
3164            tcg_out_mov(s, type, a0, a3);
3165            sub = 0xb8; /* B?C:A */
3166        }
3167        goto gen_simd_imm8;
3168
3169    gen_simd_imm8:
3170        tcg_debug_assert(insn != OPC_UD2);
3171        if (type == TCG_TYPE_V256) {
3172            insn |= P_VEXL;
3173        }
3174        tcg_out_vex_modrm(s, insn, a0, a1, a2);
3175        tcg_out8(s, sub);
3176        break;
3177
3178    case INDEX_op_x86_vpblendvb_vec:
3179        insn = OPC_VPBLENDVB;
3180        if (type == TCG_TYPE_V256) {
3181            insn |= P_VEXL;
3182        }
3183        tcg_out_vex_modrm(s, insn, a0, a1, a2);
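        /*
         * The mask register (args[3]) is encoded in bits 7:4 of the
         * trailing immediate, the SSE4.1/AVX "is4" operand form.
         */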
3184        tcg_out8(s, args[3] << 4);
3185        break;
3186
3187    case INDEX_op_x86_psrldq_vec:
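        /*
         * Group 14 (0f 73) with /3 is PSRLDQ, which shifts each 128-bit
         * lane right by a2 bytes.
         */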
3188        tcg_out_vex_modrm(s, OPC_GRP14, 3, a0, a1);
3189        tcg_out8(s, a2);
3190        break;
3191
3192    case INDEX_op_mov_vec:  /* Always emitted via tcg_out_mov.  */
3193    case INDEX_op_dup_vec:  /* Always emitted via tcg_out_dup_vec.  */
3194    default:
3195        g_assert_not_reached();
3196    }
3197}
3198
3199static TCGConstraintSetIndex tcg_target_op_def(TCGOpcode op)
3200{
3201    switch (op) {
3202    case INDEX_op_goto_ptr:
3203        return C_O0_I1(r);
3204
3205    case INDEX_op_ld8u_i32:
3206    case INDEX_op_ld8u_i64:
3207    case INDEX_op_ld8s_i32:
3208    case INDEX_op_ld8s_i64:
3209    case INDEX_op_ld16u_i32:
3210    case INDEX_op_ld16u_i64:
3211    case INDEX_op_ld16s_i32:
3212    case INDEX_op_ld16s_i64:
3213    case INDEX_op_ld_i32:
3214    case INDEX_op_ld32u_i64:
3215    case INDEX_op_ld32s_i64:
3216    case INDEX_op_ld_i64:
3217        return C_O1_I1(r, r);
3218
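    /*
     * Byte stores need a register with an addressable low byte ('q');
     * on i686 only %eax, %ebx, %ecx and %edx qualify.
     */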
3219    case INDEX_op_st8_i32:
3220    case INDEX_op_st8_i64:
3221        return C_O0_I2(qi, r);
3222
3223    case INDEX_op_st16_i32:
3224    case INDEX_op_st16_i64:
3225    case INDEX_op_st_i32:
3226    case INDEX_op_st32_i64:
3227        return C_O0_I2(ri, r);
3228
3229    case INDEX_op_st_i64:
3230        return C_O0_I2(re, r);
3231
3232    case INDEX_op_add_i32:
3233    case INDEX_op_add_i64:
3234        return C_O1_I2(r, r, re);
3235
3236    case INDEX_op_sub_i32:
3237    case INDEX_op_sub_i64:
3238    case INDEX_op_mul_i32:
3239    case INDEX_op_mul_i64:
3240    case INDEX_op_or_i32:
3241    case INDEX_op_or_i64:
3242    case INDEX_op_xor_i32:
3243    case INDEX_op_xor_i64:
3244        return C_O1_I2(r, 0, re);
3245
3246    case INDEX_op_and_i32:
3247    case INDEX_op_and_i64:
3248        return C_O1_I2(r, 0, reZ);
3249
3250    case INDEX_op_andc_i32:
3251    case INDEX_op_andc_i64:
3252        return C_O1_I2(r, r, rI);
3253
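    /*
     * With BMI2, SHLX/SHRX/SARX are non-destructive and take the shift
     * count in any register; otherwise the count must be an immediate
     * or live in %cl, and the output is tied to the first input.
     */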
3254    case INDEX_op_shl_i32:
3255    case INDEX_op_shl_i64:
3256    case INDEX_op_shr_i32:
3257    case INDEX_op_shr_i64:
3258    case INDEX_op_sar_i32:
3259    case INDEX_op_sar_i64:
3260        return have_bmi2 ? C_O1_I2(r, r, ri) : C_O1_I2(r, 0, ci);
3261
3262    case INDEX_op_rotl_i32:
3263    case INDEX_op_rotl_i64:
3264    case INDEX_op_rotr_i32:
3265    case INDEX_op_rotr_i64:
3266        return C_O1_I2(r, 0, ci);
3267
3268    case INDEX_op_brcond_i32:
3269    case INDEX_op_brcond_i64:
3270        return C_O0_I2(r, re);
3271
3272    case INDEX_op_bswap16_i32:
3273    case INDEX_op_bswap16_i64:
3274    case INDEX_op_bswap32_i32:
3275    case INDEX_op_bswap32_i64:
3276    case INDEX_op_bswap64_i64:
3277    case INDEX_op_neg_i32:
3278    case INDEX_op_neg_i64:
3279    case INDEX_op_not_i32:
3280    case INDEX_op_not_i64:
3281    case INDEX_op_extrh_i64_i32:
3282        return C_O1_I1(r, 0);
3283
3284    case INDEX_op_ext8s_i32:
3285    case INDEX_op_ext8s_i64:
3286    case INDEX_op_ext8u_i32:
3287    case INDEX_op_ext8u_i64:
3288        return C_O1_I1(r, q);
3289
3290    case INDEX_op_ext16s_i32:
3291    case INDEX_op_ext16s_i64:
3292    case INDEX_op_ext16u_i32:
3293    case INDEX_op_ext16u_i64:
3294    case INDEX_op_ext32s_i64:
3295    case INDEX_op_ext32u_i64:
3296    case INDEX_op_ext_i32_i64:
3297    case INDEX_op_extu_i32_i64:
3298    case INDEX_op_extrl_i64_i32:
3299    case INDEX_op_extract_i32:
3300    case INDEX_op_extract_i64:
3301    case INDEX_op_sextract_i32:
3302    case INDEX_op_ctpop_i32:
3303    case INDEX_op_ctpop_i64:
3304        return C_O1_I1(r, r);
3305
3306    case INDEX_op_extract2_i32:
3307    case INDEX_op_extract2_i64:
3308        return C_O1_I2(r, 0, r);
3309
3310    case INDEX_op_deposit_i32:
3311    case INDEX_op_deposit_i64:
3312        return C_O1_I2(Q, 0, Q);
3313
3314    case INDEX_op_setcond_i32:
3315    case INDEX_op_setcond_i64:
3316        return C_O1_I2(q, r, re);
3317
3318    case INDEX_op_movcond_i32:
3319    case INDEX_op_movcond_i64:
3320        return C_O1_I4(r, r, re, r, 0);
3321
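    /*
     * The one-operand DIV/IDIV and MUL/IMUL forms implicitly use
     * EDX:EAX ('a'/'d' constraints): division takes the dividend in
     * EDX:EAX and leaves quotient/remainder in EAX/EDX, while widening
     * multiplication reads EAX and writes the product to EDX:EAX.
     */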
3322    case INDEX_op_div2_i32:
3323    case INDEX_op_div2_i64:
3324    case INDEX_op_divu2_i32:
3325    case INDEX_op_divu2_i64:
3326        return C_O2_I3(a, d, 0, 1, r);
3327
3328    case INDEX_op_mulu2_i32:
3329    case INDEX_op_mulu2_i64:
3330    case INDEX_op_muls2_i32:
3331    case INDEX_op_muls2_i64:
3332        return C_O2_I2(a, d, a, r);
3333
3334    case INDEX_op_add2_i32:
3335    case INDEX_op_add2_i64:
3336    case INDEX_op_sub2_i32:
3337    case INDEX_op_sub2_i64:
3338        return C_O2_I4(r, r, 0, 1, re, re);
3339
3340    case INDEX_op_ctz_i32:
3341    case INDEX_op_ctz_i64:
3342        return have_bmi1 ? C_N1_I2(r, r, rW) : C_N1_I2(r, r, r);
3343
3344    case INDEX_op_clz_i32:
3345    case INDEX_op_clz_i64:
3346        return have_lzcnt ? C_N1_I2(r, r, rW) : C_N1_I2(r, r, r);
3347
3348    case INDEX_op_qemu_ld_a32_i32:
3349        return C_O1_I1(r, L);
3350    case INDEX_op_qemu_ld_a64_i32:
3351        return TCG_TARGET_REG_BITS == 64 ? C_O1_I1(r, L) : C_O1_I2(r, L, L);
3352
3353    case INDEX_op_qemu_st_a32_i32:
3354        return C_O0_I2(L, L);
3355    case INDEX_op_qemu_st_a64_i32:
3356        return TCG_TARGET_REG_BITS == 64 ? C_O0_I2(L, L) : C_O0_I3(L, L, L);
3357    case INDEX_op_qemu_st8_a32_i32:
3358        return C_O0_I2(s, L);
3359    case INDEX_op_qemu_st8_a64_i32:
3360        return TCG_TARGET_REG_BITS == 64 ? C_O0_I2(s, L) : C_O0_I3(s, L, L);
3361
3362    case INDEX_op_qemu_ld_a32_i64:
3363        return TCG_TARGET_REG_BITS == 64 ? C_O1_I1(r, L) : C_O2_I1(r, r, L);
3364    case INDEX_op_qemu_ld_a64_i64:
3365        return TCG_TARGET_REG_BITS == 64 ? C_O1_I1(r, L) : C_O2_I2(r, r, L, L);
3366
3367    case INDEX_op_qemu_st_a32_i64:
3368        return TCG_TARGET_REG_BITS == 64 ? C_O0_I2(L, L) : C_O0_I3(L, L, L);
3369    case INDEX_op_qemu_st_a64_i64:
3370        return TCG_TARGET_REG_BITS == 64 ? C_O0_I2(L, L) : C_O0_I4(L, L, L, L);
3371
3372    case INDEX_op_qemu_ld_a32_i128:
3373    case INDEX_op_qemu_ld_a64_i128:
3374        tcg_debug_assert(TCG_TARGET_REG_BITS == 64);
3375        return C_O2_I1(r, r, L);
3376    case INDEX_op_qemu_st_a32_i128:
3377    case INDEX_op_qemu_st_a64_i128:
3378        tcg_debug_assert(TCG_TARGET_REG_BITS == 64);
3379        return C_O0_I3(L, L, L);
3380
3381    case INDEX_op_brcond2_i32:
3382        return C_O0_I4(r, r, ri, ri);
3383
3384    case INDEX_op_setcond2_i32:
3385        return C_O1_I4(r, r, r, ri, ri);
3386
3387    case INDEX_op_ld_vec:
3388    case INDEX_op_dupm_vec:
3389        return C_O1_I1(x, r);
3390
3391    case INDEX_op_st_vec:
3392        return C_O0_I2(x, r);
3393
3394    case INDEX_op_add_vec:
3395    case INDEX_op_sub_vec:
3396    case INDEX_op_mul_vec:
3397    case INDEX_op_and_vec:
3398    case INDEX_op_or_vec:
3399    case INDEX_op_xor_vec:
3400    case INDEX_op_andc_vec:
3401    case INDEX_op_orc_vec:
3402    case INDEX_op_nand_vec:
3403    case INDEX_op_nor_vec:
3404    case INDEX_op_eqv_vec:
3405    case INDEX_op_ssadd_vec:
3406    case INDEX_op_usadd_vec:
3407    case INDEX_op_sssub_vec:
3408    case INDEX_op_ussub_vec:
3409    case INDEX_op_smin_vec:
3410    case INDEX_op_umin_vec:
3411    case INDEX_op_smax_vec:
3412    case INDEX_op_umax_vec:
3413    case INDEX_op_shlv_vec:
3414    case INDEX_op_shrv_vec:
3415    case INDEX_op_sarv_vec:
3416    case INDEX_op_rotlv_vec:
3417    case INDEX_op_rotrv_vec:
3418    case INDEX_op_shls_vec:
3419    case INDEX_op_shrs_vec:
3420    case INDEX_op_sars_vec:
3421    case INDEX_op_cmp_vec:
3422    case INDEX_op_x86_shufps_vec:
3423    case INDEX_op_x86_blend_vec:
3424    case INDEX_op_x86_packss_vec:
3425    case INDEX_op_x86_packus_vec:
3426    case INDEX_op_x86_vperm2i128_vec:
3427    case INDEX_op_x86_punpckl_vec:
3428    case INDEX_op_x86_punpckh_vec:
3429    case INDEX_op_x86_vpshldi_vec:
3430#if TCG_TARGET_REG_BITS == 32
3431    case INDEX_op_dup2_vec:
3432#endif
3433        return C_O1_I2(x, x, x);
3434
3435    case INDEX_op_abs_vec:
3436    case INDEX_op_dup_vec:
3437    case INDEX_op_not_vec:
3438    case INDEX_op_shli_vec:
3439    case INDEX_op_shri_vec:
3440    case INDEX_op_sari_vec:
3441    case INDEX_op_rotli_vec:
3442    case INDEX_op_x86_psrldq_vec:
3443        return C_O1_I1(x, x);
3444
3445    case INDEX_op_x86_vpshldv_vec:
3446    case INDEX_op_x86_vpshrdv_vec:
3447        return C_O1_I3(x, 0, x, x);
3448
3449    case INDEX_op_bitsel_vec:
3450    case INDEX_op_x86_vpblendvb_vec:
3451        return C_O1_I3(x, x, x, x);
3452
3453    default:
3454        g_assert_not_reached();
3455    }
3456}
3457
3458int tcg_can_emit_vec_op(TCGOpcode opc, TCGType type, unsigned vece)
3459{
3460    switch (opc) {
3461    case INDEX_op_add_vec:
3462    case INDEX_op_sub_vec:
3463    case INDEX_op_and_vec:
3464    case INDEX_op_or_vec:
3465    case INDEX_op_xor_vec:
3466    case INDEX_op_andc_vec:
3467    case INDEX_op_orc_vec:
3468    case INDEX_op_nand_vec:
3469    case INDEX_op_nor_vec:
3470    case INDEX_op_eqv_vec:
3471    case INDEX_op_not_vec:
3472    case INDEX_op_bitsel_vec:
3473        return 1;
3474    case INDEX_op_cmp_vec:
3475    case INDEX_op_cmpsel_vec:
3476        return -1;
3477
3478    case INDEX_op_rotli_vec:
3479        return have_avx512vl && vece >= MO_32 ? 1 : -1;
3480
3481    case INDEX_op_shli_vec:
3482    case INDEX_op_shri_vec:
3483        /* We must expand the operation for MO_8.  */
3484        return vece == MO_8 ? -1 : 1;
3485
3486    case INDEX_op_sari_vec:
3487        switch (vece) {
3488        case MO_8:
3489            return -1;
3490        case MO_16:
3491        case MO_32:
3492            return 1;
3493        case MO_64:
3494            if (have_avx512vl) {
3495                return 1;
3496            }
3497            /*
3498             * We can emulate this for MO_64, but it does not pay off
3499             * unless we're producing at least 4 values.
3500             */
3501            return type >= TCG_TYPE_V256 ? -1 : 0;
3502        }
3503        return 0;
3504
3505    case INDEX_op_shls_vec:
3506    case INDEX_op_shrs_vec:
3507        return vece >= MO_16;
3508    case INDEX_op_sars_vec:
3509        switch (vece) {
3510        case MO_16:
3511        case MO_32:
3512            return 1;
3513        case MO_64:
3514            return have_avx512vl;
3515        }
3516        return 0;
3517    case INDEX_op_rotls_vec:
3518        return vece >= MO_16 ? -1 : 0;
3519
3520    case INDEX_op_shlv_vec:
3521    case INDEX_op_shrv_vec:
3522        switch (vece) {
3523        case MO_16:
3524            return have_avx512bw;
3525        case MO_32:
3526        case MO_64:
3527            return have_avx2;
3528        }
3529        return 0;
3530    case INDEX_op_sarv_vec:
3531        switch (vece) {
3532        case MO_16:
3533            return have_avx512bw;
3534        case MO_32:
3535            return have_avx2;
3536        case MO_64:
3537            return have_avx512vl;
3538        }
3539        return 0;
3540    case INDEX_op_rotlv_vec:
3541    case INDEX_op_rotrv_vec:
3542        switch (vece) {
3543        case MO_16:
3544            return have_avx512vbmi2 ? -1 : 0;
3545        case MO_32:
3546        case MO_64:
3547            return have_avx512vl ? 1 : have_avx2 ? -1 : 0;
3548        }
3549        return 0;
3550
3551    case INDEX_op_mul_vec:
3552        switch (vece) {
3553        case MO_8:
3554            return -1;
3555        case MO_64:
3556            return have_avx512dq;
3557        }
3558        return 1;
3559
3560    case INDEX_op_ssadd_vec:
3561    case INDEX_op_usadd_vec:
3562    case INDEX_op_sssub_vec:
3563    case INDEX_op_ussub_vec:
3564        return vece <= MO_16;
3565    case INDEX_op_smin_vec:
3566    case INDEX_op_smax_vec:
3567    case INDEX_op_umin_vec:
3568    case INDEX_op_umax_vec:
3569    case INDEX_op_abs_vec:
3570        return vece <= MO_32 || have_avx512vl;
3571
3572    default:
3573        return 0;
3574    }
3575}
3576
3577static void expand_vec_shi(TCGType type, unsigned vece, TCGOpcode opc,
3578                           TCGv_vec v0, TCGv_vec v1, TCGArg imm)
3579{
3580    TCGv_vec t1, t2;
3581
3582    tcg_debug_assert(vece == MO_8);
3583
3584    t1 = tcg_temp_new_vec(type);
3585    t2 = tcg_temp_new_vec(type);
3586
3587    /*
3588     * Unpack to W, shift, and repack.  Tricky bits:
3589     * (1) Use punpck*bw x,x to produce DDCCBBAA,
3590     *     i.e. duplicate in other half of the 16-bit lane.
3591     * (2) For right-shift, add 8 so that the high half of the lane
3592     *     becomes zero.  For left-shift and left-rotate, we must
3593     *     shift up and down again.
3594     * (3) Step 2 leaves the high half zero, so that PACKUSWB
3595     *     (pack with unsigned saturation) does not modify
3596     *     the quantity.
3597     */
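    /*
     * Illustration for a single byte b: after the unpack each 16-bit
     * lane holds (b << 8) | b.  For shri, lane >> (imm + 8) == b >> imm;
     * for shli, (lane << (imm + 8)) >> 8 == (b << imm) & 0xff; for
     * rotli, (lane << imm) >> 8 == rol8(b, imm).  In every case the
     * high half of the lane ends up zero, as required by step (3).
     */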
3598    vec_gen_3(INDEX_op_x86_punpckl_vec, type, MO_8,
3599              tcgv_vec_arg(t1), tcgv_vec_arg(v1), tcgv_vec_arg(v1));
3600    vec_gen_3(INDEX_op_x86_punpckh_vec, type, MO_8,
3601              tcgv_vec_arg(t2), tcgv_vec_arg(v1), tcgv_vec_arg(v1));
3602
3603    if (opc != INDEX_op_rotli_vec) {
3604        imm += 8;
3605    }
3606    if (opc == INDEX_op_shri_vec) {
3607        tcg_gen_shri_vec(MO_16, t1, t1, imm);
3608        tcg_gen_shri_vec(MO_16, t2, t2, imm);
3609    } else {
3610        tcg_gen_shli_vec(MO_16, t1, t1, imm);
3611        tcg_gen_shli_vec(MO_16, t2, t2, imm);
3612        tcg_gen_shri_vec(MO_16, t1, t1, 8);
3613        tcg_gen_shri_vec(MO_16, t2, t2, 8);
3614    }
3615
3616    vec_gen_3(INDEX_op_x86_packus_vec, type, MO_8,
3617              tcgv_vec_arg(v0), tcgv_vec_arg(t1), tcgv_vec_arg(t2));
3618    tcg_temp_free_vec(t1);
3619    tcg_temp_free_vec(t2);
3620}
3621
3622static void expand_vec_sari(TCGType type, unsigned vece,
3623                            TCGv_vec v0, TCGv_vec v1, TCGArg imm)
3624{
3625    TCGv_vec t1, t2;
3626
3627    switch (vece) {
3628    case MO_8:
3629        /* Unpack to W, shift, and repack, as in expand_vec_shi.  */
3630        t1 = tcg_temp_new_vec(type);
3631        t2 = tcg_temp_new_vec(type);
3632        vec_gen_3(INDEX_op_x86_punpckl_vec, type, MO_8,
3633                  tcgv_vec_arg(t1), tcgv_vec_arg(v1), tcgv_vec_arg(v1));
3634        vec_gen_3(INDEX_op_x86_punpckh_vec, type, MO_8,
3635                  tcgv_vec_arg(t2), tcgv_vec_arg(v1), tcgv_vec_arg(v1));
3636        tcg_gen_sari_vec(MO_16, t1, t1, imm + 8);
3637        tcg_gen_sari_vec(MO_16, t2, t2, imm + 8);
3638        vec_gen_3(INDEX_op_x86_packss_vec, type, MO_8,
3639                  tcgv_vec_arg(v0), tcgv_vec_arg(t1), tcgv_vec_arg(t2));
3640        tcg_temp_free_vec(t1);
3641        tcg_temp_free_vec(t2);
3642        break;
3643
3644    case MO_64:
3645        t1 = tcg_temp_new_vec(type);
3646        if (imm <= 32) {
3647            /*
3648             * We can emulate a small sign extend by performing an arithmetic
3649             * 32-bit shift and overwriting the high half of a 64-bit logical
3650             * shift.  Note that the ISA says shift of 32 is valid, but TCG
3651             * does not, so we have to bound the smaller shift -- we get the
3652             * same result in the high half either way.
3653             */
3654            tcg_gen_sari_vec(MO_32, t1, v1, MIN(imm, 31));
3655            tcg_gen_shri_vec(MO_64, v0, v1, imm);
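            /*
             * Blend mask 0xaa takes the odd (high) 32-bit elements from
             * t1, the sign-propagating shift, and the even elements from
             * the 64-bit logical shift already in v0.
             */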
3656            vec_gen_4(INDEX_op_x86_blend_vec, type, MO_32,
3657                      tcgv_vec_arg(v0), tcgv_vec_arg(v0),
3658                      tcgv_vec_arg(t1), 0xaa);
3659        } else {
3660            /* Otherwise produce the sign extension with a compare
3661             * against zero, then shift and merge.
3662             */
3663            tcg_gen_cmp_vec(TCG_COND_GT, MO_64, t1,
3664                            tcg_constant_vec(type, MO_64, 0), v1);
3665            tcg_gen_shri_vec(MO_64, v0, v1, imm);
3666            tcg_gen_shli_vec(MO_64, t1, t1, 64 - imm);
3667            tcg_gen_or_vec(MO_64, v0, v0, t1);
3668        }
3669        tcg_temp_free_vec(t1);
3670        break;
3671
3672    default:
3673        g_assert_not_reached();
3674    }
3675}
3676
3677static void expand_vec_rotli(TCGType type, unsigned vece,
3678                             TCGv_vec v0, TCGv_vec v1, TCGArg imm)
3679{
3680    TCGv_vec t;
3681
3682    if (vece == MO_8) {
3683        expand_vec_shi(type, vece, INDEX_op_rotli_vec, v0, v1, imm);
3684        return;
3685    }
3686
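    /*
     * VPSHLDI concatenates the two sources and shifts left, so with both
     * sources equal it performs a rotate by the immediate.
     */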
3687    if (have_avx512vbmi2) {
3688        vec_gen_4(INDEX_op_x86_vpshldi_vec, type, vece,
3689                  tcgv_vec_arg(v0), tcgv_vec_arg(v1), tcgv_vec_arg(v1), imm);
3690        return;
3691    }
3692
3693    t = tcg_temp_new_vec(type);
3694    tcg_gen_shli_vec(vece, t, v1, imm);
3695    tcg_gen_shri_vec(vece, v0, v1, (8 << vece) - imm);
3696    tcg_gen_or_vec(vece, v0, v0, t);
3697    tcg_temp_free_vec(t);
3698}
3699
3700static void expand_vec_rotv(TCGType type, unsigned vece, TCGv_vec v0,
3701                            TCGv_vec v1, TCGv_vec sh, bool right)
3702{
3703    TCGv_vec t;
3704
3705    if (have_avx512vbmi2) {
3706        vec_gen_4(right ? INDEX_op_x86_vpshrdv_vec : INDEX_op_x86_vpshldv_vec,
3707                  type, vece, tcgv_vec_arg(v0), tcgv_vec_arg(v1),
3708                  tcgv_vec_arg(v1), tcgv_vec_arg(sh));
3709        return;
3710    }
3711
3712    t = tcg_temp_new_vec(type);
3713    tcg_gen_dupi_vec(vece, t, 8 << vece);
3714    tcg_gen_sub_vec(vece, t, t, sh);
3715    if (right) {
3716        tcg_gen_shlv_vec(vece, t, v1, t);
3717        tcg_gen_shrv_vec(vece, v0, v1, sh);
3718    } else {
3719        tcg_gen_shrv_vec(vece, t, v1, t);
3720        tcg_gen_shlv_vec(vece, v0, v1, sh);
3721    }
3722    tcg_gen_or_vec(vece, v0, v0, t);
3723    tcg_temp_free_vec(t);
3724}
3725
3726static void expand_vec_rotls(TCGType type, unsigned vece,
3727                             TCGv_vec v0, TCGv_vec v1, TCGv_i32 lsh)
3728{
3729    TCGv_vec t = tcg_temp_new_vec(type);
3730
3731    tcg_debug_assert(vece != MO_8);
3732
3733    if (vece >= MO_32 ? have_avx512vl : have_avx512vbmi2) {
3734        tcg_gen_dup_i32_vec(vece, t, lsh);
3735        if (vece >= MO_32) {
3736            tcg_gen_rotlv_vec(vece, v0, v1, t);
3737        } else {
3738            expand_vec_rotv(type, vece, v0, v1, t, false);
3739        }
3740    } else {
3741        TCGv_i32 rsh = tcg_temp_new_i32();
3742
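        /*
         * Compose the rotate from a left shift by lsh and a right shift
         * by the complementary count, (-lsh) & (lane bits - 1).
         */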
3743        tcg_gen_neg_i32(rsh, lsh);
3744        tcg_gen_andi_i32(rsh, rsh, (8 << vece) - 1);
3745        tcg_gen_shls_vec(vece, t, v1, lsh);
3746        tcg_gen_shrs_vec(vece, v0, v1, rsh);
3747        tcg_gen_or_vec(vece, v0, v0, t);
3748
3749        tcg_temp_free_i32(rsh);
3750    }
3751
3752    tcg_temp_free_vec(t);
3753}
3754
3755static void expand_vec_mul(TCGType type, unsigned vece,
3756                           TCGv_vec v0, TCGv_vec v1, TCGv_vec v2)
3757{
3758    TCGv_vec t1, t2, t3, t4, zero;
3759
3760    tcg_debug_assert(vece == MO_8);
3761
3762    /*
3763     * Unpack v1 bytes to words, 0 | x.
3764     * Unpack v2 bytes to words, y | 0.
3765     * This leaves the 8-bit result, x * y, with 8 bits of right padding.
3766     * Shift logical right by 8 bits to clear the high 8 bits before
3767     * using an unsigned saturated pack.
3768     *
3769     * The difference between the V64, V128 and V256 cases is merely how
3770     * we distribute the expansion between temporaries.
3771     */
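    /*
     * Per 16-bit lane: t1 = x and t2 = y << 8, so the product is
     * (x * y) << 8 modulo 2**16; shifting right by 8 leaves (x * y) & 0xff
     * with a zero high byte, which PACKUSWB then repacks unchanged.
     */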
3772    switch (type) {
3773    case TCG_TYPE_V64:
3774        t1 = tcg_temp_new_vec(TCG_TYPE_V128);
3775        t2 = tcg_temp_new_vec(TCG_TYPE_V128);
3776        zero = tcg_constant_vec(TCG_TYPE_V128, MO_8, 0);
3777        vec_gen_3(INDEX_op_x86_punpckl_vec, TCG_TYPE_V128, MO_8,
3778                  tcgv_vec_arg(t1), tcgv_vec_arg(v1), tcgv_vec_arg(zero));
3779        vec_gen_3(INDEX_op_x86_punpckl_vec, TCG_TYPE_V128, MO_8,
3780                  tcgv_vec_arg(t2), tcgv_vec_arg(zero), tcgv_vec_arg(v2));
3781        tcg_gen_mul_vec(MO_16, t1, t1, t2);
3782        tcg_gen_shri_vec(MO_16, t1, t1, 8);
3783        vec_gen_3(INDEX_op_x86_packus_vec, TCG_TYPE_V128, MO_8,
3784                  tcgv_vec_arg(v0), tcgv_vec_arg(t1), tcgv_vec_arg(t1));
3785        tcg_temp_free_vec(t1);
3786        tcg_temp_free_vec(t2);
3787        break;
3788
3789    case TCG_TYPE_V128:
3790    case TCG_TYPE_V256:
3791        t1 = tcg_temp_new_vec(type);
3792        t2 = tcg_temp_new_vec(type);
3793        t3 = tcg_temp_new_vec(type);
3794        t4 = tcg_temp_new_vec(type);
3795        zero = tcg_constant_vec(TCG_TYPE_V128, MO_8, 0);
3796        vec_gen_3(INDEX_op_x86_punpckl_vec, type, MO_8,
3797                  tcgv_vec_arg(t1), tcgv_vec_arg(v1), tcgv_vec_arg(zero));
3798        vec_gen_3(INDEX_op_x86_punpckl_vec, type, MO_8,
3799                  tcgv_vec_arg(t2), tcgv_vec_arg(zero), tcgv_vec_arg(v2));
3800        vec_gen_3(INDEX_op_x86_punpckh_vec, type, MO_8,
3801                  tcgv_vec_arg(t3), tcgv_vec_arg(v1), tcgv_vec_arg(zero));
3802        vec_gen_3(INDEX_op_x86_punpckh_vec, type, MO_8,
3803                  tcgv_vec_arg(t4), tcgv_vec_arg(zero), tcgv_vec_arg(v2));
3804        tcg_gen_mul_vec(MO_16, t1, t1, t2);
3805        tcg_gen_mul_vec(MO_16, t3, t3, t4);
3806        tcg_gen_shri_vec(MO_16, t1, t1, 8);
3807        tcg_gen_shri_vec(MO_16, t3, t3, 8);
3808        vec_gen_3(INDEX_op_x86_packus_vec, type, MO_8,
3809                  tcgv_vec_arg(v0), tcgv_vec_arg(t1), tcgv_vec_arg(t3));
3810        tcg_temp_free_vec(t1);
3811        tcg_temp_free_vec(t2);
3812        tcg_temp_free_vec(t3);
3813        tcg_temp_free_vec(t4);
3814        break;
3815
3816    default:
3817        g_assert_not_reached();
3818    }
3819}
3820
3821static bool expand_vec_cmp_noinv(TCGType type, unsigned vece, TCGv_vec v0,
3822                                 TCGv_vec v1, TCGv_vec v2, TCGCond cond)
3823{
3824    enum {
3825        NEED_INV  = 1,
3826        NEED_SWAP = 2,
3827        NEED_BIAS = 4,
3828        NEED_UMIN = 8,
3829        NEED_UMAX = 16,
3830    };
3831    TCGv_vec t1, t2, t3;
3832    uint8_t fixup;
3833
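    /*
     * Without AVX512, the only vector compares are PCMPEQ and the signed
     * PCMPGT, so every other condition is reduced to those: invert the
     * result, swap the operands, bias both by the sign bit to turn an
     * unsigned compare into a signed one, or use umin/umax plus EQ.
     */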
3834    switch (cond) {
3835    case TCG_COND_EQ:
3836    case TCG_COND_GT:
3837        fixup = 0;
3838        break;
3839    case TCG_COND_NE:
3840    case TCG_COND_LE:
3841        fixup = NEED_INV;
3842        break;
3843    case TCG_COND_LT:
3844        fixup = NEED_SWAP;
3845        break;
3846    case TCG_COND_GE:
3847        fixup = NEED_SWAP | NEED_INV;
3848        break;
3849    case TCG_COND_LEU:
3850        if (tcg_can_emit_vec_op(INDEX_op_umin_vec, type, vece)) {
3851            fixup = NEED_UMIN;
3852        } else {
3853            fixup = NEED_BIAS | NEED_INV;
3854        }
3855        break;
3856    case TCG_COND_GTU:
3857        if (tcg_can_emit_vec_op(INDEX_op_umin_vec, type, vece)) {
3858            fixup = NEED_UMIN | NEED_INV;
3859        } else {
3860            fixup = NEED_BIAS;
3861        }
3862        break;
3863    case TCG_COND_GEU:
3864        if (tcg_can_emit_vec_op(INDEX_op_umax_vec, type, vece)) {
3865            fixup = NEED_UMAX;
3866        } else {
3867            fixup = NEED_BIAS | NEED_SWAP | NEED_INV;
3868        }
3869        break;
3870    case TCG_COND_LTU:
3871        if (tcg_can_emit_vec_op(INDEX_op_umax_vec, type, vece)) {
3872            fixup = NEED_UMAX | NEED_INV;
3873        } else {
3874            fixup = NEED_BIAS | NEED_SWAP;
3875        }
3876        break;
3877    default:
3878        g_assert_not_reached();
3879    }
3880
3881    if (fixup & NEED_INV) {
3882        cond = tcg_invert_cond(cond);
3883    }
3884    if (fixup & NEED_SWAP) {
3885        t1 = v1, v1 = v2, v2 = t1;
3886        cond = tcg_swap_cond(cond);
3887    }
3888
3889    t1 = t2 = NULL;
3890    if (fixup & (NEED_UMIN | NEED_UMAX)) {
3891        t1 = tcg_temp_new_vec(type);
3892        if (fixup & NEED_UMIN) {
3893            tcg_gen_umin_vec(vece, t1, v1, v2);
3894        } else {
3895            tcg_gen_umax_vec(vece, t1, v1, v2);
3896        }
3897        v2 = t1;
3898        cond = TCG_COND_EQ;
3899    } else if (fixup & NEED_BIAS) {
3900        t1 = tcg_temp_new_vec(type);
3901        t2 = tcg_temp_new_vec(type);
3902        t3 = tcg_constant_vec(type, vece, 1ull << ((8 << vece) - 1));
3903        tcg_gen_sub_vec(vece, t1, v1, t3);
3904        tcg_gen_sub_vec(vece, t2, v2, t3);
3905        v1 = t1;
3906        v2 = t2;
3907        cond = tcg_signed_cond(cond);
3908    }
3909
3910    tcg_debug_assert(cond == TCG_COND_EQ || cond == TCG_COND_GT);
3911    /* Expand directly; do not recurse.  */
3912    vec_gen_4(INDEX_op_cmp_vec, type, vece,
3913              tcgv_vec_arg(v0), tcgv_vec_arg(v1), tcgv_vec_arg(v2), cond);
3914
3915    if (t1) {
3916        tcg_temp_free_vec(t1);
3917        if (t2) {
3918            tcg_temp_free_vec(t2);
3919        }
3920    }
3921    return fixup & NEED_INV;
3922}
3923
3924static void expand_vec_cmp(TCGType type, unsigned vece, TCGv_vec v0,
3925                           TCGv_vec v1, TCGv_vec v2, TCGCond cond)
3926{
3927    if (expand_vec_cmp_noinv(type, vece, v0, v1, v2, cond)) {
3928        tcg_gen_not_vec(vece, v0, v0);
3929    }
3930}
3931
3932static void expand_vec_cmpsel(TCGType type, unsigned vece, TCGv_vec v0,
3933                              TCGv_vec c1, TCGv_vec c2,
3934                              TCGv_vec v3, TCGv_vec v4, TCGCond cond)
3935{
3936    TCGv_vec t = tcg_temp_new_vec(type);
3937
3938    if (expand_vec_cmp_noinv(type, vece, t, c1, c2, cond)) {
3939        /* The compare produced the inverted condition; compensate by
             * swapping the select operands.  */
3940        TCGv_vec x;
3941        x = v3, v3 = v4, v4 = x;
3942    }
3943    vec_gen_4(INDEX_op_x86_vpblendvb_vec, type, vece,
3944              tcgv_vec_arg(v0), tcgv_vec_arg(v4),
3945              tcgv_vec_arg(v3), tcgv_vec_arg(t));
3946    tcg_temp_free_vec(t);
3947}
3948
3949void tcg_expand_vec_op(TCGOpcode opc, TCGType type, unsigned vece,
3950                       TCGArg a0, ...)
3951{
3952    va_list va;
3953    TCGArg a2;
3954    TCGv_vec v0, v1, v2, v3, v4;
3955
3956    va_start(va, a0);
3957    v0 = temp_tcgv_vec(arg_temp(a0));
3958    v1 = temp_tcgv_vec(arg_temp(va_arg(va, TCGArg)));
3959    a2 = va_arg(va, TCGArg);
3960
3961    switch (opc) {
3962    case INDEX_op_shli_vec:
3963    case INDEX_op_shri_vec:
3964        expand_vec_shi(type, vece, opc, v0, v1, a2);
3965        break;
3966
3967    case INDEX_op_sari_vec:
3968        expand_vec_sari(type, vece, v0, v1, a2);
3969        break;
3970
3971    case INDEX_op_rotli_vec:
3972        expand_vec_rotli(type, vece, v0, v1, a2);
3973        break;
3974
3975    case INDEX_op_rotls_vec:
3976        expand_vec_rotls(type, vece, v0, v1, temp_tcgv_i32(arg_temp(a2)));
3977        break;
3978
3979    case INDEX_op_rotlv_vec:
3980        v2 = temp_tcgv_vec(arg_temp(a2));
3981        expand_vec_rotv(type, vece, v0, v1, v2, false);
3982        break;
3983    case INDEX_op_rotrv_vec:
3984        v2 = temp_tcgv_vec(arg_temp(a2));
3985        expand_vec_rotv(type, vece, v0, v1, v2, true);
3986        break;
3987
3988    case INDEX_op_mul_vec:
3989        v2 = temp_tcgv_vec(arg_temp(a2));
3990        expand_vec_mul(type, vece, v0, v1, v2);
3991        break;
3992
3993    case INDEX_op_cmp_vec:
3994        v2 = temp_tcgv_vec(arg_temp(a2));
3995        expand_vec_cmp(type, vece, v0, v1, v2, va_arg(va, TCGArg));
3996        break;
3997
3998    case INDEX_op_cmpsel_vec:
3999        v2 = temp_tcgv_vec(arg_temp(a2));
4000        v3 = temp_tcgv_vec(arg_temp(va_arg(va, TCGArg)));
4001        v4 = temp_tcgv_vec(arg_temp(va_arg(va, TCGArg)));
4002        expand_vec_cmpsel(type, vece, v0, v1, v2, v3, v4, va_arg(va, TCGArg));
4003        break;
4004
4005    default:
4006        break;
4007    }
4008
4009    va_end(va);
4010}
4011
4012static const int tcg_target_callee_save_regs[] = {
4013#if TCG_TARGET_REG_BITS == 64
4014    TCG_REG_RBP,
4015    TCG_REG_RBX,
4016#if defined(_WIN64)
4017    TCG_REG_RDI,
4018    TCG_REG_RSI,
4019#endif
4020    TCG_REG_R12,
4021    TCG_REG_R13,
4022    TCG_REG_R14, /* Currently used for the global env. */
4023    TCG_REG_R15,
4024#else
4025    TCG_REG_EBP, /* Currently used for the global env. */
4026    TCG_REG_EBX,
4027    TCG_REG_ESI,
4028    TCG_REG_EDI,
4029#endif
4030};
4031
4032/* Compute frame size via macros, to share between tcg_target_qemu_prologue
4033   and tcg_register_jit.  */
4034
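/* PUSH_SIZE counts the return address from the call into the generated
   code plus each callee-saved register pushed by the prologue. */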
4035#define PUSH_SIZE \
4036    ((1 + ARRAY_SIZE(tcg_target_callee_save_regs)) \
4037     * (TCG_TARGET_REG_BITS / 8))
4038
4039#define FRAME_SIZE \
4040    ((PUSH_SIZE \
4041      + TCG_STATIC_CALL_ARGS_SIZE \
4042      + CPU_TEMP_BUF_NLONGS * sizeof(long) \
4043      + TCG_TARGET_STACK_ALIGN - 1) \
4044     & ~(TCG_TARGET_STACK_ALIGN - 1))
4045
4046/* Generate global QEMU prologue and epilogue code */
4047static void tcg_target_qemu_prologue(TCGContext *s)
4048{
4049    int i, stack_addend;
4050
4051    /* TB prologue */
4052
4053    /* Reserve some stack space, also for TCG temps.  */
4054    stack_addend = FRAME_SIZE - PUSH_SIZE;
4055    tcg_set_frame(s, TCG_REG_CALL_STACK, TCG_STATIC_CALL_ARGS_SIZE,
4056                  CPU_TEMP_BUF_NLONGS * sizeof(long));
4057
4058    /* Save all callee saved registers.  */
4059    for (i = 0; i < ARRAY_SIZE(tcg_target_callee_save_regs); i++) {
4060        tcg_out_push(s, tcg_target_callee_save_regs[i]);
4061    }
4062
4063#if TCG_TARGET_REG_BITS == 32
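    /*
     * On i686, env and the tb pointer arrive as stack arguments: they sit
     * above the return address and the registers just pushed, hence the
     * (ARRAY_SIZE + 1) * 4 offset for env, and one slot higher (plus the
     * stack_addend allocated in between) for the indirect jump to tb.
     */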
4064    tcg_out_ld(s, TCG_TYPE_PTR, TCG_AREG0, TCG_REG_ESP,
4065               (ARRAY_SIZE(tcg_target_callee_save_regs) + 1) * 4);
4066    tcg_out_addi(s, TCG_REG_ESP, -stack_addend);
4067    /* jmp *tb.  */
4068    tcg_out_modrm_offset(s, OPC_GRP5, EXT5_JMPN_Ev, TCG_REG_ESP,
4069                         (ARRAY_SIZE(tcg_target_callee_save_regs) + 2) * 4
4070                         + stack_addend);
4071#else
4072# if !defined(CONFIG_SOFTMMU)
4073    if (guest_base) {
4074        int seg = setup_guest_base_seg();
4075        if (seg != 0) {
4076            x86_guest_base.seg = seg;
4077        } else if (guest_base == (int32_t)guest_base) {
4078            x86_guest_base.ofs = guest_base;
4079        } else {
4080            /* Choose R12 because, as a base, it requires a SIB byte. */
4081            x86_guest_base.index = TCG_REG_R12;
4082            tcg_out_movi(s, TCG_TYPE_PTR, x86_guest_base.index, guest_base);
4083            tcg_regset_set_reg(s->reserved_regs, x86_guest_base.index);
4084        }
4085    }
4086# endif
4087    tcg_out_mov(s, TCG_TYPE_PTR, TCG_AREG0, tcg_target_call_iarg_regs[0]);
4088    tcg_out_addi(s, TCG_REG_ESP, -stack_addend);
4089    /* jmp *tb.  */
4090    tcg_out_modrm(s, OPC_GRP5, EXT5_JMPN_Ev, tcg_target_call_iarg_regs[1]);
4091#endif
4092
4093    /*
4094     * Return path for goto_ptr. Set return value to 0, a-la exit_tb,
4095     * and fall through to the rest of the epilogue.
4096     */
4097    tcg_code_gen_epilogue = tcg_splitwx_to_rx(s->code_ptr);
4098    tcg_out_movi(s, TCG_TYPE_REG, TCG_REG_EAX, 0);
4099
4100    /* TB epilogue */
4101    tb_ret_addr = tcg_splitwx_to_rx(s->code_ptr);
4102
4103    tcg_out_addi(s, TCG_REG_CALL_STACK, stack_addend);
4104
4105    if (have_avx2) {
4106        tcg_out_vex_opc(s, OPC_VZEROUPPER, 0, 0, 0, 0);
4107    }
4108    for (i = ARRAY_SIZE(tcg_target_callee_save_regs) - 1; i >= 0; i--) {
4109        tcg_out_pop(s, tcg_target_callee_save_regs[i]);
4110    }
4111    tcg_out_opc(s, OPC_RET, 0, 0, 0);
4112}
4113
4114static void tcg_out_nop_fill(tcg_insn_unit *p, int count)
4115{
4116    memset(p, 0x90, count);
4117}
4118
4119static void tcg_target_init(TCGContext *s)
4120{
4121    tcg_target_available_regs[TCG_TYPE_I32] = ALL_GENERAL_REGS;
4122    if (TCG_TARGET_REG_BITS == 64) {
4123        tcg_target_available_regs[TCG_TYPE_I64] = ALL_GENERAL_REGS;
4124    }
4125    if (have_avx1) {
4126        tcg_target_available_regs[TCG_TYPE_V64] = ALL_VECTOR_REGS;
4127        tcg_target_available_regs[TCG_TYPE_V128] = ALL_VECTOR_REGS;
4128    }
4129    if (have_avx2) {
4130        tcg_target_available_regs[TCG_TYPE_V256] = ALL_VECTOR_REGS;
4131    }
4132
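    /*
     * All vector registers are treated as call-clobbered; the Win64
     * callee-saved xmm6-xmm15 are instead made unallocatable below.
     */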
4133    tcg_target_call_clobber_regs = ALL_VECTOR_REGS;
4134    tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_EAX);
4135    tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_EDX);
4136    tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_ECX);
4137    if (TCG_TARGET_REG_BITS == 64) {
4138#if !defined(_WIN64)
4139        tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_RDI);
4140        tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_RSI);
4141#endif
4142        tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_R8);
4143        tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_R9);
4144        tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_R10);
4145        tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_R11);
4146    }
4147
4148    s->reserved_regs = 0;
4149    tcg_regset_set_reg(s->reserved_regs, TCG_REG_CALL_STACK);
4150    tcg_regset_set_reg(s->reserved_regs, TCG_TMP_VEC);
4151#ifdef _WIN64
4152    /* These are call-saved, and we don't save them, so don't use them. */
4153    tcg_regset_set_reg(s->reserved_regs, TCG_REG_XMM6);
4154    tcg_regset_set_reg(s->reserved_regs, TCG_REG_XMM7);
4155    tcg_regset_set_reg(s->reserved_regs, TCG_REG_XMM8);
4156    tcg_regset_set_reg(s->reserved_regs, TCG_REG_XMM9);
4157    tcg_regset_set_reg(s->reserved_regs, TCG_REG_XMM10);
4158    tcg_regset_set_reg(s->reserved_regs, TCG_REG_XMM11);
4159    tcg_regset_set_reg(s->reserved_regs, TCG_REG_XMM12);
4160    tcg_regset_set_reg(s->reserved_regs, TCG_REG_XMM13);
4161    tcg_regset_set_reg(s->reserved_regs, TCG_REG_XMM14);
4162    tcg_regset_set_reg(s->reserved_regs, TCG_REG_XMM15);
4163#endif
4164}
4165
4166typedef struct {
4167    DebugFrameHeader h;
4168    uint8_t fde_def_cfa[4];
4169    uint8_t fde_reg_ofs[14];
4170} DebugFrame;
4171
4172/* We're expecting a 2-byte uleb128-encoded value.  */
4173QEMU_BUILD_BUG_ON(FRAME_SIZE >= (1 << 14));
4174
4175#if !defined(__ELF__)
4176    /* Host machine without ELF. */
4177#elif TCG_TARGET_REG_BITS == 64
4178#define ELF_HOST_MACHINE EM_X86_64
4179static const DebugFrame debug_frame = {
4180    .h.cie.len = sizeof(DebugFrameCIE)-4, /* length after .len member */
4181    .h.cie.id = -1,
4182    .h.cie.version = 1,
4183    .h.cie.code_align = 1,
4184    .h.cie.data_align = 0x78,             /* sleb128 -8 */
4185    .h.cie.return_column = 16,
4186
4187    /* Total FDE size does not include the "len" member.  */
4188    .h.fde.len = sizeof(DebugFrame) - offsetof(DebugFrame, h.fde.cie_offset),
4189
4190    .fde_def_cfa = {
4191        12, 7,                          /* DW_CFA_def_cfa %rsp, ... */
4192        (FRAME_SIZE & 0x7f) | 0x80,     /* ... uleb128 FRAME_SIZE */
4193        (FRAME_SIZE >> 7)
4194    },
4195    .fde_reg_ofs = {
4196        0x90, 1,                        /* DW_CFA_offset, %rip, -8 */
4197        /* The following ordering must match tcg_target_callee_save_regs.  */
4198        0x86, 2,                        /* DW_CFA_offset, %rbp, -16 */
4199        0x83, 3,                        /* DW_CFA_offset, %rbx, -24 */
4200        0x8c, 4,                        /* DW_CFA_offset, %r12, -32 */
4201        0x8d, 5,                        /* DW_CFA_offset, %r13, -40 */
4202        0x8e, 6,                        /* DW_CFA_offset, %r14, -48 */
4203        0x8f, 7,                        /* DW_CFA_offset, %r15, -56 */
4204    }
4205};
4206#else
4207#define ELF_HOST_MACHINE EM_386
4208static const DebugFrame debug_frame = {
4209    .h.cie.len = sizeof(DebugFrameCIE)-4, /* length after .len member */
4210    .h.cie.id = -1,
4211    .h.cie.version = 1,
4212    .h.cie.code_align = 1,
4213    .h.cie.data_align = 0x7c,             /* sleb128 -4 */
4214    .h.cie.return_column = 8,
4215
4216    /* Total FDE size does not include the "len" member.  */
4217    .h.fde.len = sizeof(DebugFrame) - offsetof(DebugFrame, h.fde.cie_offset),
4218
4219    .fde_def_cfa = {
4220        12, 4,                          /* DW_CFA_def_cfa %esp, ... */
4221        (FRAME_SIZE & 0x7f) | 0x80,     /* ... uleb128 FRAME_SIZE */
4222        (FRAME_SIZE >> 7)
4223    },
4224    .fde_reg_ofs = {
4225        0x88, 1,                        /* DW_CFA_offset, %eip, -4 */
4226        /* The following ordering must match tcg_target_callee_save_regs.  */
4227        0x85, 2,                        /* DW_CFA_offset, %ebp, -8 */
4228        0x83, 3,                        /* DW_CFA_offset, %ebx, -12 */
4229        0x86, 4,                        /* DW_CFA_offset, %esi, -16 */
4230        0x87, 5,                        /* DW_CFA_offset, %edi, -20 */
4231    }
4232};
4233#endif
4234
4235#if defined(ELF_HOST_MACHINE)
4236void tcg_register_jit(const void *buf, size_t buf_size)
4237{
4238    tcg_register_jit_int(buf, buf_size, &debug_frame, sizeof(debug_frame));
4239}
4240#endif
4241