xref: /qemu/tcg/i386/tcg-target.c.inc (revision 73b49878)
1/*
2 * Tiny Code Generator for QEMU
3 *
4 * Copyright (c) 2008 Fabrice Bellard
5 *
6 * Permission is hereby granted, free of charge, to any person obtaining a copy
7 * of this software and associated documentation files (the "Software"), to deal
8 * in the Software without restriction, including without limitation the rights
9 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
10 * copies of the Software, and to permit persons to whom the Software is
11 * furnished to do so, subject to the following conditions:
12 *
13 * The above copyright notice and this permission notice shall be included in
14 * all copies or substantial portions of the Software.
15 *
16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
19 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
22 * THE SOFTWARE.
23 */
24
25#include "../tcg-ldst.c.inc"
26#include "../tcg-pool.c.inc"
27
#ifdef CONFIG_DEBUG_TCG
/*
 * Printable register names, one entry per target register; only built
 * when CONFIG_DEBUG_TCG is defined.
 *
 * Entries 0-7 are the legacy general registers (64-bit or 32-bit spelling
 * depending on TCG_TARGET_REG_BITS).  Entries 8-15 are %r8-%r15 and are
 * listed unconditionally, so the %xmm names always start at entry 16.
 * %xmm8-%xmm15 are only listed for 64-bit builds.
 */
static const char * const tcg_target_reg_names[TCG_TARGET_NB_REGS] = {
#if TCG_TARGET_REG_BITS == 64
    "%rax", "%rcx", "%rdx", "%rbx", "%rsp", "%rbp", "%rsi", "%rdi",
#else
    "%eax", "%ecx", "%edx", "%ebx", "%esp", "%ebp", "%esi", "%edi",
#endif
    "%r8",  "%r9",  "%r10", "%r11", "%r12", "%r13", "%r14", "%r15",
    "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7",
#if TCG_TARGET_REG_BITS == 64
    "%xmm8", "%xmm9", "%xmm10", "%xmm11",
    "%xmm12", "%xmm13", "%xmm14", "%xmm15",
#endif
};
#endif
43
/*
 * Registers in the order this table offers them for allocation (the
 * allocator that consumes the table is not part of this section of the
 * file).  The general registers come first, followed by the vector
 * registers.  The stack pointer is not listed in either configuration.
 */
static const int tcg_target_reg_alloc_order[] = {
#if TCG_TARGET_REG_BITS == 64
    TCG_REG_RBP,
    TCG_REG_RBX,
    TCG_REG_R12,
    TCG_REG_R13,
    TCG_REG_R14,
    TCG_REG_R15,
    TCG_REG_R10,
    TCG_REG_R11,
    TCG_REG_R9,
    TCG_REG_R8,
    TCG_REG_RCX,
    TCG_REG_RDX,
    TCG_REG_RSI,
    TCG_REG_RDI,
    TCG_REG_RAX,
#else
    TCG_REG_EBX,
    TCG_REG_ESI,
    TCG_REG_EDI,
    TCG_REG_EBP,
    TCG_REG_ECX,
    TCG_REG_EDX,
    TCG_REG_EAX,
#endif
    TCG_REG_XMM0,
    TCG_REG_XMM1,
    TCG_REG_XMM2,
    TCG_REG_XMM3,
    TCG_REG_XMM4,
    TCG_REG_XMM5,
#ifndef _WIN64
    /* The Win64 ABI treats xmm6-xmm15 as callee-saved, and we do not save
       any of them.  Therefore only allow xmm0-xmm5 to be allocated on
       Win64; every other host may also use the registers below.  */
    TCG_REG_XMM6,
    TCG_REG_XMM7,
#if TCG_TARGET_REG_BITS == 64
    TCG_REG_XMM8,
    TCG_REG_XMM9,
    TCG_REG_XMM10,
    TCG_REG_XMM11,
    TCG_REG_XMM12,
    TCG_REG_XMM13,
    TCG_REG_XMM14,
    TCG_REG_XMM15,
#endif
#endif
};
93
/* Alias for %xmm5.  Judging by the name this is a scratch vector register;
   its users are not in this part of the file -- TODO confirm.  */
#define TCG_TMP_VEC  TCG_REG_XMM5
95
/*
 * Integer argument registers for calls, in argument order.
 *
 * 64-bit Windows builds list RCX, RDX, R8, R9; other 64-bit builds list
 * RDI, RSI, RDX, RCX, R8, R9.  The array is empty for 32-bit builds,
 * where arguments are passed on the stack.
 */
static const int tcg_target_call_iarg_regs[] = {
#if TCG_TARGET_REG_BITS == 64
#if defined(_WIN64)
    TCG_REG_RCX,
    TCG_REG_RDX,
#else
    TCG_REG_RDI,
    TCG_REG_RSI,
    TCG_REG_RDX,
    TCG_REG_RCX,
#endif
    TCG_REG_R8,
    TCG_REG_R9,
#else
    /* 32 bit mode uses stack based calling convention (GCC default). */
#endif
};
113
114static TCGReg tcg_target_call_oarg_reg(TCGCallReturnKind kind, int slot)
115{
116    switch (kind) {
117    case TCG_CALL_RET_NORMAL:
118        tcg_debug_assert(slot >= 0 && slot <= 1);
119        return slot ? TCG_REG_EDX : TCG_REG_EAX;
120#ifdef _WIN64
121    case TCG_CALL_RET_BY_VEC:
122        tcg_debug_assert(slot == 0);
123        return TCG_REG_XMM0;
124#endif
125    default:
126        g_assert_not_reached();
127    }
128}
129
/*
 * Constants we accept.  These flags are tested against the "ct" argument
 * of tcg_target_const_match():
 *   S32 - the value is unchanged by a cast to int32_t
 *   U32 - the value is unchanged by a cast to uint32_t
 *   I32 - the bitwise complement of the value is unchanged by a cast
 *         to int32_t
 *   WSZ - the value equals the operation width (32 or 64)
 * For TCG_TYPE_I32 operations any of S32/U32/I32 accepts every value.
 */
#define TCG_CT_CONST_S32 0x100
#define TCG_CT_CONST_U32 0x200
#define TCG_CT_CONST_I32 0x400
#define TCG_CT_CONST_WSZ 0x800
135
/* Registers used with the L constraint: the first two argument
   registers on x86_64, and two arbitrary call-clobbered registers
   (EAX and EDX) on i386.  Note that on x86_64 these expand to array
   lookups in tcg_target_call_iarg_regs[], not to literal constants.  */
#if TCG_TARGET_REG_BITS == 64
# define TCG_REG_L0 tcg_target_call_iarg_regs[0]
# define TCG_REG_L1 tcg_target_call_iarg_regs[1]
#else
# define TCG_REG_L0 TCG_REG_EAX
# define TCG_REG_L1 TCG_REG_EDX
#endif
146
/*
 * Register-set bit masks, one bit per register number.  General
 * registers occupy the low bits and vector registers start at bit 16:
 * 16 of each for 64-bit builds, 8 of each for 32-bit builds.
 * ALL_BYTEL_REGS is every general register for 64-bit builds but only
 * registers 0-3 for 32-bit builds; presumably these are the registers
 * whose low byte can be addressed -- compare the %[abcd]l note in
 * tcg_out_opc().
 */
#if TCG_TARGET_REG_BITS == 64
# define ALL_GENERAL_REGS      0x0000ffffu
# define ALL_VECTOR_REGS       0xffff0000u
# define ALL_BYTEL_REGS        ALL_GENERAL_REGS
#else
# define ALL_GENERAL_REGS      0x000000ffu
# define ALL_VECTOR_REGS       0x00ff0000u
# define ALL_BYTEL_REGS        0x0000000fu
#endif
/* Mask of TCG_REG_L0 and TCG_REG_L1 when tcg_use_softmmu is set,
   otherwise the empty set.  */
#define SOFTMMU_RESERVE_REGS \
    (tcg_use_softmmu ? (1 << TCG_REG_L0) | (1 << TCG_REG_L1) : 0)
158
/* Host feature tests.  For 64-bit, we always know that CMOV is available,
   so have_cmov is a compile-time constant there; for 32-bit it is a
   run-time test of the CPUINFO_CMOV bit in cpuinfo.  */
#if TCG_TARGET_REG_BITS == 64
# define have_cmov      true
#else
# define have_cmov      (cpuinfo & CPUINFO_CMOV)
#endif
/* These two are always run-time tests of the corresponding cpuinfo bit.  */
#define have_bmi2       (cpuinfo & CPUINFO_BMI2)
#define have_lzcnt      (cpuinfo & CPUINFO_LZCNT)
167
168static const tcg_insn_unit *tb_ret_addr;
169
170static bool patch_reloc(tcg_insn_unit *code_ptr, int type,
171                        intptr_t value, intptr_t addend)
172{
173    value += addend;
174    switch(type) {
175    case R_386_PC32:
176        value -= (uintptr_t)tcg_splitwx_to_rx(code_ptr);
177        if (value != (int32_t)value) {
178            return false;
179        }
180        /* FALLTHRU */
181    case R_386_32:
182        tcg_patch32(code_ptr, value);
183        break;
184    case R_386_PC8:
185        value -= (uintptr_t)tcg_splitwx_to_rx(code_ptr);
186        if (value != (int8_t)value) {
187            return false;
188        }
189        tcg_patch8(code_ptr, value);
190        break;
191    default:
192        g_assert_not_reached();
193    }
194    return true;
195}
196
197/* test if a constant matches the constraint */
198static bool tcg_target_const_match(int64_t val, TCGType type, int ct, int vece)
199{
200    if (ct & TCG_CT_CONST) {
201        return 1;
202    }
203    if (type == TCG_TYPE_I32) {
204        if (ct & (TCG_CT_CONST_S32 | TCG_CT_CONST_U32 | TCG_CT_CONST_I32)) {
205            return 1;
206        }
207    } else {
208        if ((ct & TCG_CT_CONST_S32) && val == (int32_t)val) {
209            return 1;
210        }
211        if ((ct & TCG_CT_CONST_U32) && val == (uint32_t)val) {
212            return 1;
213        }
214        if ((ct & TCG_CT_CONST_I32) && ~val == (int32_t)~val) {
215            return 1;
216        }
217    }
218    if ((ct & TCG_CT_CONST_WSZ) && val == (type == TCG_TYPE_I32 ? 32 : 64)) {
219        return 1;
220    }
221    return 0;
222}
223
/* Low three bits of a register number, i.e. the part that is placed in a
   ModRM or SIB field.  Bit 3 is carried by the REX/VEX/EVEX prefix.  */
# define LOWREGMASK(x)	((x) & 7)

/*
 * Prefix and encoding flags.  An opcode value holds the opcode byte in
 * its low 8 bits with any of these flags ORed in above it.  The emitters
 * (tcg_out_opc, tcg_out_vex_opc, tcg_out_evex_opc) translate the flags
 * into prefix bytes or prefix fields and then output the low byte.
 * For 32-bit builds the REX- and segment-related flags are defined as 0,
 * so testing or ORing them has no effect.
 */
#define P_EXT		0x100		/* 0x0f opcode prefix */
#define P_EXT38         0x200           /* 0x0f 0x38 opcode prefix */
#define P_DATA16        0x400           /* 0x66 opcode prefix */
#define P_VEXW          0x1000          /* Set VEX.W = 1 */
#if TCG_TARGET_REG_BITS == 64
# define P_REXW         P_VEXW          /* Set REX.W = 1; match VEXW */
# define P_REXB_R       0x2000          /* REG field as byte register */
# define P_REXB_RM      0x4000          /* R/M field as byte register */
# define P_GS           0x8000          /* gs segment override */
#else
# define P_REXW		0
# define P_REXB_R	0
# define P_REXB_RM	0
# define P_GS           0
#endif
#define P_EXT3A         0x10000         /* 0x0f 0x3a opcode prefix */
#define P_SIMDF3        0x20000         /* 0xf3 opcode prefix */
#define P_SIMDF2        0x40000         /* 0xf2 opcode prefix */
#define P_VEXL          0x80000         /* Set VEX.L = 1 */
#define P_EVEX          0x100000        /* Requires EVEX encoding */
246
/*
 * Opcode table.  Each value is the final opcode byte ORed with the P_*
 * flags that select its escape bytes (P_EXT, P_EXT38, P_EXT3A), its
 * mandatory prefix (P_DATA16, P_SIMDF3, P_SIMDF2) and, where needed,
 * W/L bits or the EVEX requirement.  Entries marked "plus condition
 * code" or "plus (ARITH_FOO << 3)" are bases to which the caller adds
 * a JCC_* or shifted ARITH_* value.  The ARITH_* names used below are
 * defined after this table; that is fine for macros, which are only
 * expanded at the point of use.
 */
#define OPC_ARITH_EbIb	(0x80)
#define OPC_ARITH_EvIz	(0x81)
#define OPC_ARITH_EvIb	(0x83)
#define OPC_ARITH_GvEv	(0x03)		/* ... plus (ARITH_FOO << 3) */
#define OPC_ANDN        (0xf2 | P_EXT38)
#define OPC_ADD_GvEv	(OPC_ARITH_GvEv | (ARITH_ADD << 3))
#define OPC_AND_GvEv    (OPC_ARITH_GvEv | (ARITH_AND << 3))
#define OPC_BLENDPS     (0x0c | P_EXT3A | P_DATA16)
#define OPC_BSF         (0xbc | P_EXT)
#define OPC_BSR         (0xbd | P_EXT)
#define OPC_BSWAP	(0xc8 | P_EXT)
#define OPC_CALL_Jz	(0xe8)
#define OPC_CMOVCC      (0x40 | P_EXT)  /* ... plus condition code */
#define OPC_CMP_GvEv	(OPC_ARITH_GvEv | (ARITH_CMP << 3))
#define OPC_DEC_r32	(0x48)
#define OPC_IMUL_GvEv	(0xaf | P_EXT)
#define OPC_IMUL_GvEvIb	(0x6b)
#define OPC_IMUL_GvEvIz	(0x69)
#define OPC_INC_r32	(0x40)
#define OPC_JCC_long	(0x80 | P_EXT)	/* ... plus condition code */
#define OPC_JCC_short	(0x70)		/* ... plus condition code */
#define OPC_JMP_long	(0xe9)
#define OPC_JMP_short	(0xeb)
#define OPC_LEA         (0x8d)
#define OPC_LZCNT       (0xbd | P_EXT | P_SIMDF3)
#define OPC_MOVB_EvGv	(0x88)		/* stores, more or less */
#define OPC_MOVL_EvGv	(0x89)		/* stores, more or less */
#define OPC_MOVL_GvEv	(0x8b)		/* loads, more or less */
#define OPC_MOVB_EvIz   (0xc6)
#define OPC_MOVL_EvIz	(0xc7)
#define OPC_MOVB_Ib     (0xb0)
#define OPC_MOVL_Iv     (0xb8)
#define OPC_MOVBE_GyMy  (0xf0 | P_EXT38)
#define OPC_MOVBE_MyGy  (0xf1 | P_EXT38)
#define OPC_MOVD_VyEy   (0x6e | P_EXT | P_DATA16)
#define OPC_MOVD_EyVy   (0x7e | P_EXT | P_DATA16)
#define OPC_MOVDDUP     (0x12 | P_EXT | P_SIMDF2)
#define OPC_MOVDQA_VxWx (0x6f | P_EXT | P_DATA16)
#define OPC_MOVDQA_WxVx (0x7f | P_EXT | P_DATA16)
#define OPC_MOVDQU_VxWx (0x6f | P_EXT | P_SIMDF3)
#define OPC_MOVDQU_WxVx (0x7f | P_EXT | P_SIMDF3)
#define OPC_MOVQ_VqWq   (0x7e | P_EXT | P_SIMDF3)
#define OPC_MOVQ_WqVq   (0xd6 | P_EXT | P_DATA16)
#define OPC_MOVSBL	(0xbe | P_EXT)
#define OPC_MOVSWL	(0xbf | P_EXT)
#define OPC_MOVSLQ	(0x63 | P_REXW)
#define OPC_MOVZBL	(0xb6 | P_EXT)
#define OPC_MOVZWL	(0xb7 | P_EXT)
#define OPC_PABSB       (0x1c | P_EXT38 | P_DATA16)
#define OPC_PABSW       (0x1d | P_EXT38 | P_DATA16)
#define OPC_PABSD       (0x1e | P_EXT38 | P_DATA16)
#define OPC_VPABSQ      (0x1f | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX)
#define OPC_PACKSSDW    (0x6b | P_EXT | P_DATA16)
#define OPC_PACKSSWB    (0x63 | P_EXT | P_DATA16)
#define OPC_PACKUSDW    (0x2b | P_EXT38 | P_DATA16)
#define OPC_PACKUSWB    (0x67 | P_EXT | P_DATA16)
#define OPC_PADDB       (0xfc | P_EXT | P_DATA16)
#define OPC_PADDW       (0xfd | P_EXT | P_DATA16)
#define OPC_PADDD       (0xfe | P_EXT | P_DATA16)
#define OPC_PADDQ       (0xd4 | P_EXT | P_DATA16)
#define OPC_PADDSB      (0xec | P_EXT | P_DATA16)
#define OPC_PADDSW      (0xed | P_EXT | P_DATA16)
#define OPC_PADDUB      (0xdc | P_EXT | P_DATA16)
#define OPC_PADDUW      (0xdd | P_EXT | P_DATA16)
#define OPC_PAND        (0xdb | P_EXT | P_DATA16)
#define OPC_PANDN       (0xdf | P_EXT | P_DATA16)
#define OPC_PBLENDW     (0x0e | P_EXT3A | P_DATA16)
#define OPC_PCMPEQB     (0x74 | P_EXT | P_DATA16)
#define OPC_PCMPEQW     (0x75 | P_EXT | P_DATA16)
#define OPC_PCMPEQD     (0x76 | P_EXT | P_DATA16)
#define OPC_PCMPEQQ     (0x29 | P_EXT38 | P_DATA16)
#define OPC_PCMPGTB     (0x64 | P_EXT | P_DATA16)
#define OPC_PCMPGTW     (0x65 | P_EXT | P_DATA16)
#define OPC_PCMPGTD     (0x66 | P_EXT | P_DATA16)
#define OPC_PCMPGTQ     (0x37 | P_EXT38 | P_DATA16)
#define OPC_PEXTRD      (0x16 | P_EXT3A | P_DATA16)
#define OPC_PINSRD      (0x22 | P_EXT3A | P_DATA16)
#define OPC_PMAXSB      (0x3c | P_EXT38 | P_DATA16)
#define OPC_PMAXSW      (0xee | P_EXT | P_DATA16)
#define OPC_PMAXSD      (0x3d | P_EXT38 | P_DATA16)
#define OPC_VPMAXSQ     (0x3d | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX)
#define OPC_PMAXUB      (0xde | P_EXT | P_DATA16)
#define OPC_PMAXUW      (0x3e | P_EXT38 | P_DATA16)
#define OPC_PMAXUD      (0x3f | P_EXT38 | P_DATA16)
#define OPC_VPMAXUQ     (0x3f | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX)
#define OPC_PMINSB      (0x38 | P_EXT38 | P_DATA16)
#define OPC_PMINSW      (0xea | P_EXT | P_DATA16)
#define OPC_PMINSD      (0x39 | P_EXT38 | P_DATA16)
#define OPC_VPMINSQ     (0x39 | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX)
#define OPC_PMINUB      (0xda | P_EXT | P_DATA16)
#define OPC_PMINUW      (0x3a | P_EXT38 | P_DATA16)
#define OPC_PMINUD      (0x3b | P_EXT38 | P_DATA16)
#define OPC_VPMINUQ     (0x3b | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX)
#define OPC_PMOVSXBW    (0x20 | P_EXT38 | P_DATA16)
#define OPC_PMOVSXWD    (0x23 | P_EXT38 | P_DATA16)
#define OPC_PMOVSXDQ    (0x25 | P_EXT38 | P_DATA16)
#define OPC_PMOVZXBW    (0x30 | P_EXT38 | P_DATA16)
#define OPC_PMOVZXWD    (0x33 | P_EXT38 | P_DATA16)
#define OPC_PMOVZXDQ    (0x35 | P_EXT38 | P_DATA16)
#define OPC_PMULLW      (0xd5 | P_EXT | P_DATA16)
#define OPC_PMULLD      (0x40 | P_EXT38 | P_DATA16)
#define OPC_VPMULLQ     (0x40 | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX)
#define OPC_POR         (0xeb | P_EXT | P_DATA16)
#define OPC_PSHUFB      (0x00 | P_EXT38 | P_DATA16)
#define OPC_PSHUFD      (0x70 | P_EXT | P_DATA16)
#define OPC_PSHUFLW     (0x70 | P_EXT | P_SIMDF2)
#define OPC_PSHUFHW     (0x70 | P_EXT | P_SIMDF3)
#define OPC_PSHIFTW_Ib  (0x71 | P_EXT | P_DATA16) /* /2 /6 /4 */
#define OPC_PSHIFTD_Ib  (0x72 | P_EXT | P_DATA16) /* /1 /2 /6 /4 */
#define OPC_PSHIFTQ_Ib  (0x73 | P_EXT | P_DATA16) /* /2 /6 /4 */
#define OPC_PSLLW       (0xf1 | P_EXT | P_DATA16)
#define OPC_PSLLD       (0xf2 | P_EXT | P_DATA16)
#define OPC_PSLLQ       (0xf3 | P_EXT | P_DATA16)
#define OPC_PSRAW       (0xe1 | P_EXT | P_DATA16)
#define OPC_PSRAD       (0xe2 | P_EXT | P_DATA16)
#define OPC_VPSRAQ      (0xe2 | P_EXT | P_DATA16 | P_VEXW | P_EVEX)
#define OPC_PSRLW       (0xd1 | P_EXT | P_DATA16)
#define OPC_PSRLD       (0xd2 | P_EXT | P_DATA16)
#define OPC_PSRLQ       (0xd3 | P_EXT | P_DATA16)
#define OPC_PSUBB       (0xf8 | P_EXT | P_DATA16)
#define OPC_PSUBW       (0xf9 | P_EXT | P_DATA16)
#define OPC_PSUBD       (0xfa | P_EXT | P_DATA16)
#define OPC_PSUBQ       (0xfb | P_EXT | P_DATA16)
#define OPC_PSUBSB      (0xe8 | P_EXT | P_DATA16)
#define OPC_PSUBSW      (0xe9 | P_EXT | P_DATA16)
#define OPC_PSUBUB      (0xd8 | P_EXT | P_DATA16)
#define OPC_PSUBUW      (0xd9 | P_EXT | P_DATA16)
#define OPC_PUNPCKLBW   (0x60 | P_EXT | P_DATA16)
#define OPC_PUNPCKLWD   (0x61 | P_EXT | P_DATA16)
#define OPC_PUNPCKLDQ   (0x62 | P_EXT | P_DATA16)
#define OPC_PUNPCKLQDQ  (0x6c | P_EXT | P_DATA16)
#define OPC_PUNPCKHBW   (0x68 | P_EXT | P_DATA16)
#define OPC_PUNPCKHWD   (0x69 | P_EXT | P_DATA16)
#define OPC_PUNPCKHDQ   (0x6a | P_EXT | P_DATA16)
#define OPC_PUNPCKHQDQ  (0x6d | P_EXT | P_DATA16)
#define OPC_PXOR        (0xef | P_EXT | P_DATA16)
#define OPC_POP_r32	(0x58)
#define OPC_POPCNT      (0xb8 | P_EXT | P_SIMDF3)
#define OPC_PUSH_r32	(0x50)
#define OPC_PUSH_Iv	(0x68)
#define OPC_PUSH_Ib	(0x6a)
#define OPC_RET		(0xc3)
#define OPC_SETCC	(0x90 | P_EXT | P_REXB_RM) /* ... plus cc */
#define OPC_SHIFT_1	(0xd1)
#define OPC_SHIFT_Ib	(0xc1)
#define OPC_SHIFT_cl	(0xd3)
#define OPC_SARX        (0xf7 | P_EXT38 | P_SIMDF3)
#define OPC_SHUFPS      (0xc6 | P_EXT)
#define OPC_SHLX        (0xf7 | P_EXT38 | P_DATA16)
#define OPC_SHRX        (0xf7 | P_EXT38 | P_SIMDF2)
#define OPC_SHRD_Ib     (0xac | P_EXT)
#define OPC_TESTL	(0x85)
#define OPC_TZCNT       (0xbc | P_EXT | P_SIMDF3)
#define OPC_UD2         (0x0b | P_EXT)
#define OPC_VPBLENDD    (0x02 | P_EXT3A | P_DATA16)
#define OPC_VPBLENDVB   (0x4c | P_EXT3A | P_DATA16)
#define OPC_VPINSRB     (0x20 | P_EXT3A | P_DATA16)
#define OPC_VPINSRW     (0xc4 | P_EXT | P_DATA16)
#define OPC_VBROADCASTSS (0x18 | P_EXT38 | P_DATA16)
#define OPC_VBROADCASTSD (0x19 | P_EXT38 | P_DATA16)
#define OPC_VPBROADCASTB (0x78 | P_EXT38 | P_DATA16)
#define OPC_VPBROADCASTW (0x79 | P_EXT38 | P_DATA16)
#define OPC_VPBROADCASTD (0x58 | P_EXT38 | P_DATA16)
#define OPC_VPBROADCASTQ (0x59 | P_EXT38 | P_DATA16)
#define OPC_VPERMQ      (0x00 | P_EXT3A | P_DATA16 | P_VEXW)
#define OPC_VPERM2I128  (0x46 | P_EXT3A | P_DATA16 | P_VEXL)
#define OPC_VPROLVD     (0x15 | P_EXT38 | P_DATA16 | P_EVEX)
#define OPC_VPROLVQ     (0x15 | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX)
#define OPC_VPRORVD     (0x14 | P_EXT38 | P_DATA16 | P_EVEX)
#define OPC_VPRORVQ     (0x14 | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX)
#define OPC_VPSHLDW     (0x70 | P_EXT3A | P_DATA16 | P_VEXW | P_EVEX)
#define OPC_VPSHLDD     (0x71 | P_EXT3A | P_DATA16 | P_EVEX)
#define OPC_VPSHLDQ     (0x71 | P_EXT3A | P_DATA16 | P_VEXW | P_EVEX)
#define OPC_VPSHLDVW    (0x70 | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX)
#define OPC_VPSHLDVD    (0x71 | P_EXT38 | P_DATA16 | P_EVEX)
#define OPC_VPSHLDVQ    (0x71 | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX)
#define OPC_VPSHRDVW    (0x72 | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX)
#define OPC_VPSHRDVD    (0x73 | P_EXT38 | P_DATA16 | P_EVEX)
#define OPC_VPSHRDVQ    (0x73 | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX)
#define OPC_VPSLLVW     (0x12 | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX)
#define OPC_VPSLLVD     (0x47 | P_EXT38 | P_DATA16)
#define OPC_VPSLLVQ     (0x47 | P_EXT38 | P_DATA16 | P_VEXW)
#define OPC_VPSRAVW     (0x11 | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX)
#define OPC_VPSRAVD     (0x46 | P_EXT38 | P_DATA16)
#define OPC_VPSRAVQ     (0x46 | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX)
#define OPC_VPSRLVW     (0x10 | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX)
#define OPC_VPSRLVD     (0x45 | P_EXT38 | P_DATA16)
#define OPC_VPSRLVQ     (0x45 | P_EXT38 | P_DATA16 | P_VEXW)
#define OPC_VPTERNLOGQ  (0x25 | P_EXT3A | P_DATA16 | P_VEXW | P_EVEX)
#define OPC_VZEROUPPER  (0x77 | P_EXT)
#define OPC_XCHG_ax_r32	(0x90)
#define OPC_XCHG_EvGv   (0x87)

/* Group opcodes: the operation is chosen by an extension code (EXT3_*,
   EXT5_*, ...) rather than by the opcode byte itself.  */
#define OPC_GRP3_Eb     (0xf6)
#define OPC_GRP3_Ev     (0xf7)
#define OPC_GRP5        (0xff)
/* Same encoding as OPC_PSHIFTQ_Ib above.  */
#define OPC_GRP14       (0x73 | P_EXT | P_DATA16)
444
/* Group 1 opcode extensions for 0x80-0x83.
   These are also used as modifiers for OPC_ARITH: shifted left by 3 and
   combined with OPC_ARITH_GvEv they select the register/memory form of
   the same operation (see OPC_ADD_GvEv, OPC_AND_GvEv, OPC_CMP_GvEv).  */
#define ARITH_ADD 0
#define ARITH_OR  1
#define ARITH_ADC 2
#define ARITH_SBB 3
#define ARITH_AND 4
#define ARITH_SUB 5
#define ARITH_XOR 6
#define ARITH_CMP 7

/* Group 2 opcode extensions for 0xc0, 0xc1, 0xd0-0xd3.  */
#define SHIFT_ROL 0
#define SHIFT_ROR 1
#define SHIFT_SHL 4
#define SHIFT_SHR 5
#define SHIFT_SAR 7

/* Group 3 opcode extensions for 0xf6, 0xf7.  To be used with OPC_GRP3.  */
#define EXT3_TESTi 0
#define EXT3_NOT   2
#define EXT3_NEG   3
#define EXT3_MUL   4
#define EXT3_IMUL  5
#define EXT3_DIV   6
#define EXT3_IDIV  7

/* Group 5 opcode extensions for 0xff.  To be used with OPC_GRP5.  */
#define EXT5_INC_Ev	0
#define EXT5_DEC_Ev	1
#define EXT5_CALLN_Ev	2
#define EXT5_JMPN_Ev	4

/* Condition codes to be added to OPC_JCC_{long,short}.
   JCC_JMP is not an encodable condition; presumably it is a marker for
   an unconditional jump -- its users are outside this section.  */
#define JCC_JMP (-1)
#define JCC_JO  0x0
#define JCC_JNO 0x1
#define JCC_JB  0x2
#define JCC_JAE 0x3
#define JCC_JE  0x4
#define JCC_JNE 0x5
#define JCC_JBE 0x6
#define JCC_JA  0x7
#define JCC_JS  0x8
#define JCC_JNS 0x9
#define JCC_JP  0xa
#define JCC_JNP 0xb
#define JCC_JL  0xc
#define JCC_JGE 0xd
#define JCC_JLE 0xe
#define JCC_JG  0xf
/*
 * Map a TCG comparison condition to the x86 condition code (JCC_*) that
 * tests it.  Signed comparisons use L/GE/LE/G and unsigned ones use
 * B/AE/BE/A.  Any index not listed here is zero-initialised, which is
 * numerically JCC_JO, so callers must only pass the conditions below.
 */
static const uint8_t tcg_cond_to_jcc[] = {
    [TCG_COND_EQ] = JCC_JE,
    [TCG_COND_NE] = JCC_JNE,
    [TCG_COND_LT] = JCC_JL,
    [TCG_COND_GE] = JCC_JGE,
    [TCG_COND_LE] = JCC_JLE,
    [TCG_COND_GT] = JCC_JG,
    [TCG_COND_LTU] = JCC_JB,
    [TCG_COND_GEU] = JCC_JAE,
    [TCG_COND_LEU] = JCC_JBE,
    [TCG_COND_GTU] = JCC_JA,
};
509
510#if TCG_TARGET_REG_BITS == 64
511static void tcg_out_opc(TCGContext *s, int opc, int r, int rm, int x)
512{
513    int rex;
514
515    if (opc & P_GS) {
516        tcg_out8(s, 0x65);
517    }
518    if (opc & P_DATA16) {
519        /* We should never be asking for both 16 and 64-bit operation.  */
520        tcg_debug_assert((opc & P_REXW) == 0);
521        tcg_out8(s, 0x66);
522    }
523    if (opc & P_SIMDF3) {
524        tcg_out8(s, 0xf3);
525    } else if (opc & P_SIMDF2) {
526        tcg_out8(s, 0xf2);
527    }
528
529    rex = 0;
530    rex |= (opc & P_REXW) ? 0x8 : 0x0;  /* REX.W */
531    rex |= (r & 8) >> 1;                /* REX.R */
532    rex |= (x & 8) >> 2;                /* REX.X */
533    rex |= (rm & 8) >> 3;               /* REX.B */
534
535    /* P_REXB_{R,RM} indicates that the given register is the low byte.
536       For %[abcd]l we need no REX prefix, but for %{si,di,bp,sp}l we do,
537       as otherwise the encoding indicates %[abcd]h.  Note that the values
538       that are ORed in merely indicate that the REX byte must be present;
539       those bits get discarded in output.  */
540    rex |= opc & (r >= 4 ? P_REXB_R : 0);
541    rex |= opc & (rm >= 4 ? P_REXB_RM : 0);
542
543    if (rex) {
544        tcg_out8(s, (uint8_t)(rex | 0x40));
545    }
546
547    if (opc & (P_EXT | P_EXT38 | P_EXT3A)) {
548        tcg_out8(s, 0x0f);
549        if (opc & P_EXT38) {
550            tcg_out8(s, 0x38);
551        } else if (opc & P_EXT3A) {
552            tcg_out8(s, 0x3a);
553        }
554    }
555
556    tcg_out8(s, opc);
557}
558#else
559static void tcg_out_opc(TCGContext *s, int opc)
560{
561    if (opc & P_DATA16) {
562        tcg_out8(s, 0x66);
563    }
564    if (opc & P_SIMDF3) {
565        tcg_out8(s, 0xf3);
566    } else if (opc & P_SIMDF2) {
567        tcg_out8(s, 0xf2);
568    }
569    if (opc & (P_EXT | P_EXT38 | P_EXT3A)) {
570        tcg_out8(s, 0x0f);
571        if (opc & P_EXT38) {
572            tcg_out8(s, 0x38);
573        } else if (opc & P_EXT3A) {
574            tcg_out8(s, 0x3a);
575        }
576    }
577    tcg_out8(s, opc);
578}
579/* Discard the register arguments to tcg_out_opc early, so as not to penalize
580   the 32-bit compilation paths.  This method works with all versions of gcc,
581   whereas relying on optimization may not be able to exclude them.  */
582#define tcg_out_opc(s, opc, r, rm, x)  (tcg_out_opc)(s, opc)
583#endif
584
585static void tcg_out_modrm(TCGContext *s, int opc, int r, int rm)
586{
587    tcg_out_opc(s, opc, r, rm, 0);
588    tcg_out8(s, 0xc0 | (LOWREGMASK(r) << 3) | LOWREGMASK(rm));
589}
590
591static void tcg_out_vex_opc(TCGContext *s, int opc, int r, int v,
592                            int rm, int index)
593{
594    int tmp;
595
596    if (opc & P_GS) {
597        tcg_out8(s, 0x65);
598    }
599    /* Use the two byte form if possible, which cannot encode
600       VEX.W, VEX.B, VEX.X, or an m-mmmm field other than P_EXT.  */
601    if ((opc & (P_EXT | P_EXT38 | P_EXT3A | P_VEXW)) == P_EXT
602        && ((rm | index) & 8) == 0) {
603        /* Two byte VEX prefix.  */
604        tcg_out8(s, 0xc5);
605
606        tmp = (r & 8 ? 0 : 0x80);              /* VEX.R */
607    } else {
608        /* Three byte VEX prefix.  */
609        tcg_out8(s, 0xc4);
610
611        /* VEX.m-mmmm */
612        if (opc & P_EXT3A) {
613            tmp = 3;
614        } else if (opc & P_EXT38) {
615            tmp = 2;
616        } else if (opc & P_EXT) {
617            tmp = 1;
618        } else {
619            g_assert_not_reached();
620        }
621        tmp |= (r & 8 ? 0 : 0x80);             /* VEX.R */
622        tmp |= (index & 8 ? 0 : 0x40);         /* VEX.X */
623        tmp |= (rm & 8 ? 0 : 0x20);            /* VEX.B */
624        tcg_out8(s, tmp);
625
626        tmp = (opc & P_VEXW ? 0x80 : 0);       /* VEX.W */
627    }
628
629    tmp |= (opc & P_VEXL ? 0x04 : 0);      /* VEX.L */
630    /* VEX.pp */
631    if (opc & P_DATA16) {
632        tmp |= 1;                          /* 0x66 */
633    } else if (opc & P_SIMDF3) {
634        tmp |= 2;                          /* 0xf3 */
635    } else if (opc & P_SIMDF2) {
636        tmp |= 3;                          /* 0xf2 */
637    }
638    tmp |= (~v & 15) << 3;                 /* VEX.vvvv */
639    tcg_out8(s, tmp);
640    tcg_out8(s, opc);
641}
642
643static void tcg_out_evex_opc(TCGContext *s, int opc, int r, int v,
644                             int rm, int index)
645{
646    /* The entire 4-byte evex prefix; with R' and V' set. */
647    uint32_t p = 0x08041062;
648    int mm, pp;
649
650    tcg_debug_assert(have_avx512vl);
651
652    /* EVEX.mm */
653    if (opc & P_EXT3A) {
654        mm = 3;
655    } else if (opc & P_EXT38) {
656        mm = 2;
657    } else if (opc & P_EXT) {
658        mm = 1;
659    } else {
660        g_assert_not_reached();
661    }
662
663    /* EVEX.pp */
664    if (opc & P_DATA16) {
665        pp = 1;                          /* 0x66 */
666    } else if (opc & P_SIMDF3) {
667        pp = 2;                          /* 0xf3 */
668    } else if (opc & P_SIMDF2) {
669        pp = 3;                          /* 0xf2 */
670    } else {
671        pp = 0;
672    }
673
674    p = deposit32(p, 8, 2, mm);
675    p = deposit32(p, 13, 1, (rm & 8) == 0);             /* EVEX.RXB.B */
676    p = deposit32(p, 14, 1, (index & 8) == 0);          /* EVEX.RXB.X */
677    p = deposit32(p, 15, 1, (r & 8) == 0);              /* EVEX.RXB.R */
678    p = deposit32(p, 16, 2, pp);
679    p = deposit32(p, 19, 4, ~v);
680    p = deposit32(p, 23, 1, (opc & P_VEXW) != 0);
681    p = deposit32(p, 29, 2, (opc & P_VEXL) != 0);
682
683    tcg_out32(s, p);
684    tcg_out8(s, opc);
685}
686
687static void tcg_out_vex_modrm(TCGContext *s, int opc, int r, int v, int rm)
688{
689    if (opc & P_EVEX) {
690        tcg_out_evex_opc(s, opc, r, v, rm, 0);
691    } else {
692        tcg_out_vex_opc(s, opc, r, v, rm, 0);
693    }
694    tcg_out8(s, 0xc0 | (LOWREGMASK(r) << 3) | LOWREGMASK(rm));
695}
696
697/* Output an opcode with a full "rm + (index<<shift) + offset" address mode.
698   We handle either RM and INDEX missing with a negative value.  In 64-bit
699   mode for absolute addresses, ~RM is the size of the immediate operand
700   that will follow the instruction.  */
701
702static void tcg_out_sib_offset(TCGContext *s, int r, int rm, int index,
703                               int shift, intptr_t offset)
704{
705    int mod, len;
706
707    if (index < 0 && rm < 0) {
708        if (TCG_TARGET_REG_BITS == 64) {
709            /* Try for a rip-relative addressing mode.  This has replaced
710               the 32-bit-mode absolute addressing encoding.  */
711            intptr_t pc = (intptr_t)s->code_ptr + 5 + ~rm;
712            intptr_t disp = offset - pc;
713            if (disp == (int32_t)disp) {
714                tcg_out8(s, (LOWREGMASK(r) << 3) | 5);
715                tcg_out32(s, disp);
716                return;
717            }
718
719            /* Try for an absolute address encoding.  This requires the
720               use of the MODRM+SIB encoding and is therefore larger than
721               rip-relative addressing.  */
722            if (offset == (int32_t)offset) {
723                tcg_out8(s, (LOWREGMASK(r) << 3) | 4);
724                tcg_out8(s, (4 << 3) | 5);
725                tcg_out32(s, offset);
726                return;
727            }
728
729            /* ??? The memory isn't directly addressable.  */
730            g_assert_not_reached();
731        } else {
732            /* Absolute address.  */
733            tcg_out8(s, (r << 3) | 5);
734            tcg_out32(s, offset);
735            return;
736        }
737    }
738
739    /* Find the length of the immediate addend.  Note that the encoding
740       that would be used for (%ebp) indicates absolute addressing.  */
741    if (rm < 0) {
742        mod = 0, len = 4, rm = 5;
743    } else if (offset == 0 && LOWREGMASK(rm) != TCG_REG_EBP) {
744        mod = 0, len = 0;
745    } else if (offset == (int8_t)offset) {
746        mod = 0x40, len = 1;
747    } else {
748        mod = 0x80, len = 4;
749    }
750
751    /* Use a single byte MODRM format if possible.  Note that the encoding
752       that would be used for %esp is the escape to the two byte form.  */
753    if (index < 0 && LOWREGMASK(rm) != TCG_REG_ESP) {
754        /* Single byte MODRM format.  */
755        tcg_out8(s, mod | (LOWREGMASK(r) << 3) | LOWREGMASK(rm));
756    } else {
757        /* Two byte MODRM+SIB format.  */
758
759        /* Note that the encoding that would place %esp into the index
760           field indicates no index register.  In 64-bit mode, the REX.X
761           bit counts, so %r12 can be used as the index.  */
762        if (index < 0) {
763            index = 4;
764        } else {
765            tcg_debug_assert(index != TCG_REG_ESP);
766        }
767
768        tcg_out8(s, mod | (LOWREGMASK(r) << 3) | 4);
769        tcg_out8(s, (shift << 6) | (LOWREGMASK(index) << 3) | LOWREGMASK(rm));
770    }
771
772    if (len == 1) {
773        tcg_out8(s, offset);
774    } else if (len == 4) {
775        tcg_out32(s, offset);
776    }
777}
778
779static void tcg_out_modrm_sib_offset(TCGContext *s, int opc, int r, int rm,
780                                     int index, int shift, intptr_t offset)
781{
782    tcg_out_opc(s, opc, r, rm < 0 ? 0 : rm, index < 0 ? 0 : index);
783    tcg_out_sib_offset(s, r, rm, index, shift, offset);
784}
785
786static void tcg_out_vex_modrm_sib_offset(TCGContext *s, int opc, int r, int v,
787                                         int rm, int index, int shift,
788                                         intptr_t offset)
789{
790    tcg_out_vex_opc(s, opc, r, v, rm < 0 ? 0 : rm, index < 0 ? 0 : index);
791    tcg_out_sib_offset(s, r, rm, index, shift, offset);
792}
793
794/* A simplification of the above with no index or shift.  */
795static inline void tcg_out_modrm_offset(TCGContext *s, int opc, int r,
796                                        int rm, intptr_t offset)
797{
798    tcg_out_modrm_sib_offset(s, opc, r, rm, -1, 0, offset);
799}
800
801static inline void tcg_out_vex_modrm_offset(TCGContext *s, int opc, int r,
802                                            int v, int rm, intptr_t offset)
803{
804    tcg_out_vex_modrm_sib_offset(s, opc, r, v, rm, -1, 0, offset);
805}
806
807/* Output an opcode with an expected reference to the constant pool.  */
808static inline void tcg_out_modrm_pool(TCGContext *s, int opc, int r)
809{
810    tcg_out_opc(s, opc, r, 0, 0);
811    /* Absolute for 32-bit, pc-relative for 64-bit.  */
812    tcg_out8(s, LOWREGMASK(r) << 3 | 5);
813    tcg_out32(s, 0);
814}
815
816/* Output an opcode with an expected reference to the constant pool.  */
817static inline void tcg_out_vex_modrm_pool(TCGContext *s, int opc, int r)
818{
819    tcg_out_vex_opc(s, opc, r, 0, 0, 0);
820    /* Absolute for 32-bit, pc-relative for 64-bit.  */
821    tcg_out8(s, LOWREGMASK(r) << 3 | 5);
822    tcg_out32(s, 0);
823}
824
825/* Generate dest op= src.  Uses the same ARITH_* codes as tgen_arithi.  */
826static inline void tgen_arithr(TCGContext *s, int subop, int dest, int src)
827{
828    /* Propagate an opcode prefix, such as P_REXW.  */
829    int ext = subop & ~0x7;
830    subop &= 0x7;
831
832    tcg_out_modrm(s, OPC_ARITH_GvEv + (subop << 3) + ext, dest, src);
833}
834
835static bool tcg_out_mov(TCGContext *s, TCGType type, TCGReg ret, TCGReg arg)
836{
837    int rexw = 0;
838
839    if (arg == ret) {
840        return true;
841    }
842    switch (type) {
843    case TCG_TYPE_I64:
844        rexw = P_REXW;
845        /* fallthru */
846    case TCG_TYPE_I32:
847        if (ret < 16) {
848            if (arg < 16) {
849                tcg_out_modrm(s, OPC_MOVL_GvEv + rexw, ret, arg);
850            } else {
851                tcg_out_vex_modrm(s, OPC_MOVD_EyVy + rexw, arg, 0, ret);
852            }
853        } else {
854            if (arg < 16) {
855                tcg_out_vex_modrm(s, OPC_MOVD_VyEy + rexw, ret, 0, arg);
856            } else {
857                tcg_out_vex_modrm(s, OPC_MOVQ_VqWq, ret, 0, arg);
858            }
859        }
860        break;
861
862    case TCG_TYPE_V64:
863        tcg_debug_assert(ret >= 16 && arg >= 16);
864        tcg_out_vex_modrm(s, OPC_MOVQ_VqWq, ret, 0, arg);
865        break;
866    case TCG_TYPE_V128:
867        tcg_debug_assert(ret >= 16 && arg >= 16);
868        tcg_out_vex_modrm(s, OPC_MOVDQA_VxWx, ret, 0, arg);
869        break;
870    case TCG_TYPE_V256:
871        tcg_debug_assert(ret >= 16 && arg >= 16);
872        tcg_out_vex_modrm(s, OPC_MOVDQA_VxWx | P_VEXL, ret, 0, arg);
873        break;
874
875    default:
876        g_assert_not_reached();
877    }
878    return true;
879}
880
881static const int avx2_dup_insn[4] = {
882    OPC_VPBROADCASTB, OPC_VPBROADCASTW,
883    OPC_VPBROADCASTD, OPC_VPBROADCASTQ,
884};
885
886static bool tcg_out_dup_vec(TCGContext *s, TCGType type, unsigned vece,
887                            TCGReg r, TCGReg a)
888{
889    if (have_avx2) {
890        int vex_l = (type == TCG_TYPE_V256 ? P_VEXL : 0);
891        tcg_out_vex_modrm(s, avx2_dup_insn[vece] + vex_l, r, 0, a);
892    } else {
893        switch (vece) {
894        case MO_8:
895            /* ??? With zero in a register, use PSHUFB.  */
896            tcg_out_vex_modrm(s, OPC_PUNPCKLBW, r, a, a);
897            a = r;
898            /* FALLTHRU */
899        case MO_16:
900            tcg_out_vex_modrm(s, OPC_PUNPCKLWD, r, a, a);
901            a = r;
902            /* FALLTHRU */
903        case MO_32:
904            tcg_out_vex_modrm(s, OPC_PSHUFD, r, 0, a);
905            /* imm8 operand: all output lanes selected from input lane 0.  */
906            tcg_out8(s, 0);
907            break;
908        case MO_64:
909            tcg_out_vex_modrm(s, OPC_PUNPCKLQDQ, r, a, a);
910            break;
911        default:
912            g_assert_not_reached();
913        }
914    }
915    return true;
916}
917
918static bool tcg_out_dupm_vec(TCGContext *s, TCGType type, unsigned vece,
919                             TCGReg r, TCGReg base, intptr_t offset)
920{
921    if (have_avx2) {
922        int vex_l = (type == TCG_TYPE_V256 ? P_VEXL : 0);
923        tcg_out_vex_modrm_offset(s, avx2_dup_insn[vece] + vex_l,
924                                 r, 0, base, offset);
925    } else {
926        switch (vece) {
927        case MO_64:
928            tcg_out_vex_modrm_offset(s, OPC_MOVDDUP, r, 0, base, offset);
929            break;
930        case MO_32:
931            tcg_out_vex_modrm_offset(s, OPC_VBROADCASTSS, r, 0, base, offset);
932            break;
933        case MO_16:
934            tcg_out_vex_modrm_offset(s, OPC_VPINSRW, r, r, base, offset);
935            tcg_out8(s, 0); /* imm8 */
936            tcg_out_dup_vec(s, type, vece, r, r);
937            break;
938        case MO_8:
939            tcg_out_vex_modrm_offset(s, OPC_VPINSRB, r, r, base, offset);
940            tcg_out8(s, 0); /* imm8 */
941            tcg_out_dup_vec(s, type, vece, r, r);
942            break;
943        default:
944            g_assert_not_reached();
945        }
946    }
947    return true;
948}
949
950static void tcg_out_dupi_vec(TCGContext *s, TCGType type, unsigned vece,
951                             TCGReg ret, int64_t arg)
952{
953    int vex_l = (type == TCG_TYPE_V256 ? P_VEXL : 0);
954
955    if (arg == 0) {
956        tcg_out_vex_modrm(s, OPC_PXOR, ret, ret, ret);
957        return;
958    }
959    if (arg == -1) {
960        tcg_out_vex_modrm(s, OPC_PCMPEQB + vex_l, ret, ret, ret);
961        return;
962    }
963
964    if (TCG_TARGET_REG_BITS == 32 && vece < MO_64) {
965        if (have_avx2) {
966            tcg_out_vex_modrm_pool(s, OPC_VPBROADCASTD + vex_l, ret);
967        } else {
968            tcg_out_vex_modrm_pool(s, OPC_VBROADCASTSS, ret);
969        }
970        new_pool_label(s, arg, R_386_32, s->code_ptr - 4, 0);
971    } else {
972        if (type == TCG_TYPE_V64) {
973            tcg_out_vex_modrm_pool(s, OPC_MOVQ_VqWq, ret);
974        } else if (have_avx2) {
975            tcg_out_vex_modrm_pool(s, OPC_VPBROADCASTQ + vex_l, ret);
976        } else {
977            tcg_out_vex_modrm_pool(s, OPC_MOVDDUP, ret);
978        }
979        if (TCG_TARGET_REG_BITS == 64) {
980            new_pool_label(s, arg, R_386_PC32, s->code_ptr - 4, -4);
981        } else {
982            new_pool_l2(s, R_386_32, s->code_ptr - 4, 0, arg, arg >> 32);
983        }
984    }
985}
986
987static void tcg_out_movi_vec(TCGContext *s, TCGType type,
988                             TCGReg ret, tcg_target_long arg)
989{
990    if (arg == 0) {
991        tcg_out_vex_modrm(s, OPC_PXOR, ret, ret, ret);
992        return;
993    }
994    if (arg == -1) {
995        tcg_out_vex_modrm(s, OPC_PCMPEQB, ret, ret, ret);
996        return;
997    }
998
999    int rexw = (type == TCG_TYPE_I32 ? 0 : P_REXW);
1000    tcg_out_vex_modrm_pool(s, OPC_MOVD_VyEy + rexw, ret);
1001    if (TCG_TARGET_REG_BITS == 64) {
1002        new_pool_label(s, arg, R_386_PC32, s->code_ptr - 4, -4);
1003    } else {
1004        new_pool_label(s, arg, R_386_32, s->code_ptr - 4, 0);
1005    }
1006}
1007
1008static void tcg_out_movi_int(TCGContext *s, TCGType type,
1009                             TCGReg ret, tcg_target_long arg)
1010{
1011    tcg_target_long diff;
1012
1013    if (arg == 0) {
1014        tgen_arithr(s, ARITH_XOR, ret, ret);
1015        return;
1016    }
1017    if (arg == (uint32_t)arg || type == TCG_TYPE_I32) {
1018        tcg_out_opc(s, OPC_MOVL_Iv + LOWREGMASK(ret), 0, ret, 0);
1019        tcg_out32(s, arg);
1020        return;
1021    }
1022    if (arg == (int32_t)arg) {
1023        tcg_out_modrm(s, OPC_MOVL_EvIz + P_REXW, 0, ret);
1024        tcg_out32(s, arg);
1025        return;
1026    }
1027
1028    /* Try a 7 byte pc-relative lea before the 10 byte movq.  */
1029    diff = tcg_pcrel_diff(s, (const void *)arg) - 7;
1030    if (diff == (int32_t)diff) {
1031        tcg_out_opc(s, OPC_LEA | P_REXW, ret, 0, 0);
1032        tcg_out8(s, (LOWREGMASK(ret) << 3) | 5);
1033        tcg_out32(s, diff);
1034        return;
1035    }
1036
1037    tcg_out_opc(s, OPC_MOVL_Iv + P_REXW + LOWREGMASK(ret), 0, ret, 0);
1038    tcg_out64(s, arg);
1039}
1040
1041static void tcg_out_movi(TCGContext *s, TCGType type,
1042                         TCGReg ret, tcg_target_long arg)
1043{
1044    switch (type) {
1045    case TCG_TYPE_I32:
1046#if TCG_TARGET_REG_BITS == 64
1047    case TCG_TYPE_I64:
1048#endif
1049        if (ret < 16) {
1050            tcg_out_movi_int(s, type, ret, arg);
1051        } else {
1052            tcg_out_movi_vec(s, type, ret, arg);
1053        }
1054        break;
1055    default:
1056        g_assert_not_reached();
1057    }
1058}
1059
1060static bool tcg_out_xchg(TCGContext *s, TCGType type, TCGReg r1, TCGReg r2)
1061{
1062    int rexw = type == TCG_TYPE_I32 ? 0 : P_REXW;
1063    tcg_out_modrm(s, OPC_XCHG_EvGv + rexw, r1, r2);
1064    return true;
1065}
1066
1067static void tcg_out_addi_ptr(TCGContext *s, TCGReg rd, TCGReg rs,
1068                             tcg_target_long imm)
1069{
1070    /* This function is only used for passing structs by reference. */
1071    tcg_debug_assert(imm == (int32_t)imm);
1072    tcg_out_modrm_offset(s, OPC_LEA | P_REXW, rd, rs, imm);
1073}
1074
1075static inline void tcg_out_pushi(TCGContext *s, tcg_target_long val)
1076{
1077    if (val == (int8_t)val) {
1078        tcg_out_opc(s, OPC_PUSH_Ib, 0, 0, 0);
1079        tcg_out8(s, val);
1080    } else if (val == (int32_t)val) {
1081        tcg_out_opc(s, OPC_PUSH_Iv, 0, 0, 0);
1082        tcg_out32(s, val);
1083    } else {
1084        g_assert_not_reached();
1085    }
1086}
1087
1088static inline void tcg_out_mb(TCGContext *s, TCGArg a0)
1089{
1090    /* Given the strength of x86 memory ordering, we only need care for
1091       store-load ordering.  Experimentally, "lock orl $0,0(%esp)" is
1092       faster than "mfence", so don't bother with the sse insn.  */
1093    if (a0 & TCG_MO_ST_LD) {
1094        tcg_out8(s, 0xf0);
1095        tcg_out_modrm_offset(s, OPC_ARITH_EvIb, ARITH_OR, TCG_REG_ESP, 0);
1096        tcg_out8(s, 0);
1097    }
1098}
1099
1100static inline void tcg_out_push(TCGContext *s, int reg)
1101{
1102    tcg_out_opc(s, OPC_PUSH_r32 + LOWREGMASK(reg), 0, reg, 0);
1103}
1104
1105static inline void tcg_out_pop(TCGContext *s, int reg)
1106{
1107    tcg_out_opc(s, OPC_POP_r32 + LOWREGMASK(reg), 0, reg, 0);
1108}
1109
1110static void tcg_out_ld(TCGContext *s, TCGType type, TCGReg ret,
1111                       TCGReg arg1, intptr_t arg2)
1112{
1113    switch (type) {
1114    case TCG_TYPE_I32:
1115        if (ret < 16) {
1116            tcg_out_modrm_offset(s, OPC_MOVL_GvEv, ret, arg1, arg2);
1117        } else {
1118            tcg_out_vex_modrm_offset(s, OPC_MOVD_VyEy, ret, 0, arg1, arg2);
1119        }
1120        break;
1121    case TCG_TYPE_I64:
1122        if (ret < 16) {
1123            tcg_out_modrm_offset(s, OPC_MOVL_GvEv | P_REXW, ret, arg1, arg2);
1124            break;
1125        }
1126        /* FALLTHRU */
1127    case TCG_TYPE_V64:
1128        /* There is no instruction that can validate 8-byte alignment.  */
1129        tcg_debug_assert(ret >= 16);
1130        tcg_out_vex_modrm_offset(s, OPC_MOVQ_VqWq, ret, 0, arg1, arg2);
1131        break;
1132    case TCG_TYPE_V128:
1133        /*
1134         * The gvec infrastructure is asserts that v128 vector loads
1135         * and stores use a 16-byte aligned offset.  Validate that the
1136         * final pointer is aligned by using an insn that will SIGSEGV.
1137         */
1138        tcg_debug_assert(ret >= 16);
1139        tcg_out_vex_modrm_offset(s, OPC_MOVDQA_VxWx, ret, 0, arg1, arg2);
1140        break;
1141    case TCG_TYPE_V256:
1142        /*
1143         * The gvec infrastructure only requires 16-byte alignment,
1144         * so here we must use an unaligned load.
1145         */
1146        tcg_debug_assert(ret >= 16);
1147        tcg_out_vex_modrm_offset(s, OPC_MOVDQU_VxWx | P_VEXL,
1148                                 ret, 0, arg1, arg2);
1149        break;
1150    default:
1151        g_assert_not_reached();
1152    }
1153}
1154
1155static void tcg_out_st(TCGContext *s, TCGType type, TCGReg arg,
1156                       TCGReg arg1, intptr_t arg2)
1157{
1158    switch (type) {
1159    case TCG_TYPE_I32:
1160        if (arg < 16) {
1161            tcg_out_modrm_offset(s, OPC_MOVL_EvGv, arg, arg1, arg2);
1162        } else {
1163            tcg_out_vex_modrm_offset(s, OPC_MOVD_EyVy, arg, 0, arg1, arg2);
1164        }
1165        break;
1166    case TCG_TYPE_I64:
1167        if (arg < 16) {
1168            tcg_out_modrm_offset(s, OPC_MOVL_EvGv | P_REXW, arg, arg1, arg2);
1169            break;
1170        }
1171        /* FALLTHRU */
1172    case TCG_TYPE_V64:
1173        /* There is no instruction that can validate 8-byte alignment.  */
1174        tcg_debug_assert(arg >= 16);
1175        tcg_out_vex_modrm_offset(s, OPC_MOVQ_WqVq, arg, 0, arg1, arg2);
1176        break;
1177    case TCG_TYPE_V128:
1178        /*
1179         * The gvec infrastructure is asserts that v128 vector loads
1180         * and stores use a 16-byte aligned offset.  Validate that the
1181         * final pointer is aligned by using an insn that will SIGSEGV.
1182         *
1183         * This specific instance is also used by TCG_CALL_RET_BY_VEC,
1184         * for _WIN64, which must have SSE2 but may not have AVX.
1185         */
1186        tcg_debug_assert(arg >= 16);
1187        if (have_avx1) {
1188            tcg_out_vex_modrm_offset(s, OPC_MOVDQA_WxVx, arg, 0, arg1, arg2);
1189        } else {
1190            tcg_out_modrm_offset(s, OPC_MOVDQA_WxVx, arg, arg1, arg2);
1191        }
1192        break;
1193    case TCG_TYPE_V256:
1194        /*
1195         * The gvec infrastructure only requires 16-byte alignment,
1196         * so here we must use an unaligned store.
1197         */
1198        tcg_debug_assert(arg >= 16);
1199        tcg_out_vex_modrm_offset(s, OPC_MOVDQU_WxVx | P_VEXL,
1200                                 arg, 0, arg1, arg2);
1201        break;
1202    default:
1203        g_assert_not_reached();
1204    }
1205}
1206
1207static bool tcg_out_sti(TCGContext *s, TCGType type, TCGArg val,
1208                        TCGReg base, intptr_t ofs)
1209{
1210    int rexw = 0;
1211    if (TCG_TARGET_REG_BITS == 64 && type == TCG_TYPE_I64) {
1212        if (val != (int32_t)val) {
1213            return false;
1214        }
1215        rexw = P_REXW;
1216    } else if (type != TCG_TYPE_I32) {
1217        return false;
1218    }
1219    tcg_out_modrm_offset(s, OPC_MOVL_EvIz | rexw, 0, base, ofs);
1220    tcg_out32(s, val);
1221    return true;
1222}
1223
1224static void tcg_out_shifti(TCGContext *s, int subopc, int reg, int count)
1225{
1226    /* Propagate an opcode prefix, such as P_DATA16.  */
1227    int ext = subopc & ~0x7;
1228    subopc &= 0x7;
1229
1230    if (count == 1) {
1231        tcg_out_modrm(s, OPC_SHIFT_1 + ext, subopc, reg);
1232    } else {
1233        tcg_out_modrm(s, OPC_SHIFT_Ib + ext, subopc, reg);
1234        tcg_out8(s, count);
1235    }
1236}
1237
1238static inline void tcg_out_bswap32(TCGContext *s, int reg)
1239{
1240    tcg_out_opc(s, OPC_BSWAP + LOWREGMASK(reg), 0, reg, 0);
1241}
1242
1243static inline void tcg_out_rolw_8(TCGContext *s, int reg)
1244{
1245    tcg_out_shifti(s, SHIFT_ROL + P_DATA16, reg, 8);
1246}
1247
1248static void tcg_out_ext8u(TCGContext *s, TCGReg dest, TCGReg src)
1249{
1250    /* movzbl */
1251    tcg_debug_assert(src < 4 || TCG_TARGET_REG_BITS == 64);
1252    tcg_out_modrm(s, OPC_MOVZBL + P_REXB_RM, dest, src);
1253}
1254
1255static void tcg_out_ext8s(TCGContext *s, TCGType type, TCGReg dest, TCGReg src)
1256{
1257    int rexw = type == TCG_TYPE_I32 ? 0 : P_REXW;
1258    /* movsbl */
1259    tcg_debug_assert(src < 4 || TCG_TARGET_REG_BITS == 64);
1260    tcg_out_modrm(s, OPC_MOVSBL + P_REXB_RM + rexw, dest, src);
1261}
1262
1263static void tcg_out_ext16u(TCGContext *s, TCGReg dest, TCGReg src)
1264{
1265    /* movzwl */
1266    tcg_out_modrm(s, OPC_MOVZWL, dest, src);
1267}
1268
1269static void tcg_out_ext16s(TCGContext *s, TCGType type, TCGReg dest, TCGReg src)
1270{
1271    int rexw = type == TCG_TYPE_I32 ? 0 : P_REXW;
1272    /* movsw[lq] */
1273    tcg_out_modrm(s, OPC_MOVSWL + rexw, dest, src);
1274}
1275
1276static void tcg_out_ext32u(TCGContext *s, TCGReg dest, TCGReg src)
1277{
1278    /* 32-bit mov zero extends.  */
1279    tcg_out_modrm(s, OPC_MOVL_GvEv, dest, src);
1280}
1281
1282static void tcg_out_ext32s(TCGContext *s, TCGReg dest, TCGReg src)
1283{
1284    tcg_debug_assert(TCG_TARGET_REG_BITS == 64);
1285    tcg_out_modrm(s, OPC_MOVSLQ, dest, src);
1286}
1287
1288static void tcg_out_exts_i32_i64(TCGContext *s, TCGReg dest, TCGReg src)
1289{
1290    tcg_out_ext32s(s, dest, src);
1291}
1292
1293static void tcg_out_extu_i32_i64(TCGContext *s, TCGReg dest, TCGReg src)
1294{
1295    if (dest != src) {
1296        tcg_out_ext32u(s, dest, src);
1297    }
1298}
1299
1300static void tcg_out_extrl_i64_i32(TCGContext *s, TCGReg dest, TCGReg src)
1301{
1302    tcg_out_ext32u(s, dest, src);
1303}
1304
1305static inline void tcg_out_bswap64(TCGContext *s, int reg)
1306{
1307    tcg_out_opc(s, OPC_BSWAP + P_REXW + LOWREGMASK(reg), 0, reg, 0);
1308}
1309
1310static void tgen_arithi(TCGContext *s, int c, int r0,
1311                        tcg_target_long val, int cf)
1312{
1313    int rexw = 0;
1314
1315    if (TCG_TARGET_REG_BITS == 64) {
1316        rexw = c & -8;
1317        c &= 7;
1318    }
1319
1320    switch (c) {
1321    case ARITH_ADD:
1322    case ARITH_SUB:
1323        if (!cf) {
1324            /*
1325             * ??? While INC is 2 bytes shorter than ADDL $1, they also induce
1326             * partial flags update stalls on Pentium4 and are not recommended
1327             * by current Intel optimization manuals.
1328             */
1329            if (val == 1 || val == -1) {
1330                int is_inc = (c == ARITH_ADD) ^ (val < 0);
1331                if (TCG_TARGET_REG_BITS == 64) {
1332                    /*
1333                     * The single-byte increment encodings are re-tasked
1334                     * as the REX prefixes.  Use the MODRM encoding.
1335                     */
1336                    tcg_out_modrm(s, OPC_GRP5 + rexw,
1337                                  (is_inc ? EXT5_INC_Ev : EXT5_DEC_Ev), r0);
1338                } else {
1339                    tcg_out8(s, (is_inc ? OPC_INC_r32 : OPC_DEC_r32) + r0);
1340                }
1341                return;
1342            }
1343            if (val == 128) {
1344                /*
1345                 * Facilitate using an 8-bit immediate.  Carry is inverted
1346                 * by this transformation, so do it only if cf == 0.
1347                 */
1348                c ^= ARITH_ADD ^ ARITH_SUB;
1349                val = -128;
1350            }
1351        }
1352        break;
1353
1354    case ARITH_AND:
1355        if (TCG_TARGET_REG_BITS == 64) {
1356            if (val == 0xffffffffu) {
1357                tcg_out_ext32u(s, r0, r0);
1358                return;
1359            }
1360            if (val == (uint32_t)val) {
1361                /* AND with no high bits set can use a 32-bit operation.  */
1362                rexw = 0;
1363            }
1364        }
1365        if (val == 0xffu && (r0 < 4 || TCG_TARGET_REG_BITS == 64)) {
1366            tcg_out_ext8u(s, r0, r0);
1367            return;
1368        }
1369        if (val == 0xffffu) {
1370            tcg_out_ext16u(s, r0, r0);
1371            return;
1372        }
1373        break;
1374
1375    case ARITH_OR:
1376    case ARITH_XOR:
1377        if (val >= 0x80 && val <= 0xff
1378            && (r0 < 4 || TCG_TARGET_REG_BITS == 64)) {
1379            tcg_out_modrm(s, OPC_ARITH_EbIb + P_REXB_RM, c, r0);
1380            tcg_out8(s, val);
1381            return;
1382        }
1383        break;
1384    }
1385
1386    if (val == (int8_t)val) {
1387        tcg_out_modrm(s, OPC_ARITH_EvIb + rexw, c, r0);
1388        tcg_out8(s, val);
1389        return;
1390    }
1391    if (rexw == 0 || val == (int32_t)val) {
1392        tcg_out_modrm(s, OPC_ARITH_EvIz + rexw, c, r0);
1393        tcg_out32(s, val);
1394        return;
1395    }
1396
1397    g_assert_not_reached();
1398}
1399
1400static void tcg_out_addi(TCGContext *s, int reg, tcg_target_long val)
1401{
1402    if (val != 0) {
1403        tgen_arithi(s, ARITH_ADD + P_REXW, reg, val, 0);
1404    }
1405}
1406
1407/* Set SMALL to force a short forward branch.  */
1408static void tcg_out_jxx(TCGContext *s, int opc, TCGLabel *l, bool small)
1409{
1410    int32_t val, val1;
1411
1412    if (l->has_value) {
1413        val = tcg_pcrel_diff(s, l->u.value_ptr);
1414        val1 = val - 2;
1415        if ((int8_t)val1 == val1) {
1416            if (opc == -1) {
1417                tcg_out8(s, OPC_JMP_short);
1418            } else {
1419                tcg_out8(s, OPC_JCC_short + opc);
1420            }
1421            tcg_out8(s, val1);
1422        } else {
1423            tcg_debug_assert(!small);
1424            if (opc == -1) {
1425                tcg_out8(s, OPC_JMP_long);
1426                tcg_out32(s, val - 5);
1427            } else {
1428                tcg_out_opc(s, OPC_JCC_long + opc, 0, 0, 0);
1429                tcg_out32(s, val - 6);
1430            }
1431        }
1432    } else if (small) {
1433        if (opc == -1) {
1434            tcg_out8(s, OPC_JMP_short);
1435        } else {
1436            tcg_out8(s, OPC_JCC_short + opc);
1437        }
1438        tcg_out_reloc(s, s->code_ptr, R_386_PC8, l, -1);
1439        s->code_ptr += 1;
1440    } else {
1441        if (opc == -1) {
1442            tcg_out8(s, OPC_JMP_long);
1443        } else {
1444            tcg_out_opc(s, OPC_JCC_long + opc, 0, 0, 0);
1445        }
1446        tcg_out_reloc(s, s->code_ptr, R_386_PC32, l, -4);
1447        s->code_ptr += 4;
1448    }
1449}
1450
1451static void tcg_out_cmp(TCGContext *s, TCGArg arg1, TCGArg arg2,
1452                        int const_arg2, int rexw)
1453{
1454    if (const_arg2) {
1455        if (arg2 == 0) {
1456            /* test r, r */
1457            tcg_out_modrm(s, OPC_TESTL + rexw, arg1, arg1);
1458        } else {
1459            tgen_arithi(s, ARITH_CMP + rexw, arg1, arg2, 0);
1460        }
1461    } else {
1462        tgen_arithr(s, ARITH_CMP + rexw, arg1, arg2);
1463    }
1464}
1465
1466static void tcg_out_brcond(TCGContext *s, int rexw, TCGCond cond,
1467                           TCGArg arg1, TCGArg arg2, int const_arg2,
1468                           TCGLabel *label, bool small)
1469{
1470    tcg_out_cmp(s, arg1, arg2, const_arg2, rexw);
1471    tcg_out_jxx(s, tcg_cond_to_jcc[cond], label, small);
1472}
1473
#if TCG_TARGET_REG_BITS == 32
/*
 * Emit a conditional branch on a double-word comparison.
 *
 * As used below, args[0]/args[2] are compared as the low parts and
 * args[1]/args[3] as the high parts, args[4] is the condition, and
 * args[5] is the target label.  const_args[2] and const_args[3] tell
 * whether the respective right-hand operands are constants.
 *
 * For ordered comparisons the high parts decide unless they are equal,
 * in which case the low parts are compared unsigned.
 */
static void tcg_out_brcond2(TCGContext *s, const TCGArg *args,
                            const int *const_args, bool small)
{
    TCGLabel *label_next = gen_new_label();
    TCGLabel *label_this = arg_label(args[5]);
    TCGCond cond = args[4];

    if (cond == TCG_COND_EQ) {
        /* Low parts differ: not equal, skip.  Otherwise test high parts.  */
        tcg_out_brcond(s, 0, TCG_COND_NE, args[0], args[2], const_args[2],
                       label_next, 1);
        tcg_out_brcond(s, 0, TCG_COND_EQ, args[1], args[3], const_args[3],
                       label_this, small);
    } else if (cond == TCG_COND_NE) {
        /* Either half differing is sufficient.  */
        tcg_out_brcond(s, 0, TCG_COND_NE, args[0], args[2], const_args[2],
                       label_this, small);
        tcg_out_brcond(s, 0, TCG_COND_NE, args[1], args[3], const_args[3],
                       label_this, small);
    } else {
        TCGCond hi_cond, lo_cond;

        switch (cond) {
        case TCG_COND_LT:
            hi_cond = TCG_COND_LT;
            lo_cond = TCG_COND_LTU;
            break;
        case TCG_COND_LE:
            hi_cond = TCG_COND_LT;
            lo_cond = TCG_COND_LEU;
            break;
        case TCG_COND_GT:
            hi_cond = TCG_COND_GT;
            lo_cond = TCG_COND_GTU;
            break;
        case TCG_COND_GE:
            hi_cond = TCG_COND_GT;
            lo_cond = TCG_COND_GEU;
            break;
        case TCG_COND_LTU:
            hi_cond = TCG_COND_LTU;
            lo_cond = TCG_COND_LTU;
            break;
        case TCG_COND_LEU:
            hi_cond = TCG_COND_LTU;
            lo_cond = TCG_COND_LEU;
            break;
        case TCG_COND_GTU:
            hi_cond = TCG_COND_GTU;
            lo_cond = TCG_COND_GTU;
            break;
        case TCG_COND_GEU:
            hi_cond = TCG_COND_GTU;
            lo_cond = TCG_COND_GEU;
            break;
        default:
            g_assert_not_reached();
        }

        /* Strict comparison of the high parts decides when it holds.  */
        tcg_out_brcond(s, 0, hi_cond, args[1], args[3], const_args[3],
                       label_this, small);
        /* High parts differ the other way: condition is false.  */
        tcg_out_jxx(s, JCC_JNE, label_next, 1);
        /* High parts equal: the low parts decide, compared unsigned.  */
        tcg_out_brcond(s, 0, lo_cond, args[0], args[2], const_args[2],
                       label_this, small);
    }

    tcg_out_label(s, label_next);
}
#endif
1556
1557static void tcg_out_setcond(TCGContext *s, int rexw, TCGCond cond,
1558                            TCGArg dest, TCGArg arg1, TCGArg arg2,
1559                            int const_arg2, bool neg)
1560{
1561    bool inv = false;
1562    bool cleared;
1563
1564    switch (cond) {
1565    case TCG_COND_NE:
1566        inv = true;
1567        /* fall through */
1568    case TCG_COND_EQ:
1569        /* If arg2 is 0, convert to LTU/GEU vs 1. */
1570        if (const_arg2 && arg2 == 0) {
1571            arg2 = 1;
1572            goto do_ltu;
1573        }
1574        break;
1575
1576    case TCG_COND_LEU:
1577        inv = true;
1578        /* fall through */
1579    case TCG_COND_GTU:
1580        /* If arg2 is a register, swap for LTU/GEU. */
1581        if (!const_arg2) {
1582            TCGReg t = arg1;
1583            arg1 = arg2;
1584            arg2 = t;
1585            goto do_ltu;
1586        }
1587        break;
1588
1589    case TCG_COND_GEU:
1590        inv = true;
1591        /* fall through */
1592    case TCG_COND_LTU:
1593    do_ltu:
1594        /*
1595         * Relying on the carry bit, use SBB to produce -1 if LTU, 0 if GEU.
1596         * We can then use NEG or INC to produce the desired result.
1597         * This is always smaller than the SETCC expansion.
1598         */
1599        tcg_out_cmp(s, arg1, arg2, const_arg2, rexw);
1600
1601        /* X - X - C = -C = (C ? -1 : 0) */
1602        tgen_arithr(s, ARITH_SBB + (neg ? rexw : 0), dest, dest);
1603        if (inv && neg) {
1604            /* ~(C ? -1 : 0) = (C ? 0 : -1) */
1605            tcg_out_modrm(s, OPC_GRP3_Ev + rexw, EXT3_NOT, dest);
1606        } else if (inv) {
1607            /* (C ? -1 : 0) + 1 = (C ? 0 : 1) */
1608            tgen_arithi(s, ARITH_ADD, dest, 1, 0);
1609        } else if (!neg) {
1610            /* -(C ? -1 : 0) = (C ? 1 : 0) */
1611            tcg_out_modrm(s, OPC_GRP3_Ev, EXT3_NEG, dest);
1612        }
1613        return;
1614
1615    case TCG_COND_GE:
1616        inv = true;
1617        /* fall through */
1618    case TCG_COND_LT:
1619        /* If arg2 is 0, extract the sign bit. */
1620        if (const_arg2 && arg2 == 0) {
1621            tcg_out_mov(s, rexw ? TCG_TYPE_I64 : TCG_TYPE_I32, dest, arg1);
1622            if (inv) {
1623                tcg_out_modrm(s, OPC_GRP3_Ev + rexw, EXT3_NOT, dest);
1624            }
1625            tcg_out_shifti(s, (neg ? SHIFT_SAR : SHIFT_SHR) + rexw,
1626                           dest, rexw ? 63 : 31);
1627            return;
1628        }
1629        break;
1630
1631    default:
1632        break;
1633    }
1634
1635    /*
1636     * If dest does not overlap the inputs, clearing it first is preferred.
1637     * The XOR breaks any false dependency for the low-byte write to dest,
1638     * and is also one byte smaller than MOVZBL.
1639     */
1640    cleared = false;
1641    if (dest != arg1 && (const_arg2 || dest != arg2)) {
1642        tgen_arithr(s, ARITH_XOR, dest, dest);
1643        cleared = true;
1644    }
1645
1646    tcg_out_cmp(s, arg1, arg2, const_arg2, rexw);
1647    tcg_out_modrm(s, OPC_SETCC | tcg_cond_to_jcc[cond], 0, dest);
1648
1649    if (!cleared) {
1650        tcg_out_ext8u(s, dest, dest);
1651    }
1652    if (neg) {
1653        tcg_out_modrm(s, OPC_GRP3_Ev + rexw, EXT3_NEG, dest);
1654    }
1655}
1656
#if TCG_TARGET_REG_BITS == 32
/*
 * Emit code that sets args[0] to 1 or 0 from a double-word comparison.
 * args[1..5] and const_args[1..] are handed to tcg_out_brcond2 as its
 * own args[0..4] / const_args[0..], with a locally created label as the
 * branch target.
 */
static void tcg_out_setcond2(TCGContext *s, const TCGArg *args,
                             const int *const_args)
{
    TCGArg br_args[6];
    TCGLabel *label_true, *label_over;
    bool overlap;

    /* br_args[0..4] = args[1..5]; slot 5 receives the label below.  */
    memcpy(br_args, args + 1, 5 * sizeof(TCGArg));

    overlap = args[0] == args[1] || args[0] == args[2]
              || (!const_args[3] && args[0] == args[3])
              || (!const_args[4] && args[0] == args[4]);

    if (!overlap) {
        /*
         * The destination does not overlap any argument: clear it first,
         * jump over if the condition is false, and increment in the true
         * case.  This results in smaller code.
         */
        tcg_out_movi(s, TCG_TYPE_I32, args[0], 0);

        label_over = gen_new_label();
        br_args[4] = tcg_invert_cond(br_args[4]);
        br_args[5] = label_arg(label_over);
        tcg_out_brcond2(s, br_args, const_args + 1, 1);

        tgen_arithi(s, ARITH_ADD, args[0], 1, 0);
        tcg_out_label(s, label_over);
        return;
    }

    /*
     * The destination overlaps one of the argument registers, so don't
     * do anything tricky: branch, then load 0 or 1.
     */
    label_true = gen_new_label();
    label_over = gen_new_label();

    br_args[5] = label_arg(label_true);
    tcg_out_brcond2(s, br_args, const_args + 1, 1);

    tcg_out_movi(s, TCG_TYPE_I32, args[0], 0);
    tcg_out_jxx(s, JCC_JMP, label_over, 1);
    tcg_out_label(s, label_true);

    tcg_out_movi(s, TCG_TYPE_I32, args[0], 1);
    tcg_out_label(s, label_over);
}
#endif
1700
1701static void tcg_out_cmov(TCGContext *s, TCGCond cond, int rexw,
1702                         TCGReg dest, TCGReg v1)
1703{
1704    if (have_cmov) {
1705        tcg_out_modrm(s, OPC_CMOVCC | tcg_cond_to_jcc[cond] | rexw, dest, v1);
1706    } else {
1707        TCGLabel *over = gen_new_label();
1708        tcg_out_jxx(s, tcg_cond_to_jcc[tcg_invert_cond(cond)], over, 1);
1709        tcg_out_mov(s, TCG_TYPE_I32, dest, v1);
1710        tcg_out_label(s, over);
1711    }
1712}
1713
1714static void tcg_out_movcond(TCGContext *s, int rexw, TCGCond cond,
1715                            TCGReg dest, TCGReg c1, TCGArg c2, int const_c2,
1716                            TCGReg v1)
1717{
1718    tcg_out_cmp(s, c1, c2, const_c2, rexw);
1719    tcg_out_cmov(s, cond, rexw, dest, v1);
1720}
1721
1722static void tcg_out_ctz(TCGContext *s, int rexw, TCGReg dest, TCGReg arg1,
1723                        TCGArg arg2, bool const_a2)
1724{
1725    if (have_bmi1) {
1726        tcg_out_modrm(s, OPC_TZCNT + rexw, dest, arg1);
1727        if (const_a2) {
1728            tcg_debug_assert(arg2 == (rexw ? 64 : 32));
1729        } else {
1730            tcg_debug_assert(dest != arg2);
1731            tcg_out_cmov(s, TCG_COND_LTU, rexw, dest, arg2);
1732        }
1733    } else {
1734        tcg_debug_assert(dest != arg2);
1735        tcg_out_modrm(s, OPC_BSF + rexw, dest, arg1);
1736        tcg_out_cmov(s, TCG_COND_EQ, rexw, dest, arg2);
1737    }
1738}
1739
1740static void tcg_out_clz(TCGContext *s, int rexw, TCGReg dest, TCGReg arg1,
1741                        TCGArg arg2, bool const_a2)
1742{
1743    if (have_lzcnt) {
1744        tcg_out_modrm(s, OPC_LZCNT + rexw, dest, arg1);
1745        if (const_a2) {
1746            tcg_debug_assert(arg2 == (rexw ? 64 : 32));
1747        } else {
1748            tcg_debug_assert(dest != arg2);
1749            tcg_out_cmov(s, TCG_COND_LTU, rexw, dest, arg2);
1750        }
1751    } else {
1752        tcg_debug_assert(!const_a2);
1753        tcg_debug_assert(dest != arg1);
1754        tcg_debug_assert(dest != arg2);
1755
1756        /* Recall that the output of BSR is the index not the count.  */
1757        tcg_out_modrm(s, OPC_BSR + rexw, dest, arg1);
1758        tgen_arithi(s, ARITH_XOR + rexw, dest, rexw ? 63 : 31, 0);
1759
1760        /* Since we have destroyed the flags from BSR, we have to re-test.  */
1761        tcg_out_cmp(s, arg1, 0, 1, rexw);
1762        tcg_out_cmov(s, TCG_COND_EQ, rexw, dest, arg2);
1763    }
1764}
1765
1766static void tcg_out_branch(TCGContext *s, int call, const tcg_insn_unit *dest)
1767{
1768    intptr_t disp = tcg_pcrel_diff(s, dest) - 5;
1769
1770    if (disp == (int32_t)disp) {
1771        tcg_out_opc(s, call ? OPC_CALL_Jz : OPC_JMP_long, 0, 0, 0);
1772        tcg_out32(s, disp);
1773    } else {
1774        /* rip-relative addressing into the constant pool.
1775           This is 6 + 8 = 14 bytes, as compared to using an
1776           immediate load 10 + 6 = 16 bytes, plus we may
1777           be able to re-use the pool constant for more calls.  */
1778        tcg_out_opc(s, OPC_GRP5, 0, 0, 0);
1779        tcg_out8(s, (call ? EXT5_CALLN_Ev : EXT5_JMPN_Ev) << 3 | 5);
1780        new_pool_label(s, (uintptr_t)dest, R_386_PC32, s->code_ptr, -4);
1781        tcg_out32(s, 0);
1782    }
1783}
1784
1785static void tcg_out_call(TCGContext *s, const tcg_insn_unit *dest,
1786                         const TCGHelperInfo *info)
1787{
1788    tcg_out_branch(s, 1, dest);
1789
1790#ifndef _WIN32
1791    if (TCG_TARGET_REG_BITS == 32 && info->out_kind == TCG_CALL_RET_BY_REF) {
1792        /*
1793         * The sysv i386 abi for struct return places a reference as the
1794         * first argument of the stack, and pops that argument with the
1795         * return statement.  Since we want to retain the aligned stack
1796         * pointer for the callee, we do not want to actually push that
1797         * argument before the call but rely on the normal store to the
1798         * stack slot.  But we do need to compensate for the pop in order
1799         * to reset our correct stack pointer value.
1800         * Pushing a garbage value back onto the stack is quickest.
1801         */
1802        tcg_out_push(s, TCG_REG_EAX);
1803    }
1804#endif
1805}
1806
1807static void tcg_out_jmp(TCGContext *s, const tcg_insn_unit *dest)
1808{
1809    tcg_out_branch(s, 0, dest);
1810}
1811
1812static void tcg_out_nopn(TCGContext *s, int n)
1813{
1814    int i;
1815    /* Emit 1 or 2 operand size prefixes for the standard one byte nop,
1816     * "xchg %eax,%eax", forming "xchg %ax,%ax". All cores accept the
1817     * duplicate prefix, and all of the interesting recent cores can
1818     * decode and discard the duplicates in a single cycle.
1819     */
1820    tcg_debug_assert(n >= 1);
1821    for (i = 1; i < n; ++i) {
1822        tcg_out8(s, 0x66);
1823    }
1824    tcg_out8(s, 0x90);
1825}
1826
1827/* Test register R vs immediate bits I, setting Z flag for EQ/NE. */
1828static void __attribute__((unused))
1829tcg_out_testi(TCGContext *s, TCGReg r, uint32_t i)
1830{
1831    /*
1832     * This is used for testing alignment, so we can usually use testb.
1833     * For i686, we have to use testl for %esi/%edi.
1834     */
1835    if (i <= 0xff && (TCG_TARGET_REG_BITS == 64 || r < 4)) {
1836        tcg_out_modrm(s, OPC_GRP3_Eb | P_REXB_RM, EXT3_TESTi, r);
1837        tcg_out8(s, i);
1838    } else {
1839        tcg_out_modrm(s, OPC_GRP3_Ev, EXT3_TESTi, r);
1840        tcg_out32(s, i);
1841    }
1842}
1843
1844typedef struct {
1845    TCGReg base;
1846    int index;
1847    int ofs;
1848    int seg;
1849    TCGAtomAlign aa;
1850} HostAddress;
1851
1852bool tcg_target_has_memory_bswap(MemOp memop)
1853{
1854    TCGAtomAlign aa;
1855
1856    if (!have_movbe) {
1857        return false;
1858    }
1859    if ((memop & MO_SIZE) < MO_128) {
1860        return true;
1861    }
1862
1863    /*
1864     * Reject 16-byte memop with 16-byte atomicity, i.e. VMOVDQA,
1865     * but do allow a pair of 64-bit operations, i.e. MOVBEQ.
1866     */
1867    aa = atom_and_align_for_opc(tcg_ctx, memop, MO_ATOM_IFALIGN, true);
1868    return aa.atom < MO_128;
1869}
1870
1871/*
1872 * Because i686 has no register parameters and because x86_64 has xchg
1873 * to handle addr/data register overlap, we have placed all input arguments
1874 * before we need might need a scratch reg.
1875 *
1876 * Even then, a scratch is only needed for l->raddr.  Rather than expose
1877 * a general-purpose scratch when we don't actually know it's available,
1878 * use the ra_gen hook to load into RAX if needed.
1879 */
1880#if TCG_TARGET_REG_BITS == 64
1881static TCGReg ldst_ra_gen(TCGContext *s, const TCGLabelQemuLdst *l, int arg)
1882{
1883    if (arg < 0) {
1884        arg = TCG_REG_RAX;
1885    }
1886    tcg_out_movi(s, TCG_TYPE_PTR, arg, (uintptr_t)l->raddr);
1887    return arg;
1888}
1889static const TCGLdstHelperParam ldst_helper_param = {
1890    .ra_gen = ldst_ra_gen
1891};
1892#else
1893static const TCGLdstHelperParam ldst_helper_param = { };
1894#endif
1895
1896static void tcg_out_vec_to_pair(TCGContext *s, TCGType type,
1897                                TCGReg l, TCGReg h, TCGReg v)
1898{
1899    int rexw = type == TCG_TYPE_I32 ? 0 : P_REXW;
1900
1901    /* vpmov{d,q} %v, %l */
1902    tcg_out_vex_modrm(s, OPC_MOVD_EyVy + rexw, v, 0, l);
1903    /* vpextr{d,q} $1, %v, %h */
1904    tcg_out_vex_modrm(s, OPC_PEXTRD + rexw, v, 0, h);
1905    tcg_out8(s, 1);
1906}
1907
1908static void tcg_out_pair_to_vec(TCGContext *s, TCGType type,
1909                                TCGReg v, TCGReg l, TCGReg h)
1910{
1911    int rexw = type == TCG_TYPE_I32 ? 0 : P_REXW;
1912
1913    /* vmov{d,q} %l, %v */
1914    tcg_out_vex_modrm(s, OPC_MOVD_VyEy + rexw, v, 0, l);
1915    /* vpinsr{d,q} $1, %h, %v, %v */
1916    tcg_out_vex_modrm(s, OPC_PINSRD + rexw, v, v, h);
1917    tcg_out8(s, 1);
1918}
1919
1920/*
1921 * Generate code for the slow path for a load at the end of block
1922 */
1923static bool tcg_out_qemu_ld_slow_path(TCGContext *s, TCGLabelQemuLdst *l)
1924{
1925    MemOp opc = get_memop(l->oi);
1926    tcg_insn_unit **label_ptr = &l->label_ptr[0];
1927
1928    /* resolve label address */
1929    tcg_patch32(label_ptr[0], s->code_ptr - label_ptr[0] - 4);
1930    if (label_ptr[1]) {
1931        tcg_patch32(label_ptr[1], s->code_ptr - label_ptr[1] - 4);
1932    }
1933
1934    tcg_out_ld_helper_args(s, l, &ldst_helper_param);
1935    tcg_out_branch(s, 1, qemu_ld_helpers[opc & MO_SIZE]);
1936    tcg_out_ld_helper_ret(s, l, false, &ldst_helper_param);
1937
1938    tcg_out_jmp(s, l->raddr);
1939    return true;
1940}
1941
1942/*
1943 * Generate code for the slow path for a store at the end of block
1944 */
1945static bool tcg_out_qemu_st_slow_path(TCGContext *s, TCGLabelQemuLdst *l)
1946{
1947    MemOp opc = get_memop(l->oi);
1948    tcg_insn_unit **label_ptr = &l->label_ptr[0];
1949
1950    /* resolve label address */
1951    tcg_patch32(label_ptr[0], s->code_ptr - label_ptr[0] - 4);
1952    if (label_ptr[1]) {
1953        tcg_patch32(label_ptr[1], s->code_ptr - label_ptr[1] - 4);
1954    }
1955
1956    tcg_out_st_helper_args(s, l, &ldst_helper_param);
1957    tcg_out_branch(s, 1, qemu_st_helpers[opc & MO_SIZE]);
1958
1959    tcg_out_jmp(s, l->raddr);
1960    return true;
1961}
1962
#ifdef CONFIG_USER_ONLY
/*
 * Host address template copied into the HostAddress by prepare_host_addr
 * when softmmu is not in use.  It starts out with no index register.
 */
static HostAddress x86_guest_base = {
    .index = -1
};

#if defined(__x86_64__) && defined(__linux__)
# include <asm/prctl.h>
# include <sys/prctl.h>
int arch_prctl(int code, unsigned long addr);
/*
 * Ask the kernel to set the GS base to guest_base.
 * Returns P_GS if the call succeeds, 0 otherwise.
 */
static inline int setup_guest_base_seg(void)
{
    return arch_prctl(ARCH_SET_GS, guest_base) == 0 ? P_GS : 0;
}
#define setup_guest_base_seg  setup_guest_base_seg
#elif defined(__x86_64__) && \
      (defined (__FreeBSD__) || defined (__FreeBSD_kernel__))
# include <machine/sysarch.h>
/*
 * Ask the kernel to set the GS base to guest_base.
 * Returns P_GS if the call succeeds, 0 otherwise.
 */
static inline int setup_guest_base_seg(void)
{
    return sysarch(AMD64_SET_GSBASE, &guest_base) == 0 ? P_GS : 0;
}
#define setup_guest_base_seg  setup_guest_base_seg
#endif
#else
/* Outside user-only builds any use of x86_guest_base must be compiled out. */
# define x86_guest_base (*(HostAddress *)({ qemu_build_not_reached(); NULL; }))
#endif /* CONFIG_USER_ONLY */
#ifndef setup_guest_base_seg
/* No segment-based guest base on this host: always report "no segment". */
# define setup_guest_base_seg()  0
#endif

/*
 * NOTE(review): presumably the lower bound that generic code enforces on
 * the offset returned by tlb_mask_table_ofs(); INT_MIN would then mean
 * "no restriction" -- confirm against the generic TCG code.
 */
#define MIN_TLB_MASK_TABLE_OFS  INT_MIN
2000
2001/*
2002 * For softmmu, perform the TLB load and compare.
2003 * For useronly, perform any required alignment tests.
2004 * In both cases, return a TCGLabelQemuLdst structure if the slow path
2005 * is required and fill in @h with the host address for the fast path.
2006 */
2007static TCGLabelQemuLdst *prepare_host_addr(TCGContext *s, HostAddress *h,
2008                                           TCGReg addrlo, TCGReg addrhi,
2009                                           MemOpIdx oi, bool is_ld)
2010{
2011    TCGLabelQemuLdst *ldst = NULL;
2012    MemOp opc = get_memop(oi);
2013    MemOp s_bits = opc & MO_SIZE;
2014    unsigned a_mask;
2015
2016    if (tcg_use_softmmu) {
2017        h->index = TCG_REG_L0;
2018        h->ofs = 0;
2019        h->seg = 0;
2020    } else {
2021        *h = x86_guest_base;
2022    }
2023    h->base = addrlo;
2024    h->aa = atom_and_align_for_opc(s, opc, MO_ATOM_IFALIGN, s_bits == MO_128);
2025    a_mask = (1 << h->aa.align) - 1;
2026
2027    if (tcg_use_softmmu) {
2028        int cmp_ofs = is_ld ? offsetof(CPUTLBEntry, addr_read)
2029                            : offsetof(CPUTLBEntry, addr_write);
2030        TCGType ttype = TCG_TYPE_I32;
2031        TCGType tlbtype = TCG_TYPE_I32;
2032        int trexw = 0, hrexw = 0, tlbrexw = 0;
2033        unsigned mem_index = get_mmuidx(oi);
2034        unsigned s_mask = (1 << s_bits) - 1;
2035        int fast_ofs = tlb_mask_table_ofs(s, mem_index);
2036        int tlb_mask;
2037
2038        ldst = new_ldst_label(s);
2039        ldst->is_ld = is_ld;
2040        ldst->oi = oi;
2041        ldst->addrlo_reg = addrlo;
2042        ldst->addrhi_reg = addrhi;
2043
2044        if (TCG_TARGET_REG_BITS == 64) {
2045            ttype = s->addr_type;
2046            trexw = (ttype == TCG_TYPE_I32 ? 0 : P_REXW);
2047            if (TCG_TYPE_PTR == TCG_TYPE_I64) {
2048                hrexw = P_REXW;
2049                if (s->page_bits + s->tlb_dyn_max_bits > 32) {
2050                    tlbtype = TCG_TYPE_I64;
2051                    tlbrexw = P_REXW;
2052                }
2053            }
2054        }
2055
2056        tcg_out_mov(s, tlbtype, TCG_REG_L0, addrlo);
2057        tcg_out_shifti(s, SHIFT_SHR + tlbrexw, TCG_REG_L0,
2058                       s->page_bits - CPU_TLB_ENTRY_BITS);
2059
2060        tcg_out_modrm_offset(s, OPC_AND_GvEv + trexw, TCG_REG_L0, TCG_AREG0,
2061                             fast_ofs + offsetof(CPUTLBDescFast, mask));
2062
2063        tcg_out_modrm_offset(s, OPC_ADD_GvEv + hrexw, TCG_REG_L0, TCG_AREG0,
2064                             fast_ofs + offsetof(CPUTLBDescFast, table));
2065
2066        /*
2067         * If the required alignment is at least as large as the access,
2068         * simply copy the address and mask.  For lesser alignments,
2069         * check that we don't cross pages for the complete access.
2070         */
2071        if (a_mask >= s_mask) {
2072            tcg_out_mov(s, ttype, TCG_REG_L1, addrlo);
2073        } else {
2074            tcg_out_modrm_offset(s, OPC_LEA + trexw, TCG_REG_L1,
2075                                 addrlo, s_mask - a_mask);
2076        }
2077        tlb_mask = s->page_mask | a_mask;
2078        tgen_arithi(s, ARITH_AND + trexw, TCG_REG_L1, tlb_mask, 0);
2079
2080        /* cmp 0(TCG_REG_L0), TCG_REG_L1 */
2081        tcg_out_modrm_offset(s, OPC_CMP_GvEv + trexw,
2082                             TCG_REG_L1, TCG_REG_L0, cmp_ofs);
2083
2084        /* jne slow_path */
2085        tcg_out_opc(s, OPC_JCC_long + JCC_JNE, 0, 0, 0);
2086        ldst->label_ptr[0] = s->code_ptr;
2087        s->code_ptr += 4;
2088
2089        if (TCG_TARGET_REG_BITS == 32 && s->addr_type == TCG_TYPE_I64) {
2090            /* cmp 4(TCG_REG_L0), addrhi */
2091            tcg_out_modrm_offset(s, OPC_CMP_GvEv, addrhi,
2092                                 TCG_REG_L0, cmp_ofs + 4);
2093
2094            /* jne slow_path */
2095            tcg_out_opc(s, OPC_JCC_long + JCC_JNE, 0, 0, 0);
2096            ldst->label_ptr[1] = s->code_ptr;
2097            s->code_ptr += 4;
2098        }
2099
2100        /* TLB Hit.  */
2101        tcg_out_ld(s, TCG_TYPE_PTR, TCG_REG_L0, TCG_REG_L0,
2102                   offsetof(CPUTLBEntry, addend));
2103    } else if (a_mask) {
2104        ldst = new_ldst_label(s);
2105
2106        ldst->is_ld = is_ld;
2107        ldst->oi = oi;
2108        ldst->addrlo_reg = addrlo;
2109        ldst->addrhi_reg = addrhi;
2110
2111        tcg_out_testi(s, addrlo, a_mask);
2112        /* jne slow_path */
2113        tcg_out_opc(s, OPC_JCC_long + JCC_JNE, 0, 0, 0);
2114        ldst->label_ptr[0] = s->code_ptr;
2115        s->code_ptr += 4;
2116    }
2117
2118    return ldst;
2119}
2120
2121static void tcg_out_qemu_ld_direct(TCGContext *s, TCGReg datalo, TCGReg datahi,
2122                                   HostAddress h, TCGType type, MemOp memop)
2123{
2124    bool use_movbe = false;
2125    int rexw = (type == TCG_TYPE_I32 ? 0 : P_REXW);
2126    int movop = OPC_MOVL_GvEv;
2127
2128    /* Do big-endian loads with movbe.  */
2129    if (memop & MO_BSWAP) {
2130        tcg_debug_assert(have_movbe);
2131        use_movbe = true;
2132        movop = OPC_MOVBE_GyMy;
2133    }
2134
2135    switch (memop & MO_SSIZE) {
2136    case MO_UB:
2137        tcg_out_modrm_sib_offset(s, OPC_MOVZBL + h.seg, datalo,
2138                                 h.base, h.index, 0, h.ofs);
2139        break;
2140    case MO_SB:
2141        tcg_out_modrm_sib_offset(s, OPC_MOVSBL + rexw + h.seg, datalo,
2142                                 h.base, h.index, 0, h.ofs);
2143        break;
2144    case MO_UW:
2145        if (use_movbe) {
2146            /* There is no extending movbe; only low 16-bits are modified.  */
2147            if (datalo != h.base && datalo != h.index) {
2148                /* XOR breaks dependency chains.  */
2149                tgen_arithr(s, ARITH_XOR, datalo, datalo);
2150                tcg_out_modrm_sib_offset(s, OPC_MOVBE_GyMy + P_DATA16 + h.seg,
2151                                         datalo, h.base, h.index, 0, h.ofs);
2152            } else {
2153                tcg_out_modrm_sib_offset(s, OPC_MOVBE_GyMy + P_DATA16 + h.seg,
2154                                         datalo, h.base, h.index, 0, h.ofs);
2155                tcg_out_ext16u(s, datalo, datalo);
2156            }
2157        } else {
2158            tcg_out_modrm_sib_offset(s, OPC_MOVZWL + h.seg, datalo,
2159                                     h.base, h.index, 0, h.ofs);
2160        }
2161        break;
2162    case MO_SW:
2163        if (use_movbe) {
2164            tcg_out_modrm_sib_offset(s, OPC_MOVBE_GyMy + P_DATA16 + h.seg,
2165                                     datalo, h.base, h.index, 0, h.ofs);
2166            tcg_out_ext16s(s, type, datalo, datalo);
2167        } else {
2168            tcg_out_modrm_sib_offset(s, OPC_MOVSWL + rexw + h.seg,
2169                                     datalo, h.base, h.index, 0, h.ofs);
2170        }
2171        break;
2172    case MO_UL:
2173        tcg_out_modrm_sib_offset(s, movop + h.seg, datalo,
2174                                 h.base, h.index, 0, h.ofs);
2175        break;
2176#if TCG_TARGET_REG_BITS == 64
2177    case MO_SL:
2178        if (use_movbe) {
2179            tcg_out_modrm_sib_offset(s, OPC_MOVBE_GyMy + h.seg, datalo,
2180                                     h.base, h.index, 0, h.ofs);
2181            tcg_out_ext32s(s, datalo, datalo);
2182        } else {
2183            tcg_out_modrm_sib_offset(s, OPC_MOVSLQ + h.seg, datalo,
2184                                     h.base, h.index, 0, h.ofs);
2185        }
2186        break;
2187#endif
2188    case MO_UQ:
2189        if (TCG_TARGET_REG_BITS == 64) {
2190            tcg_out_modrm_sib_offset(s, movop + P_REXW + h.seg, datalo,
2191                                     h.base, h.index, 0, h.ofs);
2192            break;
2193        }
2194        if (use_movbe) {
2195            TCGReg t = datalo;
2196            datalo = datahi;
2197            datahi = t;
2198        }
2199        if (h.base == datalo || h.index == datalo) {
2200            tcg_out_modrm_sib_offset(s, OPC_LEA, datahi,
2201                                     h.base, h.index, 0, h.ofs);
2202            tcg_out_modrm_offset(s, movop + h.seg, datalo, datahi, 0);
2203            tcg_out_modrm_offset(s, movop + h.seg, datahi, datahi, 4);
2204        } else {
2205            tcg_out_modrm_sib_offset(s, movop + h.seg, datalo,
2206                                     h.base, h.index, 0, h.ofs);
2207            tcg_out_modrm_sib_offset(s, movop + h.seg, datahi,
2208                                     h.base, h.index, 0, h.ofs + 4);
2209        }
2210        break;
2211
2212    case MO_128:
2213        tcg_debug_assert(TCG_TARGET_REG_BITS == 64);
2214
2215        /*
2216         * Without 16-byte atomicity, use integer regs.
2217         * That is where we want the data, and it allows bswaps.
2218         */
2219        if (h.aa.atom < MO_128) {
2220            if (use_movbe) {
2221                TCGReg t = datalo;
2222                datalo = datahi;
2223                datahi = t;
2224            }
2225            if (h.base == datalo || h.index == datalo) {
2226                tcg_out_modrm_sib_offset(s, OPC_LEA + P_REXW, datahi,
2227                                         h.base, h.index, 0, h.ofs);
2228                tcg_out_modrm_offset(s, movop + P_REXW + h.seg,
2229                                     datalo, datahi, 0);
2230                tcg_out_modrm_offset(s, movop + P_REXW + h.seg,
2231                                     datahi, datahi, 8);
2232            } else {
2233                tcg_out_modrm_sib_offset(s, movop + P_REXW + h.seg, datalo,
2234                                         h.base, h.index, 0, h.ofs);
2235                tcg_out_modrm_sib_offset(s, movop + P_REXW + h.seg, datahi,
2236                                         h.base, h.index, 0, h.ofs + 8);
2237            }
2238            break;
2239        }
2240
2241        /*
2242         * With 16-byte atomicity, a vector load is required.
2243         * If we already have 16-byte alignment, then VMOVDQA always works.
2244         * Else if VMOVDQU has atomicity with dynamic alignment, use that.
2245         * Else use we require a runtime test for alignment for VMOVDQA;
2246         * use VMOVDQU on the unaligned nonatomic path for simplicity.
2247         */
2248        if (h.aa.align >= MO_128) {
2249            tcg_out_vex_modrm_sib_offset(s, OPC_MOVDQA_VxWx + h.seg,
2250                                         TCG_TMP_VEC, 0,
2251                                         h.base, h.index, 0, h.ofs);
2252        } else if (cpuinfo & CPUINFO_ATOMIC_VMOVDQU) {
2253            tcg_out_vex_modrm_sib_offset(s, OPC_MOVDQU_VxWx + h.seg,
2254                                         TCG_TMP_VEC, 0,
2255                                         h.base, h.index, 0, h.ofs);
2256        } else {
2257            TCGLabel *l1 = gen_new_label();
2258            TCGLabel *l2 = gen_new_label();
2259
2260            tcg_out_testi(s, h.base, 15);
2261            tcg_out_jxx(s, JCC_JNE, l1, true);
2262
2263            tcg_out_vex_modrm_sib_offset(s, OPC_MOVDQA_VxWx + h.seg,
2264                                         TCG_TMP_VEC, 0,
2265                                         h.base, h.index, 0, h.ofs);
2266            tcg_out_jxx(s, JCC_JMP, l2, true);
2267
2268            tcg_out_label(s, l1);
2269            tcg_out_vex_modrm_sib_offset(s, OPC_MOVDQU_VxWx + h.seg,
2270                                         TCG_TMP_VEC, 0,
2271                                         h.base, h.index, 0, h.ofs);
2272            tcg_out_label(s, l2);
2273        }
2274        tcg_out_vec_to_pair(s, TCG_TYPE_I64, datalo, datahi, TCG_TMP_VEC);
2275        break;
2276
2277    default:
2278        g_assert_not_reached();
2279    }
2280}
2281
2282static void tcg_out_qemu_ld(TCGContext *s, TCGReg datalo, TCGReg datahi,
2283                            TCGReg addrlo, TCGReg addrhi,
2284                            MemOpIdx oi, TCGType data_type)
2285{
2286    TCGLabelQemuLdst *ldst;
2287    HostAddress h;
2288
2289    ldst = prepare_host_addr(s, &h, addrlo, addrhi, oi, true);
2290    tcg_out_qemu_ld_direct(s, datalo, datahi, h, data_type, get_memop(oi));
2291
2292    if (ldst) {
2293        ldst->type = data_type;
2294        ldst->datalo_reg = datalo;
2295        ldst->datahi_reg = datahi;
2296        ldst->raddr = tcg_splitwx_to_rx(s->code_ptr);
2297    }
2298}
2299
2300static void tcg_out_qemu_st_direct(TCGContext *s, TCGReg datalo, TCGReg datahi,
2301                                   HostAddress h, MemOp memop)
2302{
2303    bool use_movbe = false;
2304    int movop = OPC_MOVL_EvGv;
2305
2306    /*
2307     * Do big-endian stores with movbe or system-mode.
2308     * User-only without movbe will have its swapping done generically.
2309     */
2310    if (memop & MO_BSWAP) {
2311        tcg_debug_assert(have_movbe);
2312        use_movbe = true;
2313        movop = OPC_MOVBE_MyGy;
2314    }
2315
2316    switch (memop & MO_SIZE) {
2317    case MO_8:
2318        /* This is handled with constraints on INDEX_op_qemu_st8_i32. */
2319        tcg_debug_assert(TCG_TARGET_REG_BITS == 64 || datalo < 4);
2320        tcg_out_modrm_sib_offset(s, OPC_MOVB_EvGv + P_REXB_R + h.seg,
2321                                 datalo, h.base, h.index, 0, h.ofs);
2322        break;
2323    case MO_16:
2324        tcg_out_modrm_sib_offset(s, movop + P_DATA16 + h.seg, datalo,
2325                                 h.base, h.index, 0, h.ofs);
2326        break;
2327    case MO_32:
2328        tcg_out_modrm_sib_offset(s, movop + h.seg, datalo,
2329                                 h.base, h.index, 0, h.ofs);
2330        break;
2331    case MO_64:
2332        if (TCG_TARGET_REG_BITS == 64) {
2333            tcg_out_modrm_sib_offset(s, movop + P_REXW + h.seg, datalo,
2334                                     h.base, h.index, 0, h.ofs);
2335        } else {
2336            if (use_movbe) {
2337                TCGReg t = datalo;
2338                datalo = datahi;
2339                datahi = t;
2340            }
2341            tcg_out_modrm_sib_offset(s, movop + h.seg, datalo,
2342                                     h.base, h.index, 0, h.ofs);
2343            tcg_out_modrm_sib_offset(s, movop + h.seg, datahi,
2344                                     h.base, h.index, 0, h.ofs + 4);
2345        }
2346        break;
2347
2348    case MO_128:
2349        tcg_debug_assert(TCG_TARGET_REG_BITS == 64);
2350
2351        /*
2352         * Without 16-byte atomicity, use integer regs.
2353         * That is where we have the data, and it allows bswaps.
2354         */
2355        if (h.aa.atom < MO_128) {
2356            if (use_movbe) {
2357                TCGReg t = datalo;
2358                datalo = datahi;
2359                datahi = t;
2360            }
2361            tcg_out_modrm_sib_offset(s, movop + P_REXW + h.seg, datalo,
2362                                     h.base, h.index, 0, h.ofs);
2363            tcg_out_modrm_sib_offset(s, movop + P_REXW + h.seg, datahi,
2364                                     h.base, h.index, 0, h.ofs + 8);
2365            break;
2366        }
2367
2368        /*
2369         * With 16-byte atomicity, a vector store is required.
2370         * If we already have 16-byte alignment, then VMOVDQA always works.
2371         * Else if VMOVDQU has atomicity with dynamic alignment, use that.
2372         * Else use we require a runtime test for alignment for VMOVDQA;
2373         * use VMOVDQU on the unaligned nonatomic path for simplicity.
2374         */
2375        tcg_out_pair_to_vec(s, TCG_TYPE_I64, TCG_TMP_VEC, datalo, datahi);
2376        if (h.aa.align >= MO_128) {
2377            tcg_out_vex_modrm_sib_offset(s, OPC_MOVDQA_WxVx + h.seg,
2378                                         TCG_TMP_VEC, 0,
2379                                         h.base, h.index, 0, h.ofs);
2380        } else if (cpuinfo & CPUINFO_ATOMIC_VMOVDQU) {
2381            tcg_out_vex_modrm_sib_offset(s, OPC_MOVDQU_WxVx + h.seg,
2382                                         TCG_TMP_VEC, 0,
2383                                         h.base, h.index, 0, h.ofs);
2384        } else {
2385            TCGLabel *l1 = gen_new_label();
2386            TCGLabel *l2 = gen_new_label();
2387
2388            tcg_out_testi(s, h.base, 15);
2389            tcg_out_jxx(s, JCC_JNE, l1, true);
2390
2391            tcg_out_vex_modrm_sib_offset(s, OPC_MOVDQA_WxVx + h.seg,
2392                                         TCG_TMP_VEC, 0,
2393                                         h.base, h.index, 0, h.ofs);
2394            tcg_out_jxx(s, JCC_JMP, l2, true);
2395
2396            tcg_out_label(s, l1);
2397            tcg_out_vex_modrm_sib_offset(s, OPC_MOVDQU_WxVx + h.seg,
2398                                         TCG_TMP_VEC, 0,
2399                                         h.base, h.index, 0, h.ofs);
2400            tcg_out_label(s, l2);
2401        }
2402        break;
2403
2404    default:
2405        g_assert_not_reached();
2406    }
2407}
2408
2409static void tcg_out_qemu_st(TCGContext *s, TCGReg datalo, TCGReg datahi,
2410                            TCGReg addrlo, TCGReg addrhi,
2411                            MemOpIdx oi, TCGType data_type)
2412{
2413    TCGLabelQemuLdst *ldst;
2414    HostAddress h;
2415
2416    ldst = prepare_host_addr(s, &h, addrlo, addrhi, oi, false);
2417    tcg_out_qemu_st_direct(s, datalo, datahi, h, get_memop(oi));
2418
2419    if (ldst) {
2420        ldst->type = data_type;
2421        ldst->datalo_reg = datalo;
2422        ldst->datahi_reg = datahi;
2423        ldst->raddr = tcg_splitwx_to_rx(s->code_ptr);
2424    }
2425}
2426
2427static void tcg_out_exit_tb(TCGContext *s, uintptr_t a0)
2428{
2429    /* Reuse the zeroing that exists for goto_ptr.  */
2430    if (a0 == 0) {
2431        tcg_out_jmp(s, tcg_code_gen_epilogue);
2432    } else {
2433        tcg_out_movi(s, TCG_TYPE_PTR, TCG_REG_EAX, a0);
2434        tcg_out_jmp(s, tb_ret_addr);
2435    }
2436}
2437
2438static void tcg_out_goto_tb(TCGContext *s, int which)
2439{
2440    /*
2441     * Jump displacement must be aligned for atomic patching;
2442     * see if we need to add extra nops before jump
2443     */
2444    int gap = QEMU_ALIGN_PTR_UP(s->code_ptr + 1, 4) - s->code_ptr;
2445    if (gap != 1) {
2446        tcg_out_nopn(s, gap - 1);
2447    }
2448    tcg_out8(s, OPC_JMP_long); /* jmp im */
2449    set_jmp_insn_offset(s, which);
2450    tcg_out32(s, 0);
2451    set_jmp_reset_offset(s, which);
2452}
2453
2454void tb_target_set_jmp_target(const TranslationBlock *tb, int n,
2455                              uintptr_t jmp_rx, uintptr_t jmp_rw)
2456{
2457    /* patch the branch destination */
2458    uintptr_t addr = tb->jmp_target_addr[n];
2459    qatomic_set((int32_t *)jmp_rw, addr - (jmp_rx + 4));
2460    /* no need to flush icache explicitly */
2461}
2462
2463static inline void tcg_out_op(TCGContext *s, TCGOpcode opc,
2464                              const TCGArg args[TCG_MAX_OP_ARGS],
2465                              const int const_args[TCG_MAX_OP_ARGS])
2466{
2467    TCGArg a0, a1, a2;
2468    int c, const_a2, vexop, rexw = 0;
2469
2470#if TCG_TARGET_REG_BITS == 64
2471# define OP_32_64(x) \
2472        case glue(glue(INDEX_op_, x), _i64): \
2473            rexw = P_REXW; /* FALLTHRU */    \
2474        case glue(glue(INDEX_op_, x), _i32)
2475#else
2476# define OP_32_64(x) \
2477        case glue(glue(INDEX_op_, x), _i32)
2478#endif
2479
2480    /* Hoist the loads of the most common arguments.  */
2481    a0 = args[0];
2482    a1 = args[1];
2483    a2 = args[2];
2484    const_a2 = const_args[2];
2485
2486    switch (opc) {
2487    case INDEX_op_goto_ptr:
2488        /* jmp to the given host address (could be epilogue) */
2489        tcg_out_modrm(s, OPC_GRP5, EXT5_JMPN_Ev, a0);
2490        break;
2491    case INDEX_op_br:
2492        tcg_out_jxx(s, JCC_JMP, arg_label(a0), 0);
2493        break;
2494    OP_32_64(ld8u):
2495        /* Note that we can ignore REXW for the zero-extend to 64-bit.  */
2496        tcg_out_modrm_offset(s, OPC_MOVZBL, a0, a1, a2);
2497        break;
2498    OP_32_64(ld8s):
2499        tcg_out_modrm_offset(s, OPC_MOVSBL + rexw, a0, a1, a2);
2500        break;
2501    OP_32_64(ld16u):
2502        /* Note that we can ignore REXW for the zero-extend to 64-bit.  */
2503        tcg_out_modrm_offset(s, OPC_MOVZWL, a0, a1, a2);
2504        break;
2505    OP_32_64(ld16s):
2506        tcg_out_modrm_offset(s, OPC_MOVSWL + rexw, a0, a1, a2);
2507        break;
2508#if TCG_TARGET_REG_BITS == 64
2509    case INDEX_op_ld32u_i64:
2510#endif
2511    case INDEX_op_ld_i32:
2512        tcg_out_ld(s, TCG_TYPE_I32, a0, a1, a2);
2513        break;
2514
2515    OP_32_64(st8):
2516        if (const_args[0]) {
2517            tcg_out_modrm_offset(s, OPC_MOVB_EvIz, 0, a1, a2);
2518            tcg_out8(s, a0);
2519        } else {
2520            tcg_out_modrm_offset(s, OPC_MOVB_EvGv | P_REXB_R, a0, a1, a2);
2521        }
2522        break;
2523    OP_32_64(st16):
2524        if (const_args[0]) {
2525            tcg_out_modrm_offset(s, OPC_MOVL_EvIz | P_DATA16, 0, a1, a2);
2526            tcg_out16(s, a0);
2527        } else {
2528            tcg_out_modrm_offset(s, OPC_MOVL_EvGv | P_DATA16, a0, a1, a2);
2529        }
2530        break;
2531#if TCG_TARGET_REG_BITS == 64
2532    case INDEX_op_st32_i64:
2533#endif
2534    case INDEX_op_st_i32:
2535        if (const_args[0]) {
2536            tcg_out_modrm_offset(s, OPC_MOVL_EvIz, 0, a1, a2);
2537            tcg_out32(s, a0);
2538        } else {
2539            tcg_out_st(s, TCG_TYPE_I32, a0, a1, a2);
2540        }
2541        break;
2542
2543    OP_32_64(add):
2544        /* For 3-operand addition, use LEA.  */
2545        if (a0 != a1) {
2546            TCGArg c3 = 0;
2547            if (const_a2) {
2548                c3 = a2, a2 = -1;
2549            } else if (a0 == a2) {
2550                /* Watch out for dest = src + dest, since we've removed
2551                   the matching constraint on the add.  */
2552                tgen_arithr(s, ARITH_ADD + rexw, a0, a1);
2553                break;
2554            }
2555
2556            tcg_out_modrm_sib_offset(s, OPC_LEA + rexw, a0, a1, a2, 0, c3);
2557            break;
2558        }
2559        c = ARITH_ADD;
2560        goto gen_arith;
2561    OP_32_64(sub):
2562        c = ARITH_SUB;
2563        goto gen_arith;
2564    OP_32_64(and):
2565        c = ARITH_AND;
2566        goto gen_arith;
2567    OP_32_64(or):
2568        c = ARITH_OR;
2569        goto gen_arith;
2570    OP_32_64(xor):
2571        c = ARITH_XOR;
2572        goto gen_arith;
2573    gen_arith:
2574        if (const_a2) {
2575            tgen_arithi(s, c + rexw, a0, a2, 0);
2576        } else {
2577            tgen_arithr(s, c + rexw, a0, a2);
2578        }
2579        break;
2580
2581    OP_32_64(andc):
2582        if (const_a2) {
2583            tcg_out_mov(s, rexw ? TCG_TYPE_I64 : TCG_TYPE_I32, a0, a1);
2584            tgen_arithi(s, ARITH_AND + rexw, a0, ~a2, 0);
2585        } else {
2586            tcg_out_vex_modrm(s, OPC_ANDN + rexw, a0, a2, a1);
2587        }
2588        break;
2589
2590    OP_32_64(mul):
2591        if (const_a2) {
2592            int32_t val;
2593            val = a2;
2594            if (val == (int8_t)val) {
2595                tcg_out_modrm(s, OPC_IMUL_GvEvIb + rexw, a0, a0);
2596                tcg_out8(s, val);
2597            } else {
2598                tcg_out_modrm(s, OPC_IMUL_GvEvIz + rexw, a0, a0);
2599                tcg_out32(s, val);
2600            }
2601        } else {
2602            tcg_out_modrm(s, OPC_IMUL_GvEv + rexw, a0, a2);
2603        }
2604        break;
2605
2606    OP_32_64(div2):
2607        tcg_out_modrm(s, OPC_GRP3_Ev + rexw, EXT3_IDIV, args[4]);
2608        break;
2609    OP_32_64(divu2):
2610        tcg_out_modrm(s, OPC_GRP3_Ev + rexw, EXT3_DIV, args[4]);
2611        break;
2612
2613    OP_32_64(shl):
2614        /* For small constant 3-operand shift, use LEA.  */
2615        if (const_a2 && a0 != a1 && (a2 - 1) < 3) {
2616            if (a2 - 1 == 0) {
2617                /* shl $1,a1,a0 -> lea (a1,a1),a0 */
2618                tcg_out_modrm_sib_offset(s, OPC_LEA + rexw, a0, a1, a1, 0, 0);
2619            } else {
2620                /* shl $n,a1,a0 -> lea 0(,a1,n),a0 */
2621                tcg_out_modrm_sib_offset(s, OPC_LEA + rexw, a0, -1, a1, a2, 0);
2622            }
2623            break;
2624        }
2625        c = SHIFT_SHL;
2626        vexop = OPC_SHLX;
2627        goto gen_shift_maybe_vex;
2628    OP_32_64(shr):
2629        c = SHIFT_SHR;
2630        vexop = OPC_SHRX;
2631        goto gen_shift_maybe_vex;
2632    OP_32_64(sar):
2633        c = SHIFT_SAR;
2634        vexop = OPC_SARX;
2635        goto gen_shift_maybe_vex;
2636    OP_32_64(rotl):
2637        c = SHIFT_ROL;
2638        goto gen_shift;
2639    OP_32_64(rotr):
2640        c = SHIFT_ROR;
2641        goto gen_shift;
2642    gen_shift_maybe_vex:
2643        if (have_bmi2) {
2644            if (!const_a2) {
2645                tcg_out_vex_modrm(s, vexop + rexw, a0, a2, a1);
2646                break;
2647            }
2648            tcg_out_mov(s, rexw ? TCG_TYPE_I64 : TCG_TYPE_I32, a0, a1);
2649        }
2650        /* FALLTHRU */
2651    gen_shift:
2652        if (const_a2) {
2653            tcg_out_shifti(s, c + rexw, a0, a2);
2654        } else {
2655            tcg_out_modrm(s, OPC_SHIFT_cl + rexw, c, a0);
2656        }
2657        break;
2658
2659    OP_32_64(ctz):
2660        tcg_out_ctz(s, rexw, args[0], args[1], args[2], const_args[2]);
2661        break;
2662    OP_32_64(clz):
2663        tcg_out_clz(s, rexw, args[0], args[1], args[2], const_args[2]);
2664        break;
2665    OP_32_64(ctpop):
2666        tcg_out_modrm(s, OPC_POPCNT + rexw, a0, a1);
2667        break;
2668
2669    OP_32_64(brcond):
2670        tcg_out_brcond(s, rexw, a2, a0, a1, const_args[1],
2671                       arg_label(args[3]), 0);
2672        break;
2673    OP_32_64(setcond):
2674        tcg_out_setcond(s, rexw, args[3], a0, a1, a2, const_a2, false);
2675        break;
2676    OP_32_64(negsetcond):
2677        tcg_out_setcond(s, rexw, args[3], a0, a1, a2, const_a2, true);
2678        break;
2679    OP_32_64(movcond):
2680        tcg_out_movcond(s, rexw, args[5], a0, a1, a2, const_a2, args[3]);
2681        break;
2682
2683    OP_32_64(bswap16):
2684        if (a2 & TCG_BSWAP_OS) {
2685            /* Output must be sign-extended. */
2686            if (rexw) {
2687                tcg_out_bswap64(s, a0);
2688                tcg_out_shifti(s, SHIFT_SAR + rexw, a0, 48);
2689            } else {
2690                tcg_out_bswap32(s, a0);
2691                tcg_out_shifti(s, SHIFT_SAR, a0, 16);
2692            }
2693        } else if ((a2 & (TCG_BSWAP_IZ | TCG_BSWAP_OZ)) == TCG_BSWAP_OZ) {
2694            /* Output must be zero-extended, but input isn't. */
2695            tcg_out_bswap32(s, a0);
2696            tcg_out_shifti(s, SHIFT_SHR, a0, 16);
2697        } else {
2698            tcg_out_rolw_8(s, a0);
2699        }
2700        break;
2701    OP_32_64(bswap32):
2702        tcg_out_bswap32(s, a0);
2703        if (rexw && (a2 & TCG_BSWAP_OS)) {
2704            tcg_out_ext32s(s, a0, a0);
2705        }
2706        break;
2707
2708    OP_32_64(neg):
2709        tcg_out_modrm(s, OPC_GRP3_Ev + rexw, EXT3_NEG, a0);
2710        break;
2711    OP_32_64(not):
2712        tcg_out_modrm(s, OPC_GRP3_Ev + rexw, EXT3_NOT, a0);
2713        break;
2714
2715    case INDEX_op_qemu_ld_a64_i32:
2716        if (TCG_TARGET_REG_BITS == 32) {
2717            tcg_out_qemu_ld(s, a0, -1, a1, a2, args[3], TCG_TYPE_I32);
2718            break;
2719        }
2720        /* fall through */
2721    case INDEX_op_qemu_ld_a32_i32:
2722        tcg_out_qemu_ld(s, a0, -1, a1, -1, a2, TCG_TYPE_I32);
2723        break;
2724    case INDEX_op_qemu_ld_a32_i64:
2725        if (TCG_TARGET_REG_BITS == 64) {
2726            tcg_out_qemu_ld(s, a0, -1, a1, -1, a2, TCG_TYPE_I64);
2727        } else {
2728            tcg_out_qemu_ld(s, a0, a1, a2, -1, args[3], TCG_TYPE_I64);
2729        }
2730        break;
2731    case INDEX_op_qemu_ld_a64_i64:
2732        if (TCG_TARGET_REG_BITS == 64) {
2733            tcg_out_qemu_ld(s, a0, -1, a1, -1, a2, TCG_TYPE_I64);
2734        } else {
2735            tcg_out_qemu_ld(s, a0, a1, a2, args[3], args[4], TCG_TYPE_I64);
2736        }
2737        break;
2738    case INDEX_op_qemu_ld_a32_i128:
2739    case INDEX_op_qemu_ld_a64_i128:
2740        tcg_debug_assert(TCG_TARGET_REG_BITS == 64);
2741        tcg_out_qemu_ld(s, a0, a1, a2, -1, args[3], TCG_TYPE_I128);
2742        break;
2743
2744    case INDEX_op_qemu_st_a64_i32:
2745    case INDEX_op_qemu_st8_a64_i32:
2746        if (TCG_TARGET_REG_BITS == 32) {
2747            tcg_out_qemu_st(s, a0, -1, a1, a2, args[3], TCG_TYPE_I32);
2748            break;
2749        }
2750        /* fall through */
2751    case INDEX_op_qemu_st_a32_i32:
2752    case INDEX_op_qemu_st8_a32_i32:
2753        tcg_out_qemu_st(s, a0, -1, a1, -1, a2, TCG_TYPE_I32);
2754        break;
2755    case INDEX_op_qemu_st_a32_i64:
2756        if (TCG_TARGET_REG_BITS == 64) {
2757            tcg_out_qemu_st(s, a0, -1, a1, -1, a2, TCG_TYPE_I64);
2758        } else {
2759            tcg_out_qemu_st(s, a0, a1, a2, -1, args[3], TCG_TYPE_I64);
2760        }
2761        break;
2762    case INDEX_op_qemu_st_a64_i64:
2763        if (TCG_TARGET_REG_BITS == 64) {
2764            tcg_out_qemu_st(s, a0, -1, a1, -1, a2, TCG_TYPE_I64);
2765        } else {
2766            tcg_out_qemu_st(s, a0, a1, a2, args[3], args[4], TCG_TYPE_I64);
2767        }
2768        break;
2769    case INDEX_op_qemu_st_a32_i128:
2770    case INDEX_op_qemu_st_a64_i128:
2771        tcg_debug_assert(TCG_TARGET_REG_BITS == 64);
2772        tcg_out_qemu_st(s, a0, a1, a2, -1, args[3], TCG_TYPE_I128);
2773        break;
2774
2775    OP_32_64(mulu2):
2776        tcg_out_modrm(s, OPC_GRP3_Ev + rexw, EXT3_MUL, args[3]);
2777        break;
2778    OP_32_64(muls2):
2779        tcg_out_modrm(s, OPC_GRP3_Ev + rexw, EXT3_IMUL, args[3]);
2780        break;
2781    OP_32_64(add2):
2782        if (const_args[4]) {
2783            tgen_arithi(s, ARITH_ADD + rexw, a0, args[4], 1);
2784        } else {
2785            tgen_arithr(s, ARITH_ADD + rexw, a0, args[4]);
2786        }
2787        if (const_args[5]) {
2788            tgen_arithi(s, ARITH_ADC + rexw, a1, args[5], 1);
2789        } else {
2790            tgen_arithr(s, ARITH_ADC + rexw, a1, args[5]);
2791        }
2792        break;
2793    OP_32_64(sub2):
2794        if (const_args[4]) {
2795            tgen_arithi(s, ARITH_SUB + rexw, a0, args[4], 1);
2796        } else {
2797            tgen_arithr(s, ARITH_SUB + rexw, a0, args[4]);
2798        }
2799        if (const_args[5]) {
2800            tgen_arithi(s, ARITH_SBB + rexw, a1, args[5], 1);
2801        } else {
2802            tgen_arithr(s, ARITH_SBB + rexw, a1, args[5]);
2803        }
2804        break;
2805
2806#if TCG_TARGET_REG_BITS == 32
2807    case INDEX_op_brcond2_i32:
2808        tcg_out_brcond2(s, args, const_args, 0);
2809        break;
2810    case INDEX_op_setcond2_i32:
2811        tcg_out_setcond2(s, args, const_args);
2812        break;
2813#else /* TCG_TARGET_REG_BITS == 64 */
2814    case INDEX_op_ld32s_i64:
2815        tcg_out_modrm_offset(s, OPC_MOVSLQ, a0, a1, a2);
2816        break;
2817    case INDEX_op_ld_i64:
2818        tcg_out_ld(s, TCG_TYPE_I64, a0, a1, a2);
2819        break;
2820    case INDEX_op_st_i64:
2821        if (const_args[0]) {
2822            tcg_out_modrm_offset(s, OPC_MOVL_EvIz | P_REXW, 0, a1, a2);
2823            tcg_out32(s, a0);
2824        } else {
2825            tcg_out_st(s, TCG_TYPE_I64, a0, a1, a2);
2826        }
2827        break;
2828
2829    case INDEX_op_bswap64_i64:
2830        tcg_out_bswap64(s, a0);
2831        break;
2832    case INDEX_op_extrh_i64_i32:
2833        tcg_out_shifti(s, SHIFT_SHR + P_REXW, a0, 32);
2834        break;
2835#endif
2836
2837    OP_32_64(deposit):
2838        if (args[3] == 0 && args[4] == 8) {
2839            /* load bits 0..7 */
2840            if (const_a2) {
2841                tcg_out_opc(s, OPC_MOVB_Ib | P_REXB_RM | LOWREGMASK(a0),
2842                            0, a0, 0);
2843                tcg_out8(s, a2);
2844            } else {
2845                tcg_out_modrm(s, OPC_MOVB_EvGv | P_REXB_R | P_REXB_RM, a2, a0);
2846            }
2847        } else if (TCG_TARGET_REG_BITS == 32 && args[3] == 8 && args[4] == 8) {
2848            /* load bits 8..15 */
2849            if (const_a2) {
2850                tcg_out8(s, OPC_MOVB_Ib + a0 + 4);
2851                tcg_out8(s, a2);
2852            } else {
2853                tcg_out_modrm(s, OPC_MOVB_EvGv, a2, a0 + 4);
2854            }
2855        } else if (args[3] == 0 && args[4] == 16) {
2856            /* load bits 0..15 */
2857            if (const_a2) {
2858                tcg_out_opc(s, OPC_MOVL_Iv | P_DATA16 | LOWREGMASK(a0),
2859                            0, a0, 0);
2860                tcg_out16(s, a2);
2861            } else {
2862                tcg_out_modrm(s, OPC_MOVL_EvGv | P_DATA16, a2, a0);
2863            }
2864        } else {
2865            g_assert_not_reached();
2866        }
2867        break;
2868
2869    case INDEX_op_extract_i64:
2870        if (a2 + args[3] == 32) {
2871            /* This is a 32-bit zero-extending right shift.  */
2872            tcg_out_mov(s, TCG_TYPE_I32, a0, a1);
2873            tcg_out_shifti(s, SHIFT_SHR, a0, a2);
2874            break;
2875        }
2876        /* FALLTHRU */
2877    case INDEX_op_extract_i32:
2878        /* On the off-chance that we can use the high-byte registers.
2879           Otherwise we emit the same ext16 + shift pattern that we
2880           would have gotten from the normal tcg-op.c expansion.  */
2881        tcg_debug_assert(a2 == 8 && args[3] == 8);
2882        if (a1 < 4 && a0 < 8) {
2883            tcg_out_modrm(s, OPC_MOVZBL, a0, a1 + 4);
2884        } else {
2885            tcg_out_ext16u(s, a0, a1);
2886            tcg_out_shifti(s, SHIFT_SHR, a0, 8);
2887        }
2888        break;
2889
2890    case INDEX_op_sextract_i32:
2891        /* We don't implement sextract_i64, as we cannot sign-extend to
2892           64-bits without using the REX prefix that explicitly excludes
2893           access to the high-byte registers.  */
2894        tcg_debug_assert(a2 == 8 && args[3] == 8);
2895        if (a1 < 4 && a0 < 8) {
2896            tcg_out_modrm(s, OPC_MOVSBL, a0, a1 + 4);
2897        } else {
2898            tcg_out_ext16s(s, TCG_TYPE_I32, a0, a1);
2899            tcg_out_shifti(s, SHIFT_SAR, a0, 8);
2900        }
2901        break;
2902
2903    OP_32_64(extract2):
2904        /* Note that SHRD outputs to the r/m operand.  */
2905        tcg_out_modrm(s, OPC_SHRD_Ib + rexw, a2, a0);
2906        tcg_out8(s, args[3]);
2907        break;
2908
2909    case INDEX_op_mb:
2910        tcg_out_mb(s, a0);
2911        break;
2912    case INDEX_op_mov_i32:  /* Always emitted via tcg_out_mov.  */
2913    case INDEX_op_mov_i64:
2914    case INDEX_op_call:     /* Always emitted via tcg_out_call.  */
2915    case INDEX_op_exit_tb:  /* Always emitted via tcg_out_exit_tb.  */
2916    case INDEX_op_goto_tb:  /* Always emitted via tcg_out_goto_tb.  */
2917    case INDEX_op_ext8s_i32:  /* Always emitted via tcg_reg_alloc_op.  */
2918    case INDEX_op_ext8s_i64:
2919    case INDEX_op_ext8u_i32:
2920    case INDEX_op_ext8u_i64:
2921    case INDEX_op_ext16s_i32:
2922    case INDEX_op_ext16s_i64:
2923    case INDEX_op_ext16u_i32:
2924    case INDEX_op_ext16u_i64:
2925    case INDEX_op_ext32s_i64:
2926    case INDEX_op_ext32u_i64:
2927    case INDEX_op_ext_i32_i64:
2928    case INDEX_op_extu_i32_i64:
2929    case INDEX_op_extrl_i64_i32:
2930    default:
2931        g_assert_not_reached();
2932    }
2933
2934#undef OP_32_64
2935}
2936
2937static void tcg_out_vec_op(TCGContext *s, TCGOpcode opc,
2938                           unsigned vecl, unsigned vece,
2939                           const TCGArg args[TCG_MAX_OP_ARGS],
2940                           const int const_args[TCG_MAX_OP_ARGS])
2941{
2942    static int const add_insn[4] = {
2943        OPC_PADDB, OPC_PADDW, OPC_PADDD, OPC_PADDQ
2944    };
2945    static int const ssadd_insn[4] = {
2946        OPC_PADDSB, OPC_PADDSW, OPC_UD2, OPC_UD2
2947    };
2948    static int const usadd_insn[4] = {
2949        OPC_PADDUB, OPC_PADDUW, OPC_UD2, OPC_UD2
2950    };
2951    static int const sub_insn[4] = {
2952        OPC_PSUBB, OPC_PSUBW, OPC_PSUBD, OPC_PSUBQ
2953    };
2954    static int const sssub_insn[4] = {
2955        OPC_PSUBSB, OPC_PSUBSW, OPC_UD2, OPC_UD2
2956    };
2957    static int const ussub_insn[4] = {
2958        OPC_PSUBUB, OPC_PSUBUW, OPC_UD2, OPC_UD2
2959    };
2960    static int const mul_insn[4] = {
2961        OPC_UD2, OPC_PMULLW, OPC_PMULLD, OPC_VPMULLQ
2962    };
2963    static int const shift_imm_insn[4] = {
2964        OPC_UD2, OPC_PSHIFTW_Ib, OPC_PSHIFTD_Ib, OPC_PSHIFTQ_Ib
2965    };
2966    static int const cmpeq_insn[4] = {
2967        OPC_PCMPEQB, OPC_PCMPEQW, OPC_PCMPEQD, OPC_PCMPEQQ
2968    };
2969    static int const cmpgt_insn[4] = {
2970        OPC_PCMPGTB, OPC_PCMPGTW, OPC_PCMPGTD, OPC_PCMPGTQ
2971    };
2972    static int const punpckl_insn[4] = {
2973        OPC_PUNPCKLBW, OPC_PUNPCKLWD, OPC_PUNPCKLDQ, OPC_PUNPCKLQDQ
2974    };
2975    static int const punpckh_insn[4] = {
2976        OPC_PUNPCKHBW, OPC_PUNPCKHWD, OPC_PUNPCKHDQ, OPC_PUNPCKHQDQ
2977    };
2978    static int const packss_insn[4] = {
2979        OPC_PACKSSWB, OPC_PACKSSDW, OPC_UD2, OPC_UD2
2980    };
2981    static int const packus_insn[4] = {
2982        OPC_PACKUSWB, OPC_PACKUSDW, OPC_UD2, OPC_UD2
2983    };
2984    static int const smin_insn[4] = {
2985        OPC_PMINSB, OPC_PMINSW, OPC_PMINSD, OPC_VPMINSQ
2986    };
2987    static int const smax_insn[4] = {
2988        OPC_PMAXSB, OPC_PMAXSW, OPC_PMAXSD, OPC_VPMAXSQ
2989    };
2990    static int const umin_insn[4] = {
2991        OPC_PMINUB, OPC_PMINUW, OPC_PMINUD, OPC_VPMINUQ
2992    };
2993    static int const umax_insn[4] = {
2994        OPC_PMAXUB, OPC_PMAXUW, OPC_PMAXUD, OPC_VPMAXUQ
2995    };
2996    static int const rotlv_insn[4] = {
2997        OPC_UD2, OPC_UD2, OPC_VPROLVD, OPC_VPROLVQ
2998    };
2999    static int const rotrv_insn[4] = {
3000        OPC_UD2, OPC_UD2, OPC_VPRORVD, OPC_VPRORVQ
3001    };
3002    static int const shlv_insn[4] = {
3003        OPC_UD2, OPC_VPSLLVW, OPC_VPSLLVD, OPC_VPSLLVQ
3004    };
3005    static int const shrv_insn[4] = {
3006        OPC_UD2, OPC_VPSRLVW, OPC_VPSRLVD, OPC_VPSRLVQ
3007    };
3008    static int const sarv_insn[4] = {
3009        OPC_UD2, OPC_VPSRAVW, OPC_VPSRAVD, OPC_VPSRAVQ
3010    };
3011    static int const shls_insn[4] = {
3012        OPC_UD2, OPC_PSLLW, OPC_PSLLD, OPC_PSLLQ
3013    };
3014    static int const shrs_insn[4] = {
3015        OPC_UD2, OPC_PSRLW, OPC_PSRLD, OPC_PSRLQ
3016    };
3017    static int const sars_insn[4] = {
3018        OPC_UD2, OPC_PSRAW, OPC_PSRAD, OPC_VPSRAQ
3019    };
3020    static int const vpshldi_insn[4] = {
3021        OPC_UD2, OPC_VPSHLDW, OPC_VPSHLDD, OPC_VPSHLDQ
3022    };
3023    static int const vpshldv_insn[4] = {
3024        OPC_UD2, OPC_VPSHLDVW, OPC_VPSHLDVD, OPC_VPSHLDVQ
3025    };
3026    static int const vpshrdv_insn[4] = {
3027        OPC_UD2, OPC_VPSHRDVW, OPC_VPSHRDVD, OPC_VPSHRDVQ
3028    };
3029    static int const abs_insn[4] = {
3030        OPC_PABSB, OPC_PABSW, OPC_PABSD, OPC_VPABSQ
3031    };
3032
3033    TCGType type = vecl + TCG_TYPE_V64;
3034    int insn, sub;
3035    TCGArg a0, a1, a2, a3;
3036
3037    a0 = args[0];
3038    a1 = args[1];
3039    a2 = args[2];
3040
3041    switch (opc) {
3042    case INDEX_op_add_vec:
3043        insn = add_insn[vece];
3044        goto gen_simd;
3045    case INDEX_op_ssadd_vec:
3046        insn = ssadd_insn[vece];
3047        goto gen_simd;
3048    case INDEX_op_usadd_vec:
3049        insn = usadd_insn[vece];
3050        goto gen_simd;
3051    case INDEX_op_sub_vec:
3052        insn = sub_insn[vece];
3053        goto gen_simd;
3054    case INDEX_op_sssub_vec:
3055        insn = sssub_insn[vece];
3056        goto gen_simd;
3057    case INDEX_op_ussub_vec:
3058        insn = ussub_insn[vece];
3059        goto gen_simd;
3060    case INDEX_op_mul_vec:
3061        insn = mul_insn[vece];
3062        goto gen_simd;
3063    case INDEX_op_and_vec:
3064        insn = OPC_PAND;
3065        goto gen_simd;
3066    case INDEX_op_or_vec:
3067        insn = OPC_POR;
3068        goto gen_simd;
3069    case INDEX_op_xor_vec:
3070        insn = OPC_PXOR;
3071        goto gen_simd;
3072    case INDEX_op_smin_vec:
3073        insn = smin_insn[vece];
3074        goto gen_simd;
3075    case INDEX_op_umin_vec:
3076        insn = umin_insn[vece];
3077        goto gen_simd;
3078    case INDEX_op_smax_vec:
3079        insn = smax_insn[vece];
3080        goto gen_simd;
3081    case INDEX_op_umax_vec:
3082        insn = umax_insn[vece];
3083        goto gen_simd;
3084    case INDEX_op_shlv_vec:
3085        insn = shlv_insn[vece];
3086        goto gen_simd;
3087    case INDEX_op_shrv_vec:
3088        insn = shrv_insn[vece];
3089        goto gen_simd;
3090    case INDEX_op_sarv_vec:
3091        insn = sarv_insn[vece];
3092        goto gen_simd;
3093    case INDEX_op_rotlv_vec:
3094        insn = rotlv_insn[vece];
3095        goto gen_simd;
3096    case INDEX_op_rotrv_vec:
3097        insn = rotrv_insn[vece];
3098        goto gen_simd;
3099    case INDEX_op_shls_vec:
3100        insn = shls_insn[vece];
3101        goto gen_simd;
3102    case INDEX_op_shrs_vec:
3103        insn = shrs_insn[vece];
3104        goto gen_simd;
3105    case INDEX_op_sars_vec:
3106        insn = sars_insn[vece];
3107        goto gen_simd;
3108    case INDEX_op_x86_punpckl_vec:
3109        insn = punpckl_insn[vece];
3110        goto gen_simd;
3111    case INDEX_op_x86_punpckh_vec:
3112        insn = punpckh_insn[vece];
3113        goto gen_simd;
3114    case INDEX_op_x86_packss_vec:
3115        insn = packss_insn[vece];
3116        goto gen_simd;
3117    case INDEX_op_x86_packus_vec:
3118        insn = packus_insn[vece];
3119        goto gen_simd;
3120    case INDEX_op_x86_vpshldv_vec:
3121        insn = vpshldv_insn[vece];
3122        a1 = a2;
3123        a2 = args[3];
3124        goto gen_simd;
3125    case INDEX_op_x86_vpshrdv_vec:
3126        insn = vpshrdv_insn[vece];
3127        a1 = a2;
3128        a2 = args[3];
3129        goto gen_simd;
3130#if TCG_TARGET_REG_BITS == 32
3131    case INDEX_op_dup2_vec:
3132        /* First merge the two 32-bit inputs to a single 64-bit element. */
3133        tcg_out_vex_modrm(s, OPC_PUNPCKLDQ, a0, a1, a2);
3134        /* Then replicate the 64-bit elements across the rest of the vector. */
3135        if (type != TCG_TYPE_V64) {
3136            tcg_out_dup_vec(s, type, MO_64, a0, a0);
3137        }
3138        break;
3139#endif
3140    case INDEX_op_abs_vec:
3141        insn = abs_insn[vece];
3142        a2 = a1;
3143        a1 = 0;
3144        goto gen_simd;
3145    gen_simd:
3146        tcg_debug_assert(insn != OPC_UD2);
3147        if (type == TCG_TYPE_V256) {
3148            insn |= P_VEXL;
3149        }
3150        tcg_out_vex_modrm(s, insn, a0, a1, a2);
3151        break;
3152
3153    case INDEX_op_cmp_vec:
3154        sub = args[3];
3155        if (sub == TCG_COND_EQ) {
3156            insn = cmpeq_insn[vece];
3157        } else if (sub == TCG_COND_GT) {
3158            insn = cmpgt_insn[vece];
3159        } else {
3160            g_assert_not_reached();
3161        }
3162        goto gen_simd;
3163
3164    case INDEX_op_andc_vec:
3165        insn = OPC_PANDN;
3166        if (type == TCG_TYPE_V256) {
3167            insn |= P_VEXL;
3168        }
3169        tcg_out_vex_modrm(s, insn, a0, a2, a1);
3170        break;
3171
3172    case INDEX_op_shli_vec:
3173        insn = shift_imm_insn[vece];
3174        sub = 6;
3175        goto gen_shift;
3176    case INDEX_op_shri_vec:
3177        insn = shift_imm_insn[vece];
3178        sub = 2;
3179        goto gen_shift;
3180    case INDEX_op_sari_vec:
3181        if (vece == MO_64) {
3182            insn = OPC_PSHIFTD_Ib | P_VEXW | P_EVEX;
3183        } else {
3184            insn = shift_imm_insn[vece];
3185        }
3186        sub = 4;
3187        goto gen_shift;
3188    case INDEX_op_rotli_vec:
3189        insn = OPC_PSHIFTD_Ib | P_EVEX;  /* VPROL[DQ] */
3190        if (vece == MO_64) {
3191            insn |= P_VEXW;
3192        }
3193        sub = 1;
3194        goto gen_shift;
3195    gen_shift:
3196        tcg_debug_assert(vece != MO_8);
3197        if (type == TCG_TYPE_V256) {
3198            insn |= P_VEXL;
3199        }
3200        tcg_out_vex_modrm(s, insn, sub, a0, a1);
3201        tcg_out8(s, a2);
3202        break;
3203
3204    case INDEX_op_ld_vec:
3205        tcg_out_ld(s, type, a0, a1, a2);
3206        break;
3207    case INDEX_op_st_vec:
3208        tcg_out_st(s, type, a0, a1, a2);
3209        break;
3210    case INDEX_op_dupm_vec:
3211        tcg_out_dupm_vec(s, type, vece, a0, a1, a2);
3212        break;
3213
3214    case INDEX_op_x86_shufps_vec:
3215        insn = OPC_SHUFPS;
3216        sub = args[3];
3217        goto gen_simd_imm8;
3218    case INDEX_op_x86_blend_vec:
3219        if (vece == MO_16) {
3220            insn = OPC_PBLENDW;
3221        } else if (vece == MO_32) {
3222            insn = (have_avx2 ? OPC_VPBLENDD : OPC_BLENDPS);
3223        } else {
3224            g_assert_not_reached();
3225        }
3226        sub = args[3];
3227        goto gen_simd_imm8;
3228    case INDEX_op_x86_vperm2i128_vec:
3229        insn = OPC_VPERM2I128;
3230        sub = args[3];
3231        goto gen_simd_imm8;
3232    case INDEX_op_x86_vpshldi_vec:
3233        insn = vpshldi_insn[vece];
3234        sub = args[3];
3235        goto gen_simd_imm8;
3236
3237    case INDEX_op_not_vec:
3238        insn = OPC_VPTERNLOGQ;
3239        a2 = a1;
3240        sub = 0x33; /* !B */
3241        goto gen_simd_imm8;
3242    case INDEX_op_nor_vec:
3243        insn = OPC_VPTERNLOGQ;
3244        sub = 0x11; /* norCB */
3245        goto gen_simd_imm8;
3246    case INDEX_op_nand_vec:
3247        insn = OPC_VPTERNLOGQ;
3248        sub = 0x77; /* nandCB */
3249        goto gen_simd_imm8;
3250    case INDEX_op_eqv_vec:
3251        insn = OPC_VPTERNLOGQ;
3252        sub = 0x99; /* xnorCB */
3253        goto gen_simd_imm8;
3254    case INDEX_op_orc_vec:
3255        insn = OPC_VPTERNLOGQ;
3256        sub = 0xdd; /* orB!C */
3257        goto gen_simd_imm8;
3258
3259    case INDEX_op_bitsel_vec:
3260        insn = OPC_VPTERNLOGQ;
3261        a3 = args[3];
3262        if (a0 == a1) {
3263            a1 = a2;
3264            a2 = a3;
3265            sub = 0xca; /* A?B:C */
3266        } else if (a0 == a2) {
3267            a2 = a3;
3268            sub = 0xe2; /* B?A:C */
3269        } else {
3270            tcg_out_mov(s, type, a0, a3);
3271            sub = 0xb8; /* B?C:A */
3272        }
3273        goto gen_simd_imm8;
3274
3275    gen_simd_imm8:
3276        tcg_debug_assert(insn != OPC_UD2);
3277        if (type == TCG_TYPE_V256) {
3278            insn |= P_VEXL;
3279        }
3280        tcg_out_vex_modrm(s, insn, a0, a1, a2);
3281        tcg_out8(s, sub);
3282        break;
3283
3284    case INDEX_op_x86_vpblendvb_vec:
3285        insn = OPC_VPBLENDVB;
3286        if (type == TCG_TYPE_V256) {
3287            insn |= P_VEXL;
3288        }
3289        tcg_out_vex_modrm(s, insn, a0, a1, a2);
3290        tcg_out8(s, args[3] << 4);
3291        break;
3292
3293    case INDEX_op_x86_psrldq_vec:
3294        tcg_out_vex_modrm(s, OPC_GRP14, 3, a0, a1);
3295        tcg_out8(s, a2);
3296        break;
3297
3298    case INDEX_op_mov_vec:  /* Always emitted via tcg_out_mov.  */
3299    case INDEX_op_dup_vec:  /* Always emitted via tcg_out_dup_vec.  */
3300    default:
3301        g_assert_not_reached();
3302    }
3303}
3304
3305static TCGConstraintSetIndex tcg_target_op_def(TCGOpcode op)
3306{
3307    switch (op) {
3308    case INDEX_op_goto_ptr:
3309        return C_O0_I1(r);
3310
3311    case INDEX_op_ld8u_i32:
3312    case INDEX_op_ld8u_i64:
3313    case INDEX_op_ld8s_i32:
3314    case INDEX_op_ld8s_i64:
3315    case INDEX_op_ld16u_i32:
3316    case INDEX_op_ld16u_i64:
3317    case INDEX_op_ld16s_i32:
3318    case INDEX_op_ld16s_i64:
3319    case INDEX_op_ld_i32:
3320    case INDEX_op_ld32u_i64:
3321    case INDEX_op_ld32s_i64:
3322    case INDEX_op_ld_i64:
3323        return C_O1_I1(r, r);
3324
3325    case INDEX_op_st8_i32:
3326    case INDEX_op_st8_i64:
3327        return C_O0_I2(qi, r);
3328
3329    case INDEX_op_st16_i32:
3330    case INDEX_op_st16_i64:
3331    case INDEX_op_st_i32:
3332    case INDEX_op_st32_i64:
3333        return C_O0_I2(ri, r);
3334
3335    case INDEX_op_st_i64:
3336        return C_O0_I2(re, r);
3337
3338    case INDEX_op_add_i32:
3339    case INDEX_op_add_i64:
3340        return C_O1_I2(r, r, re);
3341
3342    case INDEX_op_sub_i32:
3343    case INDEX_op_sub_i64:
3344    case INDEX_op_mul_i32:
3345    case INDEX_op_mul_i64:
3346    case INDEX_op_or_i32:
3347    case INDEX_op_or_i64:
3348    case INDEX_op_xor_i32:
3349    case INDEX_op_xor_i64:
3350        return C_O1_I2(r, 0, re);
3351
3352    case INDEX_op_and_i32:
3353    case INDEX_op_and_i64:
3354        return C_O1_I2(r, 0, reZ);
3355
3356    case INDEX_op_andc_i32:
3357    case INDEX_op_andc_i64:
3358        return C_O1_I2(r, r, rI);
3359
3360    case INDEX_op_shl_i32:
3361    case INDEX_op_shl_i64:
3362    case INDEX_op_shr_i32:
3363    case INDEX_op_shr_i64:
3364    case INDEX_op_sar_i32:
3365    case INDEX_op_sar_i64:
3366        return have_bmi2 ? C_O1_I2(r, r, ri) : C_O1_I2(r, 0, ci);
3367
3368    case INDEX_op_rotl_i32:
3369    case INDEX_op_rotl_i64:
3370    case INDEX_op_rotr_i32:
3371    case INDEX_op_rotr_i64:
3372        return C_O1_I2(r, 0, ci);
3373
3374    case INDEX_op_brcond_i32:
3375    case INDEX_op_brcond_i64:
3376        return C_O0_I2(r, re);
3377
3378    case INDEX_op_bswap16_i32:
3379    case INDEX_op_bswap16_i64:
3380    case INDEX_op_bswap32_i32:
3381    case INDEX_op_bswap32_i64:
3382    case INDEX_op_bswap64_i64:
3383    case INDEX_op_neg_i32:
3384    case INDEX_op_neg_i64:
3385    case INDEX_op_not_i32:
3386    case INDEX_op_not_i64:
3387    case INDEX_op_extrh_i64_i32:
3388        return C_O1_I1(r, 0);
3389
3390    case INDEX_op_ext8s_i32:
3391    case INDEX_op_ext8s_i64:
3392    case INDEX_op_ext8u_i32:
3393    case INDEX_op_ext8u_i64:
3394        return C_O1_I1(r, q);
3395
3396    case INDEX_op_ext16s_i32:
3397    case INDEX_op_ext16s_i64:
3398    case INDEX_op_ext16u_i32:
3399    case INDEX_op_ext16u_i64:
3400    case INDEX_op_ext32s_i64:
3401    case INDEX_op_ext32u_i64:
3402    case INDEX_op_ext_i32_i64:
3403    case INDEX_op_extu_i32_i64:
3404    case INDEX_op_extrl_i64_i32:
3405    case INDEX_op_extract_i32:
3406    case INDEX_op_extract_i64:
3407    case INDEX_op_sextract_i32:
3408    case INDEX_op_ctpop_i32:
3409    case INDEX_op_ctpop_i64:
3410        return C_O1_I1(r, r);
3411
3412    case INDEX_op_extract2_i32:
3413    case INDEX_op_extract2_i64:
3414        return C_O1_I2(r, 0, r);
3415
3416    case INDEX_op_deposit_i32:
3417    case INDEX_op_deposit_i64:
3418        return C_O1_I2(q, 0, qi);
3419
3420    case INDEX_op_setcond_i32:
3421    case INDEX_op_setcond_i64:
3422    case INDEX_op_negsetcond_i32:
3423    case INDEX_op_negsetcond_i64:
3424        return C_O1_I2(q, r, re);
3425
3426    case INDEX_op_movcond_i32:
3427    case INDEX_op_movcond_i64:
3428        return C_O1_I4(r, r, re, r, 0);
3429
3430    case INDEX_op_div2_i32:
3431    case INDEX_op_div2_i64:
3432    case INDEX_op_divu2_i32:
3433    case INDEX_op_divu2_i64:
3434        return C_O2_I3(a, d, 0, 1, r);
3435
3436    case INDEX_op_mulu2_i32:
3437    case INDEX_op_mulu2_i64:
3438    case INDEX_op_muls2_i32:
3439    case INDEX_op_muls2_i64:
3440        return C_O2_I2(a, d, a, r);
3441
3442    case INDEX_op_add2_i32:
3443    case INDEX_op_add2_i64:
3444    case INDEX_op_sub2_i32:
3445    case INDEX_op_sub2_i64:
3446        return C_N1_O1_I4(r, r, 0, 1, re, re);
3447
3448    case INDEX_op_ctz_i32:
3449    case INDEX_op_ctz_i64:
3450        return have_bmi1 ? C_N1_I2(r, r, rW) : C_N1_I2(r, r, r);
3451
3452    case INDEX_op_clz_i32:
3453    case INDEX_op_clz_i64:
3454        return have_lzcnt ? C_N1_I2(r, r, rW) : C_N1_I2(r, r, r);
3455
3456    case INDEX_op_qemu_ld_a32_i32:
3457        return C_O1_I1(r, L);
3458    case INDEX_op_qemu_ld_a64_i32:
3459        return TCG_TARGET_REG_BITS == 64 ? C_O1_I1(r, L) : C_O1_I2(r, L, L);
3460
3461    case INDEX_op_qemu_st_a32_i32:
3462        return C_O0_I2(L, L);
3463    case INDEX_op_qemu_st_a64_i32:
3464        return TCG_TARGET_REG_BITS == 64 ? C_O0_I2(L, L) : C_O0_I3(L, L, L);
3465    case INDEX_op_qemu_st8_a32_i32:
3466        return C_O0_I2(s, L);
3467    case INDEX_op_qemu_st8_a64_i32:
3468        return TCG_TARGET_REG_BITS == 64 ? C_O0_I2(s, L) : C_O0_I3(s, L, L);
3469
3470    case INDEX_op_qemu_ld_a32_i64:
3471        return TCG_TARGET_REG_BITS == 64 ? C_O1_I1(r, L) : C_O2_I1(r, r, L);
3472    case INDEX_op_qemu_ld_a64_i64:
3473        return TCG_TARGET_REG_BITS == 64 ? C_O1_I1(r, L) : C_O2_I2(r, r, L, L);
3474
3475    case INDEX_op_qemu_st_a32_i64:
3476        return TCG_TARGET_REG_BITS == 64 ? C_O0_I2(L, L) : C_O0_I3(L, L, L);
3477    case INDEX_op_qemu_st_a64_i64:
3478        return TCG_TARGET_REG_BITS == 64 ? C_O0_I2(L, L) : C_O0_I4(L, L, L, L);
3479
3480    case INDEX_op_qemu_ld_a32_i128:
3481    case INDEX_op_qemu_ld_a64_i128:
3482        tcg_debug_assert(TCG_TARGET_REG_BITS == 64);
3483        return C_O2_I1(r, r, L);
3484    case INDEX_op_qemu_st_a32_i128:
3485    case INDEX_op_qemu_st_a64_i128:
3486        tcg_debug_assert(TCG_TARGET_REG_BITS == 64);
3487        return C_O0_I3(L, L, L);
3488
3489    case INDEX_op_brcond2_i32:
3490        return C_O0_I4(r, r, ri, ri);
3491
3492    case INDEX_op_setcond2_i32:
3493        return C_O1_I4(r, r, r, ri, ri);
3494
3495    case INDEX_op_ld_vec:
3496    case INDEX_op_dupm_vec:
3497        return C_O1_I1(x, r);
3498
3499    case INDEX_op_st_vec:
3500        return C_O0_I2(x, r);
3501
3502    case INDEX_op_add_vec:
3503    case INDEX_op_sub_vec:
3504    case INDEX_op_mul_vec:
3505    case INDEX_op_and_vec:
3506    case INDEX_op_or_vec:
3507    case INDEX_op_xor_vec:
3508    case INDEX_op_andc_vec:
3509    case INDEX_op_orc_vec:
3510    case INDEX_op_nand_vec:
3511    case INDEX_op_nor_vec:
3512    case INDEX_op_eqv_vec:
3513    case INDEX_op_ssadd_vec:
3514    case INDEX_op_usadd_vec:
3515    case INDEX_op_sssub_vec:
3516    case INDEX_op_ussub_vec:
3517    case INDEX_op_smin_vec:
3518    case INDEX_op_umin_vec:
3519    case INDEX_op_smax_vec:
3520    case INDEX_op_umax_vec:
3521    case INDEX_op_shlv_vec:
3522    case INDEX_op_shrv_vec:
3523    case INDEX_op_sarv_vec:
3524    case INDEX_op_rotlv_vec:
3525    case INDEX_op_rotrv_vec:
3526    case INDEX_op_shls_vec:
3527    case INDEX_op_shrs_vec:
3528    case INDEX_op_sars_vec:
3529    case INDEX_op_cmp_vec:
3530    case INDEX_op_x86_shufps_vec:
3531    case INDEX_op_x86_blend_vec:
3532    case INDEX_op_x86_packss_vec:
3533    case INDEX_op_x86_packus_vec:
3534    case INDEX_op_x86_vperm2i128_vec:
3535    case INDEX_op_x86_punpckl_vec:
3536    case INDEX_op_x86_punpckh_vec:
3537    case INDEX_op_x86_vpshldi_vec:
3538#if TCG_TARGET_REG_BITS == 32
3539    case INDEX_op_dup2_vec:
3540#endif
3541        return C_O1_I2(x, x, x);
3542
3543    case INDEX_op_abs_vec:
3544    case INDEX_op_dup_vec:
3545    case INDEX_op_not_vec:
3546    case INDEX_op_shli_vec:
3547    case INDEX_op_shri_vec:
3548    case INDEX_op_sari_vec:
3549    case INDEX_op_rotli_vec:
3550    case INDEX_op_x86_psrldq_vec:
3551        return C_O1_I1(x, x);
3552
3553    case INDEX_op_x86_vpshldv_vec:
3554    case INDEX_op_x86_vpshrdv_vec:
3555        return C_O1_I3(x, 0, x, x);
3556
3557    case INDEX_op_bitsel_vec:
3558    case INDEX_op_x86_vpblendvb_vec:
3559        return C_O1_I3(x, x, x, x);
3560
3561    default:
3562        g_assert_not_reached();
3563    }
3564}
3565
3566int tcg_can_emit_vec_op(TCGOpcode opc, TCGType type, unsigned vece)
3567{
3568    switch (opc) {
3569    case INDEX_op_add_vec:
3570    case INDEX_op_sub_vec:
3571    case INDEX_op_and_vec:
3572    case INDEX_op_or_vec:
3573    case INDEX_op_xor_vec:
3574    case INDEX_op_andc_vec:
3575    case INDEX_op_orc_vec:
3576    case INDEX_op_nand_vec:
3577    case INDEX_op_nor_vec:
3578    case INDEX_op_eqv_vec:
3579    case INDEX_op_not_vec:
3580    case INDEX_op_bitsel_vec:
3581        return 1;
3582    case INDEX_op_cmp_vec:
3583    case INDEX_op_cmpsel_vec:
3584        return -1;
3585
3586    case INDEX_op_rotli_vec:
3587        return have_avx512vl && vece >= MO_32 ? 1 : -1;
3588
3589    case INDEX_op_shli_vec:
3590    case INDEX_op_shri_vec:
3591        /* We must expand the operation for MO_8.  */
3592        return vece == MO_8 ? -1 : 1;
3593
3594    case INDEX_op_sari_vec:
3595        switch (vece) {
3596        case MO_8:
3597            return -1;
3598        case MO_16:
3599        case MO_32:
3600            return 1;
3601        case MO_64:
3602            if (have_avx512vl) {
3603                return 1;
3604            }
3605            /*
3606             * We can emulate this for MO_64, but it does not pay off
3607             * unless we're producing at least 4 values.
3608             */
3609            return type >= TCG_TYPE_V256 ? -1 : 0;
3610        }
3611        return 0;
3612
3613    case INDEX_op_shls_vec:
3614    case INDEX_op_shrs_vec:
3615        return vece >= MO_16;
3616    case INDEX_op_sars_vec:
3617        switch (vece) {
3618        case MO_16:
3619        case MO_32:
3620            return 1;
3621        case MO_64:
3622            return have_avx512vl;
3623        }
3624        return 0;
3625    case INDEX_op_rotls_vec:
3626        return vece >= MO_16 ? -1 : 0;
3627
3628    case INDEX_op_shlv_vec:
3629    case INDEX_op_shrv_vec:
3630        switch (vece) {
3631        case MO_16:
3632            return have_avx512bw;
3633        case MO_32:
3634        case MO_64:
3635            return have_avx2;
3636        }
3637        return 0;
3638    case INDEX_op_sarv_vec:
3639        switch (vece) {
3640        case MO_16:
3641            return have_avx512bw;
3642        case MO_32:
3643            return have_avx2;
3644        case MO_64:
3645            return have_avx512vl;
3646        }
3647        return 0;
3648    case INDEX_op_rotlv_vec:
3649    case INDEX_op_rotrv_vec:
3650        switch (vece) {
3651        case MO_16:
3652            return have_avx512vbmi2 ? -1 : 0;
3653        case MO_32:
3654        case MO_64:
3655            return have_avx512vl ? 1 : have_avx2 ? -1 : 0;
3656        }
3657        return 0;
3658
3659    case INDEX_op_mul_vec:
3660        switch (vece) {
3661        case MO_8:
3662            return -1;
3663        case MO_64:
3664            return have_avx512dq;
3665        }
3666        return 1;
3667
3668    case INDEX_op_ssadd_vec:
3669    case INDEX_op_usadd_vec:
3670    case INDEX_op_sssub_vec:
3671    case INDEX_op_ussub_vec:
3672        return vece <= MO_16;
3673    case INDEX_op_smin_vec:
3674    case INDEX_op_smax_vec:
3675    case INDEX_op_umin_vec:
3676    case INDEX_op_umax_vec:
3677    case INDEX_op_abs_vec:
3678        return vece <= MO_32 || have_avx512vl;
3679
3680    default:
3681        return 0;
3682    }
3683}
3684
3685static void expand_vec_shi(TCGType type, unsigned vece, TCGOpcode opc,
3686                           TCGv_vec v0, TCGv_vec v1, TCGArg imm)
3687{
3688    TCGv_vec t1, t2;
3689
3690    tcg_debug_assert(vece == MO_8);
3691
3692    t1 = tcg_temp_new_vec(type);
3693    t2 = tcg_temp_new_vec(type);
3694
3695    /*
3696     * Unpack to W, shift, and repack.  Tricky bits:
3697     * (1) Use punpck*bw x,x to produce DDCCBBAA,
3698     *     i.e. duplicate in other half of the 16-bit lane.
3699     * (2) For right-shift, add 8 so that the high half of the lane
3700     *     becomes zero.  For left-shift, and left-rotate, we must
3701     *     shift up and down again.
3702     * (3) Step 2 leaves high half zero such that PACKUSWB
3703     *     (pack with unsigned saturation) does not modify
3704     *     the quantity.
3705     */
3706    vec_gen_3(INDEX_op_x86_punpckl_vec, type, MO_8,
3707              tcgv_vec_arg(t1), tcgv_vec_arg(v1), tcgv_vec_arg(v1));
3708    vec_gen_3(INDEX_op_x86_punpckh_vec, type, MO_8,
3709              tcgv_vec_arg(t2), tcgv_vec_arg(v1), tcgv_vec_arg(v1));
3710
3711    if (opc != INDEX_op_rotli_vec) {
3712        imm += 8;
3713    }
3714    if (opc == INDEX_op_shri_vec) {
3715        tcg_gen_shri_vec(MO_16, t1, t1, imm);
3716        tcg_gen_shri_vec(MO_16, t2, t2, imm);
3717    } else {
3718        tcg_gen_shli_vec(MO_16, t1, t1, imm);
3719        tcg_gen_shli_vec(MO_16, t2, t2, imm);
3720        tcg_gen_shri_vec(MO_16, t1, t1, 8);
3721        tcg_gen_shri_vec(MO_16, t2, t2, 8);
3722    }
3723
3724    vec_gen_3(INDEX_op_x86_packus_vec, type, MO_8,
3725              tcgv_vec_arg(v0), tcgv_vec_arg(t1), tcgv_vec_arg(t2));
3726    tcg_temp_free_vec(t1);
3727    tcg_temp_free_vec(t2);
3728}
3729
3730static void expand_vec_sari(TCGType type, unsigned vece,
3731                            TCGv_vec v0, TCGv_vec v1, TCGArg imm)
3732{
3733    TCGv_vec t1, t2;
3734
3735    switch (vece) {
3736    case MO_8:
3737        /* Unpack to W, shift, and repack, as in expand_vec_shi.  */
3738        t1 = tcg_temp_new_vec(type);
3739        t2 = tcg_temp_new_vec(type);
3740        vec_gen_3(INDEX_op_x86_punpckl_vec, type, MO_8,
3741                  tcgv_vec_arg(t1), tcgv_vec_arg(v1), tcgv_vec_arg(v1));
3742        vec_gen_3(INDEX_op_x86_punpckh_vec, type, MO_8,
3743                  tcgv_vec_arg(t2), tcgv_vec_arg(v1), tcgv_vec_arg(v1));
3744        tcg_gen_sari_vec(MO_16, t1, t1, imm + 8);
3745        tcg_gen_sari_vec(MO_16, t2, t2, imm + 8);
3746        vec_gen_3(INDEX_op_x86_packss_vec, type, MO_8,
3747                  tcgv_vec_arg(v0), tcgv_vec_arg(t1), tcgv_vec_arg(t2));
3748        tcg_temp_free_vec(t1);
3749        tcg_temp_free_vec(t2);
3750        break;
3751
3752    case MO_64:
3753        t1 = tcg_temp_new_vec(type);
3754        if (imm <= 32) {
3755            /*
3756             * We can emulate a small sign extend by performing an arithmetic
3757             * 32-bit shift and overwriting the high half of a 64-bit logical
3758             * shift.  Note that the ISA says shift of 32 is valid, but TCG
3759             * does not, so we have to bound the smaller shift -- we get the
3760             * same result in the high half either way.
3761             */
3762            tcg_gen_sari_vec(MO_32, t1, v1, MIN(imm, 31));
3763            tcg_gen_shri_vec(MO_64, v0, v1, imm);
3764            vec_gen_4(INDEX_op_x86_blend_vec, type, MO_32,
3765                      tcgv_vec_arg(v0), tcgv_vec_arg(v0),
3766                      tcgv_vec_arg(t1), 0xaa);
3767        } else {
3768            /* Otherwise we will need to use a compare vs 0 to produce
3769             * the sign-extend, shift and merge.
3770             */
3771            tcg_gen_cmp_vec(TCG_COND_GT, MO_64, t1,
3772                            tcg_constant_vec(type, MO_64, 0), v1);
3773            tcg_gen_shri_vec(MO_64, v0, v1, imm);
3774            tcg_gen_shli_vec(MO_64, t1, t1, 64 - imm);
3775            tcg_gen_or_vec(MO_64, v0, v0, t1);
3776        }
3777        tcg_temp_free_vec(t1);
3778        break;
3779
3780    default:
3781        g_assert_not_reached();
3782    }
3783}
3784
3785static void expand_vec_rotli(TCGType type, unsigned vece,
3786                             TCGv_vec v0, TCGv_vec v1, TCGArg imm)
3787{
3788    TCGv_vec t;
3789
3790    if (vece == MO_8) {
3791        expand_vec_shi(type, vece, INDEX_op_rotli_vec, v0, v1, imm);
3792        return;
3793    }
3794
3795    if (have_avx512vbmi2) {
3796        vec_gen_4(INDEX_op_x86_vpshldi_vec, type, vece,
3797                  tcgv_vec_arg(v0), tcgv_vec_arg(v1), tcgv_vec_arg(v1), imm);
3798        return;
3799    }
3800
3801    t = tcg_temp_new_vec(type);
3802    tcg_gen_shli_vec(vece, t, v1, imm);
3803    tcg_gen_shri_vec(vece, v0, v1, (8 << vece) - imm);
3804    tcg_gen_or_vec(vece, v0, v0, t);
3805    tcg_temp_free_vec(t);
3806}
3807
3808static void expand_vec_rotv(TCGType type, unsigned vece, TCGv_vec v0,
3809                            TCGv_vec v1, TCGv_vec sh, bool right)
3810{
3811    TCGv_vec t;
3812
3813    if (have_avx512vbmi2) {
3814        vec_gen_4(right ? INDEX_op_x86_vpshrdv_vec : INDEX_op_x86_vpshldv_vec,
3815                  type, vece, tcgv_vec_arg(v0), tcgv_vec_arg(v1),
3816                  tcgv_vec_arg(v1), tcgv_vec_arg(sh));
3817        return;
3818    }
3819
3820    t = tcg_temp_new_vec(type);
3821    tcg_gen_dupi_vec(vece, t, 8 << vece);
3822    tcg_gen_sub_vec(vece, t, t, sh);
3823    if (right) {
3824        tcg_gen_shlv_vec(vece, t, v1, t);
3825        tcg_gen_shrv_vec(vece, v0, v1, sh);
3826    } else {
3827        tcg_gen_shrv_vec(vece, t, v1, t);
3828        tcg_gen_shlv_vec(vece, v0, v1, sh);
3829    }
3830    tcg_gen_or_vec(vece, v0, v0, t);
3831    tcg_temp_free_vec(t);
3832}
3833
3834static void expand_vec_rotls(TCGType type, unsigned vece,
3835                             TCGv_vec v0, TCGv_vec v1, TCGv_i32 lsh)
3836{
3837    TCGv_vec t = tcg_temp_new_vec(type);
3838
3839    tcg_debug_assert(vece != MO_8);
3840
3841    if (vece >= MO_32 ? have_avx512vl : have_avx512vbmi2) {
3842        tcg_gen_dup_i32_vec(vece, t, lsh);
3843        if (vece >= MO_32) {
3844            tcg_gen_rotlv_vec(vece, v0, v1, t);
3845        } else {
3846            expand_vec_rotv(type, vece, v0, v1, t, false);
3847        }
3848    } else {
3849        TCGv_i32 rsh = tcg_temp_new_i32();
3850
3851        tcg_gen_neg_i32(rsh, lsh);
3852        tcg_gen_andi_i32(rsh, rsh, (8 << vece) - 1);
3853        tcg_gen_shls_vec(vece, t, v1, lsh);
3854        tcg_gen_shrs_vec(vece, v0, v1, rsh);
3855        tcg_gen_or_vec(vece, v0, v0, t);
3856
3857        tcg_temp_free_i32(rsh);
3858    }
3859
3860    tcg_temp_free_vec(t);
3861}
3862
3863static void expand_vec_mul(TCGType type, unsigned vece,
3864                           TCGv_vec v0, TCGv_vec v1, TCGv_vec v2)
3865{
3866    TCGv_vec t1, t2, t3, t4, zero;
3867
3868    tcg_debug_assert(vece == MO_8);
3869
3870    /*
3871     * Unpack v1 bytes to words, 0 | x.
3872     * Unpack v2 bytes to words, y | 0.
3873     * This leaves the 8-bit result, x * y, with 8 bits of right padding.
3874     * Shift logical right by 8 bits to clear the high 8 bytes before
3875     * using an unsigned saturated pack.
3876     *
3877     * The difference between the V64, V128 and V256 cases is merely how
3878     * we distribute the expansion between temporaries.
3879     */
3880    switch (type) {
3881    case TCG_TYPE_V64:
3882        t1 = tcg_temp_new_vec(TCG_TYPE_V128);
3883        t2 = tcg_temp_new_vec(TCG_TYPE_V128);
3884        zero = tcg_constant_vec(TCG_TYPE_V128, MO_8, 0);
3885        vec_gen_3(INDEX_op_x86_punpckl_vec, TCG_TYPE_V128, MO_8,
3886                  tcgv_vec_arg(t1), tcgv_vec_arg(v1), tcgv_vec_arg(zero));
3887        vec_gen_3(INDEX_op_x86_punpckl_vec, TCG_TYPE_V128, MO_8,
3888                  tcgv_vec_arg(t2), tcgv_vec_arg(zero), tcgv_vec_arg(v2));
3889        tcg_gen_mul_vec(MO_16, t1, t1, t2);
3890        tcg_gen_shri_vec(MO_16, t1, t1, 8);
3891        vec_gen_3(INDEX_op_x86_packus_vec, TCG_TYPE_V128, MO_8,
3892                  tcgv_vec_arg(v0), tcgv_vec_arg(t1), tcgv_vec_arg(t1));
3893        tcg_temp_free_vec(t1);
3894        tcg_temp_free_vec(t2);
3895        break;
3896
3897    case TCG_TYPE_V128:
3898    case TCG_TYPE_V256:
3899        t1 = tcg_temp_new_vec(type);
3900        t2 = tcg_temp_new_vec(type);
3901        t3 = tcg_temp_new_vec(type);
3902        t4 = tcg_temp_new_vec(type);
3903        zero = tcg_constant_vec(TCG_TYPE_V128, MO_8, 0);
3904        vec_gen_3(INDEX_op_x86_punpckl_vec, type, MO_8,
3905                  tcgv_vec_arg(t1), tcgv_vec_arg(v1), tcgv_vec_arg(zero));
3906        vec_gen_3(INDEX_op_x86_punpckl_vec, type, MO_8,
3907                  tcgv_vec_arg(t2), tcgv_vec_arg(zero), tcgv_vec_arg(v2));
3908        vec_gen_3(INDEX_op_x86_punpckh_vec, type, MO_8,
3909                  tcgv_vec_arg(t3), tcgv_vec_arg(v1), tcgv_vec_arg(zero));
3910        vec_gen_3(INDEX_op_x86_punpckh_vec, type, MO_8,
3911                  tcgv_vec_arg(t4), tcgv_vec_arg(zero), tcgv_vec_arg(v2));
3912        tcg_gen_mul_vec(MO_16, t1, t1, t2);
3913        tcg_gen_mul_vec(MO_16, t3, t3, t4);
3914        tcg_gen_shri_vec(MO_16, t1, t1, 8);
3915        tcg_gen_shri_vec(MO_16, t3, t3, 8);
3916        vec_gen_3(INDEX_op_x86_packus_vec, type, MO_8,
3917                  tcgv_vec_arg(v0), tcgv_vec_arg(t1), tcgv_vec_arg(t3));
3918        tcg_temp_free_vec(t1);
3919        tcg_temp_free_vec(t2);
3920        tcg_temp_free_vec(t3);
3921        tcg_temp_free_vec(t4);
3922        break;
3923
3924    default:
3925        g_assert_not_reached();
3926    }
3927}
3928
3929static bool expand_vec_cmp_noinv(TCGType type, unsigned vece, TCGv_vec v0,
3930                                 TCGv_vec v1, TCGv_vec v2, TCGCond cond)
3931{
3932    enum {
3933        NEED_INV  = 1,
3934        NEED_SWAP = 2,
3935        NEED_BIAS = 4,
3936        NEED_UMIN = 8,
3937        NEED_UMAX = 16,
3938    };
3939    TCGv_vec t1, t2, t3;
3940    uint8_t fixup;
3941
3942    switch (cond) {
3943    case TCG_COND_EQ:
3944    case TCG_COND_GT:
3945        fixup = 0;
3946        break;
3947    case TCG_COND_NE:
3948    case TCG_COND_LE:
3949        fixup = NEED_INV;
3950        break;
3951    case TCG_COND_LT:
3952        fixup = NEED_SWAP;
3953        break;
3954    case TCG_COND_GE:
3955        fixup = NEED_SWAP | NEED_INV;
3956        break;
3957    case TCG_COND_LEU:
3958        if (tcg_can_emit_vec_op(INDEX_op_umin_vec, type, vece)) {
3959            fixup = NEED_UMIN;
3960        } else {
3961            fixup = NEED_BIAS | NEED_INV;
3962        }
3963        break;
3964    case TCG_COND_GTU:
3965        if (tcg_can_emit_vec_op(INDEX_op_umin_vec, type, vece)) {
3966            fixup = NEED_UMIN | NEED_INV;
3967        } else {
3968            fixup = NEED_BIAS;
3969        }
3970        break;
3971    case TCG_COND_GEU:
3972        if (tcg_can_emit_vec_op(INDEX_op_umax_vec, type, vece)) {
3973            fixup = NEED_UMAX;
3974        } else {
3975            fixup = NEED_BIAS | NEED_SWAP | NEED_INV;
3976        }
3977        break;
3978    case TCG_COND_LTU:
3979        if (tcg_can_emit_vec_op(INDEX_op_umax_vec, type, vece)) {
3980            fixup = NEED_UMAX | NEED_INV;
3981        } else {
3982            fixup = NEED_BIAS | NEED_SWAP;
3983        }
3984        break;
3985    default:
3986        g_assert_not_reached();
3987    }
3988
3989    if (fixup & NEED_INV) {
3990        cond = tcg_invert_cond(cond);
3991    }
3992    if (fixup & NEED_SWAP) {
3993        t1 = v1, v1 = v2, v2 = t1;
3994        cond = tcg_swap_cond(cond);
3995    }
3996
3997    t1 = t2 = NULL;
3998    if (fixup & (NEED_UMIN | NEED_UMAX)) {
3999        t1 = tcg_temp_new_vec(type);
4000        if (fixup & NEED_UMIN) {
4001            tcg_gen_umin_vec(vece, t1, v1, v2);
4002        } else {
4003            tcg_gen_umax_vec(vece, t1, v1, v2);
4004        }
4005        v2 = t1;
4006        cond = TCG_COND_EQ;
4007    } else if (fixup & NEED_BIAS) {
4008        t1 = tcg_temp_new_vec(type);
4009        t2 = tcg_temp_new_vec(type);
4010        t3 = tcg_constant_vec(type, vece, 1ull << ((8 << vece) - 1));
4011        tcg_gen_sub_vec(vece, t1, v1, t3);
4012        tcg_gen_sub_vec(vece, t2, v2, t3);
4013        v1 = t1;
4014        v2 = t2;
4015        cond = tcg_signed_cond(cond);
4016    }
4017
4018    tcg_debug_assert(cond == TCG_COND_EQ || cond == TCG_COND_GT);
4019    /* Expand directly; do not recurse.  */
4020    vec_gen_4(INDEX_op_cmp_vec, type, vece,
4021              tcgv_vec_arg(v0), tcgv_vec_arg(v1), tcgv_vec_arg(v2), cond);
4022
4023    if (t1) {
4024        tcg_temp_free_vec(t1);
4025        if (t2) {
4026            tcg_temp_free_vec(t2);
4027        }
4028    }
4029    return fixup & NEED_INV;
4030}
4031
4032static void expand_vec_cmp(TCGType type, unsigned vece, TCGv_vec v0,
4033                           TCGv_vec v1, TCGv_vec v2, TCGCond cond)
4034{
4035    if (expand_vec_cmp_noinv(type, vece, v0, v1, v2, cond)) {
4036        tcg_gen_not_vec(vece, v0, v0);
4037    }
4038}
4039
4040static void expand_vec_cmpsel(TCGType type, unsigned vece, TCGv_vec v0,
4041                              TCGv_vec c1, TCGv_vec c2,
4042                              TCGv_vec v3, TCGv_vec v4, TCGCond cond)
4043{
4044    TCGv_vec t = tcg_temp_new_vec(type);
4045
4046    if (expand_vec_cmp_noinv(type, vece, t, c1, c2, cond)) {
4047        /* Invert the sense of the compare by swapping arguments.  */
4048        TCGv_vec x;
4049        x = v3, v3 = v4, v4 = x;
4050    }
4051    vec_gen_4(INDEX_op_x86_vpblendvb_vec, type, vece,
4052              tcgv_vec_arg(v0), tcgv_vec_arg(v4),
4053              tcgv_vec_arg(v3), tcgv_vec_arg(t));
4054    tcg_temp_free_vec(t);
4055}
4056
4057void tcg_expand_vec_op(TCGOpcode opc, TCGType type, unsigned vece,
4058                       TCGArg a0, ...)
4059{
4060    va_list va;
4061    TCGArg a2;
4062    TCGv_vec v0, v1, v2, v3, v4;
4063
4064    va_start(va, a0);
4065    v0 = temp_tcgv_vec(arg_temp(a0));
4066    v1 = temp_tcgv_vec(arg_temp(va_arg(va, TCGArg)));
4067    a2 = va_arg(va, TCGArg);
4068
4069    switch (opc) {
4070    case INDEX_op_shli_vec:
4071    case INDEX_op_shri_vec:
4072        expand_vec_shi(type, vece, opc, v0, v1, a2);
4073        break;
4074
4075    case INDEX_op_sari_vec:
4076        expand_vec_sari(type, vece, v0, v1, a2);
4077        break;
4078
4079    case INDEX_op_rotli_vec:
4080        expand_vec_rotli(type, vece, v0, v1, a2);
4081        break;
4082
4083    case INDEX_op_rotls_vec:
4084        expand_vec_rotls(type, vece, v0, v1, temp_tcgv_i32(arg_temp(a2)));
4085        break;
4086
4087    case INDEX_op_rotlv_vec:
4088        v2 = temp_tcgv_vec(arg_temp(a2));
4089        expand_vec_rotv(type, vece, v0, v1, v2, false);
4090        break;
4091    case INDEX_op_rotrv_vec:
4092        v2 = temp_tcgv_vec(arg_temp(a2));
4093        expand_vec_rotv(type, vece, v0, v1, v2, true);
4094        break;
4095
4096    case INDEX_op_mul_vec:
4097        v2 = temp_tcgv_vec(arg_temp(a2));
4098        expand_vec_mul(type, vece, v0, v1, v2);
4099        break;
4100
4101    case INDEX_op_cmp_vec:
4102        v2 = temp_tcgv_vec(arg_temp(a2));
4103        expand_vec_cmp(type, vece, v0, v1, v2, va_arg(va, TCGArg));
4104        break;
4105
4106    case INDEX_op_cmpsel_vec:
4107        v2 = temp_tcgv_vec(arg_temp(a2));
4108        v3 = temp_tcgv_vec(arg_temp(va_arg(va, TCGArg)));
4109        v4 = temp_tcgv_vec(arg_temp(va_arg(va, TCGArg)));
4110        expand_vec_cmpsel(type, vece, v0, v1, v2, v3, v4, va_arg(va, TCGArg));
4111        break;
4112
4113    default:
4114        break;
4115    }
4116
4117    va_end(va);
4118}
4119
/*
 * Registers that tcg_target_qemu_prologue pushes on entry, in this
 * order, and pops in reverse order on exit.  The array length also
 * feeds PUSH_SIZE.  The register order recorded in the debug_frame
 * unwind tables must match this list.
 */
static const int tcg_target_callee_save_regs[] = {
#if TCG_TARGET_REG_BITS == 64
    TCG_REG_RBP,
    TCG_REG_RBX,
#if defined(_WIN64)
    /* Saved only when building for Win64. */
    TCG_REG_RDI,
    TCG_REG_RSI,
#endif
    TCG_REG_R12,
    TCG_REG_R13,
    TCG_REG_R14, /* Currently used for the global env. */
    TCG_REG_R15,
#else
    TCG_REG_EBP, /* Currently used for the global env. */
    TCG_REG_EBX,
    TCG_REG_ESI,
    TCG_REG_EDI,
#endif
};
4139
/* Compute frame size via macros, to share between tcg_target_qemu_prologue
   and tcg_register_jit.  */

/*
 * Bytes occupied by the pushed callee-saved registers plus one extra
 * register-sized slot (presumably the return address -- confirm).
 */
#define PUSH_SIZE \
    ((1 + ARRAY_SIZE(tcg_target_callee_save_regs)) \
     * (TCG_TARGET_REG_BITS / 8))

/*
 * Total frame: PUSH_SIZE, the static call argument area and the TCG
 * temporary buffer, rounded up to a multiple of TCG_TARGET_STACK_ALIGN
 * (the mask arithmetic requires that value to be a power of two).
 */
#define FRAME_SIZE \
    ((PUSH_SIZE \
      + TCG_STATIC_CALL_ARGS_SIZE \
      + CPU_TEMP_BUF_NLONGS * sizeof(long) \
      + TCG_TARGET_STACK_ALIGN - 1) \
     & ~(TCG_TARGET_STACK_ALIGN - 1))
4153
4154/* Generate global QEMU prologue and epilogue code */
4155static void tcg_target_qemu_prologue(TCGContext *s)
4156{
4157    int i, stack_addend;
4158
4159    /* TB prologue */
4160
4161    /* Reserve some stack space, also for TCG temps.  */
4162    stack_addend = FRAME_SIZE - PUSH_SIZE;
4163    tcg_set_frame(s, TCG_REG_CALL_STACK, TCG_STATIC_CALL_ARGS_SIZE,
4164                  CPU_TEMP_BUF_NLONGS * sizeof(long));
4165
4166    /* Save all callee saved registers.  */
4167    for (i = 0; i < ARRAY_SIZE(tcg_target_callee_save_regs); i++) {
4168        tcg_out_push(s, tcg_target_callee_save_regs[i]);
4169    }
4170
4171    if (!tcg_use_softmmu && guest_base) {
4172        int seg = setup_guest_base_seg();
4173        if (seg != 0) {
4174            x86_guest_base.seg = seg;
4175        } else if (guest_base == (int32_t)guest_base) {
4176            x86_guest_base.ofs = guest_base;
4177        } else {
4178            assert(TCG_TARGET_REG_BITS == 64);
4179            /* Choose R12 because, as a base, it requires a SIB byte. */
4180            x86_guest_base.index = TCG_REG_R12;
4181            tcg_out_movi(s, TCG_TYPE_PTR, x86_guest_base.index, guest_base);
4182            tcg_regset_set_reg(s->reserved_regs, x86_guest_base.index);
4183        }
4184    }
4185
4186    if (TCG_TARGET_REG_BITS == 32) {
4187        tcg_out_ld(s, TCG_TYPE_PTR, TCG_AREG0, TCG_REG_ESP,
4188                   (ARRAY_SIZE(tcg_target_callee_save_regs) + 1) * 4);
4189        tcg_out_addi(s, TCG_REG_ESP, -stack_addend);
4190        /* jmp *tb.  */
4191        tcg_out_modrm_offset(s, OPC_GRP5, EXT5_JMPN_Ev, TCG_REG_ESP,
4192                             (ARRAY_SIZE(tcg_target_callee_save_regs) + 2) * 4
4193                             + stack_addend);
4194    } else {
4195        tcg_out_mov(s, TCG_TYPE_PTR, TCG_AREG0, tcg_target_call_iarg_regs[0]);
4196        tcg_out_addi(s, TCG_REG_ESP, -stack_addend);
4197        /* jmp *tb.  */
4198        tcg_out_modrm(s, OPC_GRP5, EXT5_JMPN_Ev, tcg_target_call_iarg_regs[1]);
4199    }
4200
4201    /*
4202     * Return path for goto_ptr. Set return value to 0, a-la exit_tb,
4203     * and fall through to the rest of the epilogue.
4204     */
4205    tcg_code_gen_epilogue = tcg_splitwx_to_rx(s->code_ptr);
4206    tcg_out_movi(s, TCG_TYPE_REG, TCG_REG_EAX, 0);
4207
4208    /* TB epilogue */
4209    tb_ret_addr = tcg_splitwx_to_rx(s->code_ptr);
4210
4211    tcg_out_addi(s, TCG_REG_CALL_STACK, stack_addend);
4212
4213    if (have_avx2) {
4214        tcg_out_vex_opc(s, OPC_VZEROUPPER, 0, 0, 0, 0);
4215    }
4216    for (i = ARRAY_SIZE(tcg_target_callee_save_regs) - 1; i >= 0; i--) {
4217        tcg_out_pop(s, tcg_target_callee_save_regs[i]);
4218    }
4219    tcg_out_opc(s, OPC_RET, 0, 0, 0);
4220}
4221
/*
 * Start-of-translation-block hook (going by its name -- the caller is
 * not visible here).  This backend emits nothing for it.
 */
static void tcg_out_tb_start(TCGContext *s)
{
    /* nothing to do */
}
4226
4227static void tcg_out_nop_fill(tcg_insn_unit *p, int count)
4228{
4229    memset(p, 0x90, count);
4230}
4231
4232static void tcg_target_init(TCGContext *s)
4233{
4234    tcg_target_available_regs[TCG_TYPE_I32] = ALL_GENERAL_REGS;
4235    if (TCG_TARGET_REG_BITS == 64) {
4236        tcg_target_available_regs[TCG_TYPE_I64] = ALL_GENERAL_REGS;
4237    }
4238    if (have_avx1) {
4239        tcg_target_available_regs[TCG_TYPE_V64] = ALL_VECTOR_REGS;
4240        tcg_target_available_regs[TCG_TYPE_V128] = ALL_VECTOR_REGS;
4241    }
4242    if (have_avx2) {
4243        tcg_target_available_regs[TCG_TYPE_V256] = ALL_VECTOR_REGS;
4244    }
4245
4246    tcg_target_call_clobber_regs = ALL_VECTOR_REGS;
4247    tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_EAX);
4248    tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_EDX);
4249    tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_ECX);
4250    if (TCG_TARGET_REG_BITS == 64) {
4251#if !defined(_WIN64)
4252        tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_RDI);
4253        tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_RSI);
4254#endif
4255        tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_R8);
4256        tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_R9);
4257        tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_R10);
4258        tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_R11);
4259    }
4260
4261    s->reserved_regs = 0;
4262    tcg_regset_set_reg(s->reserved_regs, TCG_REG_CALL_STACK);
4263    tcg_regset_set_reg(s->reserved_regs, TCG_TMP_VEC);
4264#ifdef _WIN64
4265    /* These are call saved, and we don't save them, so don't use them. */
4266    tcg_regset_set_reg(s->reserved_regs, TCG_REG_XMM6);
4267    tcg_regset_set_reg(s->reserved_regs, TCG_REG_XMM7);
4268    tcg_regset_set_reg(s->reserved_regs, TCG_REG_XMM8);
4269    tcg_regset_set_reg(s->reserved_regs, TCG_REG_XMM9);
4270    tcg_regset_set_reg(s->reserved_regs, TCG_REG_XMM10);
4271    tcg_regset_set_reg(s->reserved_regs, TCG_REG_XMM11);
4272    tcg_regset_set_reg(s->reserved_regs, TCG_REG_XMM12);
4273    tcg_regset_set_reg(s->reserved_regs, TCG_REG_XMM13);
4274    tcg_regset_set_reg(s->reserved_regs, TCG_REG_XMM14);
4275    tcg_regset_set_reg(s->reserved_regs, TCG_REG_XMM15);
4276#endif
4277}
4278
/*
 * Layout of the unwind information handed to tcg_register_jit_int:
 * the common header followed by this backend's FDE instruction bytes.
 */
typedef struct {
    DebugFrameHeader h;
    /* Opcode, register number, then a 2-byte uleb128 FRAME_SIZE. */
    uint8_t fde_def_cfa[4];
    /* Up to seven (opcode, offset) byte pairs for saved registers. */
    uint8_t fde_reg_ofs[14];
} DebugFrame;

/* We're expecting a 2 byte uleb128 encoded value.  */
QEMU_BUILD_BUG_ON(FRAME_SIZE >= (1 << 14));
4287
/*
 * Static unwind description of the prologue's stack frame, one variant
 * per register width.  It is only built for ELF hosts; defining
 * ELF_HOST_MACHINE also enables tcg_register_jit.
 */
#if !defined(__ELF__)
    /* Host machine without ELF. */
#elif TCG_TARGET_REG_BITS == 64
#define ELF_HOST_MACHINE EM_X86_64
static const DebugFrame debug_frame = {
    .h.cie.len = sizeof(DebugFrameCIE)-4, /* length after .len member */
    .h.cie.id = -1,
    .h.cie.version = 1,
    .h.cie.code_align = 1,
    .h.cie.data_align = 0x78,             /* sleb128 -8 */
    .h.cie.return_column = 16,

    /* Total FDE size does not include the "len" member.  */
    .h.fde.len = sizeof(DebugFrame) - offsetof(DebugFrame, h.fde.cie_offset),

    /*
     * FRAME_SIZE is split into its low 7 bits (with the continuation
     * bit 0x80 set) and the remaining high bits.
     */
    .fde_def_cfa = {
        12, 7,                          /* DW_CFA_def_cfa %rsp, ... */
        (FRAME_SIZE & 0x7f) | 0x80,     /* ... uleb128 FRAME_SIZE */
        (FRAME_SIZE >> 7)
    },
    /* Offsets are in units of data_align (-8 bytes). */
    .fde_reg_ofs = {
        0x90, 1,                        /* DW_CFA_offset, %rip, -8 */
        /* The following ordering must match tcg_target_callee_save_regs.  */
        0x86, 2,                        /* DW_CFA_offset, %rbp, -16 */
        0x83, 3,                        /* DW_CFA_offset, %rbx, -24 */
        0x8c, 4,                        /* DW_CFA_offset, %r12, -32 */
        0x8d, 5,                        /* DW_CFA_offset, %r13, -40 */
        0x8e, 6,                        /* DW_CFA_offset, %r14, -48 */
        0x8f, 7,                        /* DW_CFA_offset, %r15, -56 */
    }
};
#else
#define ELF_HOST_MACHINE EM_386
static const DebugFrame debug_frame = {
    .h.cie.len = sizeof(DebugFrameCIE)-4, /* length after .len member */
    .h.cie.id = -1,
    .h.cie.version = 1,
    .h.cie.code_align = 1,
    .h.cie.data_align = 0x7c,             /* sleb128 -4 */
    .h.cie.return_column = 8,

    /* Total FDE size does not include the "len" member.  */
    .h.fde.len = sizeof(DebugFrame) - offsetof(DebugFrame, h.fde.cie_offset),

    /*
     * FRAME_SIZE is split into its low 7 bits (with the continuation
     * bit 0x80 set) and the remaining high bits.
     */
    .fde_def_cfa = {
        12, 4,                          /* DW_CFA_def_cfa %esp, ... */
        (FRAME_SIZE & 0x7f) | 0x80,     /* ... uleb128 FRAME_SIZE */
        (FRAME_SIZE >> 7)
    },
    /*
     * Offsets are in units of data_align (-4 bytes).  Only 10 of the
     * 14 bytes are given here; the rest are zero-initialized.
     */
    .fde_reg_ofs = {
        0x88, 1,                        /* DW_CFA_offset, %eip, -4 */
        /* The following ordering must match tcg_target_callee_save_regs.  */
        0x85, 2,                        /* DW_CFA_offset, %ebp, -8 */
        0x83, 3,                        /* DW_CFA_offset, %ebx, -12 */
        0x86, 4,                        /* DW_CFA_offset, %esi, -16 */
        0x87, 5,                        /* DW_CFA_offset, %edi, -20 */
    }
};
#endif
4347
4348#if defined(ELF_HOST_MACHINE)
/*
 * Register the code buffer @buf of @buf_size bytes by forwarding it,
 * together with this backend's static debug_frame, to
 * tcg_register_jit_int().
 */
void tcg_register_jit(const void *buf, size_t buf_size)
{
    tcg_register_jit_int(buf, buf_size, &debug_frame, sizeof(debug_frame));
}
4353#endif
4354