xref: /qemu/tcg/i386/tcg-target.c.inc (revision 93c9aeed)
1/*
2 * Tiny Code Generator for QEMU
3 *
4 * Copyright (c) 2008 Fabrice Bellard
5 *
6 * Permission is hereby granted, free of charge, to any person obtaining a copy
7 * of this software and associated documentation files (the "Software"), to deal
8 * in the Software without restriction, including without limitation the rights
9 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
10 * copies of the Software, and to permit persons to whom the Software is
11 * furnished to do so, subject to the following conditions:
12 *
13 * The above copyright notice and this permission notice shall be included in
14 * all copies or substantial portions of the Software.
15 *
16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
19 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
22 * THE SOFTWARE.
23 */
24
25#include "../tcg-ldst.c.inc"
26#include "../tcg-pool.c.inc"
27
28#ifdef CONFIG_DEBUG_TCG
29static const char * const tcg_target_reg_names[TCG_TARGET_NB_REGS] = {
30#if TCG_TARGET_REG_BITS == 64
31    "%rax", "%rcx", "%rdx", "%rbx", "%rsp", "%rbp", "%rsi", "%rdi",
32#else
33    "%eax", "%ecx", "%edx", "%ebx", "%esp", "%ebp", "%esi", "%edi",
34#endif
35    "%r8",  "%r9",  "%r10", "%r11", "%r12", "%r13", "%r14", "%r15",
36    "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7",
37#if TCG_TARGET_REG_BITS == 64
38    "%xmm8", "%xmm9", "%xmm10", "%xmm11",
39    "%xmm12", "%xmm13", "%xmm14", "%xmm15",
40#endif
41};
42#endif
43
44static const int tcg_target_reg_alloc_order[] = {
45#if TCG_TARGET_REG_BITS == 64
46    TCG_REG_RBP,
47    TCG_REG_RBX,
48    TCG_REG_R12,
49    TCG_REG_R13,
50    TCG_REG_R14,
51    TCG_REG_R15,
52    TCG_REG_R10,
53    TCG_REG_R11,
54    TCG_REG_R9,
55    TCG_REG_R8,
56    TCG_REG_RCX,
57    TCG_REG_RDX,
58    TCG_REG_RSI,
59    TCG_REG_RDI,
60    TCG_REG_RAX,
61#else
62    TCG_REG_EBX,
63    TCG_REG_ESI,
64    TCG_REG_EDI,
65    TCG_REG_EBP,
66    TCG_REG_ECX,
67    TCG_REG_EDX,
68    TCG_REG_EAX,
69#endif
70    TCG_REG_XMM0,
71    TCG_REG_XMM1,
72    TCG_REG_XMM2,
73    TCG_REG_XMM3,
74    TCG_REG_XMM4,
75    TCG_REG_XMM5,
76#ifndef _WIN64
77    /* The Win64 ABI has xmm6-xmm15 as callee-saves, and we do not save
78       any of them.  Therefore only allow xmm0-xmm5 to be allocated.  */
79    TCG_REG_XMM6,
80    TCG_REG_XMM7,
81#if TCG_TARGET_REG_BITS == 64
82    TCG_REG_XMM8,
83    TCG_REG_XMM9,
84    TCG_REG_XMM10,
85    TCG_REG_XMM11,
86    TCG_REG_XMM12,
87    TCG_REG_XMM13,
88    TCG_REG_XMM14,
89    TCG_REG_XMM15,
90#endif
91#endif
92};
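/* Callee-saved registers are listed first so that values allocated to them
   survive calls to helper functions without having to be spilled; the
   call-clobbered registers come last.  */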
93
94static const int tcg_target_call_iarg_regs[] = {
95#if TCG_TARGET_REG_BITS == 64
96#if defined(_WIN64)
97    TCG_REG_RCX,
98    TCG_REG_RDX,
99#else
100    TCG_REG_RDI,
101    TCG_REG_RSI,
102    TCG_REG_RDX,
103    TCG_REG_RCX,
104#endif
105    TCG_REG_R8,
106    TCG_REG_R9,
107#else
108    /* 32-bit mode uses the stack-based calling convention (GCC default). */
109#endif
110};
111
112static const int tcg_target_call_oarg_regs[] = {
113    TCG_REG_EAX,
114#if TCG_TARGET_REG_BITS == 32
115    TCG_REG_EDX
116#endif
117};
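/* On i386, a 64-bit return value is delivered in the %edx:%eax pair.  */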
118
119/* Constants we accept.  */
120#define TCG_CT_CONST_S32 0x100
121#define TCG_CT_CONST_U32 0x200
122#define TCG_CT_CONST_I32 0x400
123#define TCG_CT_CONST_WSZ 0x800
124
125/* Registers used with the L constraint, which are the first two argument
126   registers on x86_64, and two arbitrary call-clobbered registers on
127   i386.  */
128#if TCG_TARGET_REG_BITS == 64
129# define TCG_REG_L0 tcg_target_call_iarg_regs[0]
130# define TCG_REG_L1 tcg_target_call_iarg_regs[1]
131#else
132# define TCG_REG_L0 TCG_REG_EAX
133# define TCG_REG_L1 TCG_REG_EDX
134#endif
135
136#define ALL_BYTEH_REGS         0x0000000fu
137#if TCG_TARGET_REG_BITS == 64
138# define ALL_GENERAL_REGS      0x0000ffffu
139# define ALL_VECTOR_REGS       0xffff0000u
140# define ALL_BYTEL_REGS        ALL_GENERAL_REGS
141#else
142# define ALL_GENERAL_REGS      0x000000ffu
143# define ALL_VECTOR_REGS       0x00ff0000u
144# define ALL_BYTEL_REGS        ALL_BYTEH_REGS
145#endif
146#ifdef CONFIG_SOFTMMU
147# define SOFTMMU_RESERVE_REGS  ((1 << TCG_REG_L0) | (1 << TCG_REG_L1))
148#else
149# define SOFTMMU_RESERVE_REGS  0
150#endif
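/* For example, on System V x86_64 TCG_REG_L0/L1 are %rdi/%rsi; this mask is
   subtracted from the qemu_ld/st operand constraints below so that the
   softmmu TLB lookup always has two scratch registers available.  */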
151
152/* The host compiler should supply <cpuid.h> to enable runtime feature
153   detection, as we're not going to go so far as our own inline assembly.
154   If it is not available, default values will be assumed.  */
155#if defined(CONFIG_CPUID_H)
156#include "qemu/cpuid.h"
157#endif
158
159/* For 64-bit, we always know that CMOV is available.  */
160#if TCG_TARGET_REG_BITS == 64
161# define have_cmov 1
162#elif defined(CONFIG_CPUID_H)
163static bool have_cmov;
164#else
165# define have_cmov 0
166#endif
167
168/* We need these symbols in tcg-target.h, and we can't properly conditionalize
169   them there.  Therefore we always define the variables.  */
170bool have_bmi1;
171bool have_popcnt;
172bool have_avx1;
173bool have_avx2;
174bool have_movbe;
175
176#ifdef CONFIG_CPUID_H
177static bool have_bmi2;
178static bool have_lzcnt;
179#else
180# define have_bmi2 0
181# define have_lzcnt 0
182#endif
183
184static const tcg_insn_unit *tb_ret_addr;
185
186static bool patch_reloc(tcg_insn_unit *code_ptr, int type,
187                        intptr_t value, intptr_t addend)
188{
189    value += addend;
190    switch (type) {
191    case R_386_PC32:
192        value -= (uintptr_t)tcg_splitwx_to_rx(code_ptr);
193        if (value != (int32_t)value) {
194            return false;
195        }
196        /* FALLTHRU */
197    case R_386_32:
198        tcg_patch32(code_ptr, value);
199        break;
200    case R_386_PC8:
201        value -= (uintptr_t)tcg_splitwx_to_rx(code_ptr);
202        if (value != (int8_t)value) {
203            return false;
204        }
205        tcg_patch8(code_ptr, value);
206        break;
207    default:
208        tcg_abort();
209    }
210    return true;
211}
212
213/* test if a constant matches the constraint */
214static bool tcg_target_const_match(int64_t val, TCGType type, int ct)
215{
216    if (ct & TCG_CT_CONST) {
217        return 1;
218    }
219    if (type == TCG_TYPE_I32) {
220        if (ct & (TCG_CT_CONST_S32 | TCG_CT_CONST_U32 | TCG_CT_CONST_I32)) {
221            return 1;
222        }
223    } else {
224        if ((ct & TCG_CT_CONST_S32) && val == (int32_t)val) {
225            return 1;
226        }
227        if ((ct & TCG_CT_CONST_U32) && val == (uint32_t)val) {
228            return 1;
229        }
230        if ((ct & TCG_CT_CONST_I32) && ~val == (int32_t)~val) {
231            return 1;
232        }
233    }
234    if ((ct & TCG_CT_CONST_WSZ) && val == (type == TCG_TYPE_I32 ? 32 : 64)) {
235        return 1;
236    }
237    return 0;
238}
239
240# define LOWREGMASK(x)	((x) & 7)
241
242#define P_EXT		0x100		/* 0x0f opcode prefix */
243#define P_EXT38         0x200           /* 0x0f 0x38 opcode prefix */
244#define P_DATA16        0x400           /* 0x66 opcode prefix */
245#define P_VEXW          0x1000          /* Set VEX.W = 1 */
246#if TCG_TARGET_REG_BITS == 64
247# define P_REXW         P_VEXW          /* Set REX.W = 1; match VEXW */
248# define P_REXB_R       0x2000          /* REG field as byte register */
249# define P_REXB_RM      0x4000          /* R/M field as byte register */
250# define P_GS           0x8000          /* gs segment override */
251#else
252# define P_REXW		0
253# define P_REXB_R	0
254# define P_REXB_RM	0
255# define P_GS           0
256#endif
257#define P_EXT3A         0x10000         /* 0x0f 0x3a opcode prefix */
258#define P_SIMDF3        0x20000         /* 0xf3 opcode prefix */
259#define P_SIMDF2        0x40000         /* 0xf2 opcode prefix */
260#define P_VEXL          0x80000         /* Set VEX.L = 1 */
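/* For example, OPC_MOVZWL below is 0xb7 | P_EXT: tcg_out_opc emits the 0x0f
   escape byte followed by 0xb7.  P_DATA16 likewise adds a 0x66 prefix, or
   selects VEX.pp = 1 when the VEX form is used.  */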
261
262#define OPC_ARITH_EvIz	(0x81)
263#define OPC_ARITH_EvIb	(0x83)
264#define OPC_ARITH_GvEv	(0x03)		/* ... plus (ARITH_FOO << 3) */
265#define OPC_ANDN        (0xf2 | P_EXT38)
266#define OPC_ADD_GvEv	(OPC_ARITH_GvEv | (ARITH_ADD << 3))
267#define OPC_AND_GvEv    (OPC_ARITH_GvEv | (ARITH_AND << 3))
268#define OPC_BLENDPS     (0x0c | P_EXT3A | P_DATA16)
269#define OPC_BSF         (0xbc | P_EXT)
270#define OPC_BSR         (0xbd | P_EXT)
271#define OPC_BSWAP	(0xc8 | P_EXT)
272#define OPC_CALL_Jz	(0xe8)
273#define OPC_CMOVCC      (0x40 | P_EXT)  /* ... plus condition code */
274#define OPC_CMP_GvEv	(OPC_ARITH_GvEv | (ARITH_CMP << 3))
275#define OPC_DEC_r32	(0x48)
276#define OPC_IMUL_GvEv	(0xaf | P_EXT)
277#define OPC_IMUL_GvEvIb	(0x6b)
278#define OPC_IMUL_GvEvIz	(0x69)
279#define OPC_INC_r32	(0x40)
280#define OPC_JCC_long	(0x80 | P_EXT)	/* ... plus condition code */
281#define OPC_JCC_short	(0x70)		/* ... plus condition code */
282#define OPC_JMP_long	(0xe9)
283#define OPC_JMP_short	(0xeb)
284#define OPC_LEA         (0x8d)
285#define OPC_LZCNT       (0xbd | P_EXT | P_SIMDF3)
286#define OPC_MOVB_EvGv	(0x88)		/* stores, more or less */
287#define OPC_MOVL_EvGv	(0x89)		/* stores, more or less */
288#define OPC_MOVL_GvEv	(0x8b)		/* loads, more or less */
289#define OPC_MOVB_EvIz   (0xc6)
290#define OPC_MOVL_EvIz	(0xc7)
291#define OPC_MOVL_Iv     (0xb8)
292#define OPC_MOVBE_GyMy  (0xf0 | P_EXT38)
293#define OPC_MOVBE_MyGy  (0xf1 | P_EXT38)
294#define OPC_MOVD_VyEy   (0x6e | P_EXT | P_DATA16)
295#define OPC_MOVD_EyVy   (0x7e | P_EXT | P_DATA16)
296#define OPC_MOVDDUP     (0x12 | P_EXT | P_SIMDF2)
297#define OPC_MOVDQA_VxWx (0x6f | P_EXT | P_DATA16)
298#define OPC_MOVDQA_WxVx (0x7f | P_EXT | P_DATA16)
299#define OPC_MOVDQU_VxWx (0x6f | P_EXT | P_SIMDF3)
300#define OPC_MOVDQU_WxVx (0x7f | P_EXT | P_SIMDF3)
301#define OPC_MOVQ_VqWq   (0x7e | P_EXT | P_SIMDF3)
302#define OPC_MOVQ_WqVq   (0xd6 | P_EXT | P_DATA16)
303#define OPC_MOVSBL	(0xbe | P_EXT)
304#define OPC_MOVSWL	(0xbf | P_EXT)
305#define OPC_MOVSLQ	(0x63 | P_REXW)
306#define OPC_MOVZBL	(0xb6 | P_EXT)
307#define OPC_MOVZWL	(0xb7 | P_EXT)
308#define OPC_PABSB       (0x1c | P_EXT38 | P_DATA16)
309#define OPC_PABSW       (0x1d | P_EXT38 | P_DATA16)
310#define OPC_PABSD       (0x1e | P_EXT38 | P_DATA16)
311#define OPC_PACKSSDW    (0x6b | P_EXT | P_DATA16)
312#define OPC_PACKSSWB    (0x63 | P_EXT | P_DATA16)
313#define OPC_PACKUSDW    (0x2b | P_EXT38 | P_DATA16)
314#define OPC_PACKUSWB    (0x67 | P_EXT | P_DATA16)
315#define OPC_PADDB       (0xfc | P_EXT | P_DATA16)
316#define OPC_PADDW       (0xfd | P_EXT | P_DATA16)
317#define OPC_PADDD       (0xfe | P_EXT | P_DATA16)
318#define OPC_PADDQ       (0xd4 | P_EXT | P_DATA16)
319#define OPC_PADDSB      (0xec | P_EXT | P_DATA16)
320#define OPC_PADDSW      (0xed | P_EXT | P_DATA16)
321#define OPC_PADDUB      (0xdc | P_EXT | P_DATA16)
322#define OPC_PADDUW      (0xdd | P_EXT | P_DATA16)
323#define OPC_PAND        (0xdb | P_EXT | P_DATA16)
324#define OPC_PANDN       (0xdf | P_EXT | P_DATA16)
325#define OPC_PBLENDW     (0x0e | P_EXT3A | P_DATA16)
326#define OPC_PCMPEQB     (0x74 | P_EXT | P_DATA16)
327#define OPC_PCMPEQW     (0x75 | P_EXT | P_DATA16)
328#define OPC_PCMPEQD     (0x76 | P_EXT | P_DATA16)
329#define OPC_PCMPEQQ     (0x29 | P_EXT38 | P_DATA16)
330#define OPC_PCMPGTB     (0x64 | P_EXT | P_DATA16)
331#define OPC_PCMPGTW     (0x65 | P_EXT | P_DATA16)
332#define OPC_PCMPGTD     (0x66 | P_EXT | P_DATA16)
333#define OPC_PCMPGTQ     (0x37 | P_EXT38 | P_DATA16)
334#define OPC_PMAXSB      (0x3c | P_EXT38 | P_DATA16)
335#define OPC_PMAXSW      (0xee | P_EXT | P_DATA16)
336#define OPC_PMAXSD      (0x3d | P_EXT38 | P_DATA16)
337#define OPC_PMAXUB      (0xde | P_EXT | P_DATA16)
338#define OPC_PMAXUW      (0x3e | P_EXT38 | P_DATA16)
339#define OPC_PMAXUD      (0x3f | P_EXT38 | P_DATA16)
340#define OPC_PMINSB      (0x38 | P_EXT38 | P_DATA16)
341#define OPC_PMINSW      (0xea | P_EXT | P_DATA16)
342#define OPC_PMINSD      (0x39 | P_EXT38 | P_DATA16)
343#define OPC_PMINUB      (0xda | P_EXT | P_DATA16)
344#define OPC_PMINUW      (0x3a | P_EXT38 | P_DATA16)
345#define OPC_PMINUD      (0x3b | P_EXT38 | P_DATA16)
346#define OPC_PMOVSXBW    (0x20 | P_EXT38 | P_DATA16)
347#define OPC_PMOVSXWD    (0x23 | P_EXT38 | P_DATA16)
348#define OPC_PMOVSXDQ    (0x25 | P_EXT38 | P_DATA16)
349#define OPC_PMOVZXBW    (0x30 | P_EXT38 | P_DATA16)
350#define OPC_PMOVZXWD    (0x33 | P_EXT38 | P_DATA16)
351#define OPC_PMOVZXDQ    (0x35 | P_EXT38 | P_DATA16)
352#define OPC_PMULLW      (0xd5 | P_EXT | P_DATA16)
353#define OPC_PMULLD      (0x40 | P_EXT38 | P_DATA16)
354#define OPC_POR         (0xeb | P_EXT | P_DATA16)
355#define OPC_PSHUFB      (0x00 | P_EXT38 | P_DATA16)
356#define OPC_PSHUFD      (0x70 | P_EXT | P_DATA16)
357#define OPC_PSHUFLW     (0x70 | P_EXT | P_SIMDF2)
358#define OPC_PSHUFHW     (0x70 | P_EXT | P_SIMDF3)
359#define OPC_PSHIFTW_Ib  (0x71 | P_EXT | P_DATA16) /* /2 /6 /4 */
360#define OPC_PSHIFTD_Ib  (0x72 | P_EXT | P_DATA16) /* /2 /6 /4 */
361#define OPC_PSHIFTQ_Ib  (0x73 | P_EXT | P_DATA16) /* /2 /6 /4 */
362#define OPC_PSLLW       (0xf1 | P_EXT | P_DATA16)
363#define OPC_PSLLD       (0xf2 | P_EXT | P_DATA16)
364#define OPC_PSLLQ       (0xf3 | P_EXT | P_DATA16)
365#define OPC_PSRAW       (0xe1 | P_EXT | P_DATA16)
366#define OPC_PSRAD       (0xe2 | P_EXT | P_DATA16)
367#define OPC_PSRLW       (0xd1 | P_EXT | P_DATA16)
368#define OPC_PSRLD       (0xd2 | P_EXT | P_DATA16)
369#define OPC_PSRLQ       (0xd3 | P_EXT | P_DATA16)
370#define OPC_PSUBB       (0xf8 | P_EXT | P_DATA16)
371#define OPC_PSUBW       (0xf9 | P_EXT | P_DATA16)
372#define OPC_PSUBD       (0xfa | P_EXT | P_DATA16)
373#define OPC_PSUBQ       (0xfb | P_EXT | P_DATA16)
374#define OPC_PSUBSB      (0xe8 | P_EXT | P_DATA16)
375#define OPC_PSUBSW      (0xe9 | P_EXT | P_DATA16)
376#define OPC_PSUBUB      (0xd8 | P_EXT | P_DATA16)
377#define OPC_PSUBUW      (0xd9 | P_EXT | P_DATA16)
378#define OPC_PUNPCKLBW   (0x60 | P_EXT | P_DATA16)
379#define OPC_PUNPCKLWD   (0x61 | P_EXT | P_DATA16)
380#define OPC_PUNPCKLDQ   (0x62 | P_EXT | P_DATA16)
381#define OPC_PUNPCKLQDQ  (0x6c | P_EXT | P_DATA16)
382#define OPC_PUNPCKHBW   (0x68 | P_EXT | P_DATA16)
383#define OPC_PUNPCKHWD   (0x69 | P_EXT | P_DATA16)
384#define OPC_PUNPCKHDQ   (0x6a | P_EXT | P_DATA16)
385#define OPC_PUNPCKHQDQ  (0x6d | P_EXT | P_DATA16)
386#define OPC_PXOR        (0xef | P_EXT | P_DATA16)
387#define OPC_POP_r32	(0x58)
388#define OPC_POPCNT      (0xb8 | P_EXT | P_SIMDF3)
389#define OPC_PUSH_r32	(0x50)
390#define OPC_PUSH_Iv	(0x68)
391#define OPC_PUSH_Ib	(0x6a)
392#define OPC_RET		(0xc3)
393#define OPC_SETCC	(0x90 | P_EXT | P_REXB_RM) /* ... plus cc */
394#define OPC_SHIFT_1	(0xd1)
395#define OPC_SHIFT_Ib	(0xc1)
396#define OPC_SHIFT_cl	(0xd3)
397#define OPC_SARX        (0xf7 | P_EXT38 | P_SIMDF3)
398#define OPC_SHUFPS      (0xc6 | P_EXT)
399#define OPC_SHLX        (0xf7 | P_EXT38 | P_DATA16)
400#define OPC_SHRX        (0xf7 | P_EXT38 | P_SIMDF2)
401#define OPC_SHRD_Ib     (0xac | P_EXT)
402#define OPC_TESTL	(0x85)
403#define OPC_TZCNT       (0xbc | P_EXT | P_SIMDF3)
404#define OPC_UD2         (0x0b | P_EXT)
405#define OPC_VPBLENDD    (0x02 | P_EXT3A | P_DATA16)
406#define OPC_VPBLENDVB   (0x4c | P_EXT3A | P_DATA16)
407#define OPC_VPINSRB     (0x20 | P_EXT3A | P_DATA16)
408#define OPC_VPINSRW     (0xc4 | P_EXT | P_DATA16)
409#define OPC_VBROADCASTSS (0x18 | P_EXT38 | P_DATA16)
410#define OPC_VBROADCASTSD (0x19 | P_EXT38 | P_DATA16)
411#define OPC_VPBROADCASTB (0x78 | P_EXT38 | P_DATA16)
412#define OPC_VPBROADCASTW (0x79 | P_EXT38 | P_DATA16)
413#define OPC_VPBROADCASTD (0x58 | P_EXT38 | P_DATA16)
414#define OPC_VPBROADCASTQ (0x59 | P_EXT38 | P_DATA16)
415#define OPC_VPERMQ      (0x00 | P_EXT3A | P_DATA16 | P_VEXW)
416#define OPC_VPERM2I128  (0x46 | P_EXT3A | P_DATA16 | P_VEXL)
417#define OPC_VPSLLVD     (0x47 | P_EXT38 | P_DATA16)
418#define OPC_VPSLLVQ     (0x47 | P_EXT38 | P_DATA16 | P_VEXW)
419#define OPC_VPSRAVD     (0x46 | P_EXT38 | P_DATA16)
420#define OPC_VPSRLVD     (0x45 | P_EXT38 | P_DATA16)
421#define OPC_VPSRLVQ     (0x45 | P_EXT38 | P_DATA16 | P_VEXW)
422#define OPC_VZEROUPPER  (0x77 | P_EXT)
423#define OPC_XCHG_ax_r32	(0x90)
424
425#define OPC_GRP3_Eb     (0xf6)
426#define OPC_GRP3_Ev     (0xf7)
427#define OPC_GRP5        (0xff)
428#define OPC_GRP14       (0x73 | P_EXT | P_DATA16)
429
430/* Group 1 opcode extensions for 0x80-0x83.
431   These are also used as modifiers for OPC_ARITH.  */
432#define ARITH_ADD 0
433#define ARITH_OR  1
434#define ARITH_ADC 2
435#define ARITH_SBB 3
436#define ARITH_AND 4
437#define ARITH_SUB 5
438#define ARITH_XOR 6
439#define ARITH_CMP 7
440
441/* Group 2 opcode extensions for 0xc0, 0xc1, 0xd0-0xd3.  */
442#define SHIFT_ROL 0
443#define SHIFT_ROR 1
444#define SHIFT_SHL 4
445#define SHIFT_SHR 5
446#define SHIFT_SAR 7
447
448/* Group 3 opcode extensions for 0xf6, 0xf7.  To be used with OPC_GRP3.  */
449#define EXT3_TESTi 0
450#define EXT3_NOT   2
451#define EXT3_NEG   3
452#define EXT3_MUL   4
453#define EXT3_IMUL  5
454#define EXT3_DIV   6
455#define EXT3_IDIV  7
456
457/* Group 5 opcode extensions for 0xff.  To be used with OPC_GRP5.  */
458#define EXT5_INC_Ev	0
459#define EXT5_DEC_Ev	1
460#define EXT5_CALLN_Ev	2
461#define EXT5_JMPN_Ev	4
462
463/* Condition codes to be added to OPC_JCC_{long,short}.  */
464#define JCC_JMP (-1)
465#define JCC_JO  0x0
466#define JCC_JNO 0x1
467#define JCC_JB  0x2
468#define JCC_JAE 0x3
469#define JCC_JE  0x4
470#define JCC_JNE 0x5
471#define JCC_JBE 0x6
472#define JCC_JA  0x7
473#define JCC_JS  0x8
474#define JCC_JNS 0x9
475#define JCC_JP  0xa
476#define JCC_JNP 0xb
477#define JCC_JL  0xc
478#define JCC_JGE 0xd
479#define JCC_JLE 0xe
480#define JCC_JG  0xf
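/* For example, JCC_JE (0x4) gives 0x74 with OPC_JCC_short (je rel8) and
   0x0f 0x84 with OPC_JCC_long (je rel32).  */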
481
482static const uint8_t tcg_cond_to_jcc[] = {
483    [TCG_COND_EQ] = JCC_JE,
484    [TCG_COND_NE] = JCC_JNE,
485    [TCG_COND_LT] = JCC_JL,
486    [TCG_COND_GE] = JCC_JGE,
487    [TCG_COND_LE] = JCC_JLE,
488    [TCG_COND_GT] = JCC_JG,
489    [TCG_COND_LTU] = JCC_JB,
490    [TCG_COND_GEU] = JCC_JAE,
491    [TCG_COND_LEU] = JCC_JBE,
492    [TCG_COND_GTU] = JCC_JA,
493};
494
495#if TCG_TARGET_REG_BITS == 64
496static void tcg_out_opc(TCGContext *s, int opc, int r, int rm, int x)
497{
498    int rex;
499
500    if (opc & P_GS) {
501        tcg_out8(s, 0x65);
502    }
503    if (opc & P_DATA16) {
504        /* We should never be asking for both 16-bit and 64-bit operation.  */
505        tcg_debug_assert((opc & P_REXW) == 0);
506        tcg_out8(s, 0x66);
507    }
508    if (opc & P_SIMDF3) {
509        tcg_out8(s, 0xf3);
510    } else if (opc & P_SIMDF2) {
511        tcg_out8(s, 0xf2);
512    }
513
514    rex = 0;
515    rex |= (opc & P_REXW) ? 0x8 : 0x0;  /* REX.W */
516    rex |= (r & 8) >> 1;                /* REX.R */
517    rex |= (x & 8) >> 2;                /* REX.X */
518    rex |= (rm & 8) >> 3;               /* REX.B */
519
520    /* P_REXB_{R,RM} indicates that the given register is the low byte.
521       For %[abcd]l we need no REX prefix, but for %{si,di,bp,sp}l we do,
522       as otherwise the encoding indicates %[abcd]h.  Note that the values
523       that are ORed in merely indicate that the REX byte must be present;
524       those bits get discarded in output.  */
525    rex |= opc & (r >= 4 ? P_REXB_R : 0);
526    rex |= opc & (rm >= 4 ? P_REXB_RM : 0);
527
528    if (rex) {
529        tcg_out8(s, (uint8_t)(rex | 0x40));
530    }
531
532    if (opc & (P_EXT | P_EXT38 | P_EXT3A)) {
533        tcg_out8(s, 0x0f);
534        if (opc & P_EXT38) {
535            tcg_out8(s, 0x38);
536        } else if (opc & P_EXT3A) {
537            tcg_out8(s, 0x3a);
538        }
539    }
540
541    tcg_out8(s, opc);
542}
543#else
544static void tcg_out_opc(TCGContext *s, int opc)
545{
546    if (opc & P_DATA16) {
547        tcg_out8(s, 0x66);
548    }
549    if (opc & P_SIMDF3) {
550        tcg_out8(s, 0xf3);
551    } else if (opc & P_SIMDF2) {
552        tcg_out8(s, 0xf2);
553    }
554    if (opc & (P_EXT | P_EXT38 | P_EXT3A)) {
555        tcg_out8(s, 0x0f);
556        if (opc & P_EXT38) {
557            tcg_out8(s, 0x38);
558        } else if (opc & P_EXT3A) {
559            tcg_out8(s, 0x3a);
560        }
561    }
562    tcg_out8(s, opc);
563}
564/* Discard the register arguments to tcg_out_opc early, so as not to penalize
565   the 32-bit compilation paths.  This method works with all versions of gcc,
566   whereas relying on optimization to remove them might not.  */
567#define tcg_out_opc(s, opc, r, rm, x)  (tcg_out_opc)(s, opc)
568#endif
569
570static void tcg_out_modrm(TCGContext *s, int opc, int r, int rm)
571{
572    tcg_out_opc(s, opc, r, rm, 0);
573    tcg_out8(s, 0xc0 | (LOWREGMASK(r) << 3) | LOWREGMASK(rm));
574}
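/* As an illustration, tcg_out_modrm(s, OPC_MOVL_GvEv | P_REXW, TCG_REG_RAX,
   TCG_REG_R12) emits 49 8b c4, i.e. "movq %r12, %rax": REX.W|REX.B, the 0x8b
   opcode, and ModRM 0xc4 (mod = 3, reg = rax, rm = r12 & 7).  */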
575
576static void tcg_out_vex_opc(TCGContext *s, int opc, int r, int v,
577                            int rm, int index)
578{
579    int tmp;
580
581    /* Use the two byte form if possible, which cannot encode
582       VEX.W, VEX.B, VEX.X, or an m-mmmm field other than P_EXT.  */
583    if ((opc & (P_EXT | P_EXT38 | P_EXT3A | P_VEXW)) == P_EXT
584        && ((rm | index) & 8) == 0) {
585        /* Two byte VEX prefix.  */
586        tcg_out8(s, 0xc5);
587
588        tmp = (r & 8 ? 0 : 0x80);              /* VEX.R */
589    } else {
590        /* Three byte VEX prefix.  */
591        tcg_out8(s, 0xc4);
592
593        /* VEX.m-mmmm */
594        if (opc & P_EXT3A) {
595            tmp = 3;
596        } else if (opc & P_EXT38) {
597            tmp = 2;
598        } else if (opc & P_EXT) {
599            tmp = 1;
600        } else {
601            g_assert_not_reached();
602        }
603        tmp |= (r & 8 ? 0 : 0x80);             /* VEX.R */
604        tmp |= (index & 8 ? 0 : 0x40);         /* VEX.X */
605        tmp |= (rm & 8 ? 0 : 0x20);            /* VEX.B */
606        tcg_out8(s, tmp);
607
608        tmp = (opc & P_VEXW ? 0x80 : 0);       /* VEX.W */
609    }
610
611    tmp |= (opc & P_VEXL ? 0x04 : 0);      /* VEX.L */
612    /* VEX.pp */
613    if (opc & P_DATA16) {
614        tmp |= 1;                          /* 0x66 */
615    } else if (opc & P_SIMDF3) {
616        tmp |= 2;                          /* 0xf3 */
617    } else if (opc & P_SIMDF2) {
618        tmp |= 3;                          /* 0xf2 */
619    }
620    tmp |= (~v & 15) << 3;                 /* VEX.vvvv */
621    tcg_out8(s, tmp);
622    tcg_out8(s, opc);
623}
624
625static void tcg_out_vex_modrm(TCGContext *s, int opc, int r, int v, int rm)
626{
627    tcg_out_vex_opc(s, opc, r, v, rm, 0);
628    tcg_out8(s, 0xc0 | (LOWREGMASK(r) << 3) | LOWREGMASK(rm));
629}
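/* As an illustration, tcg_out_vex_modrm(s, OPC_PXOR, TCG_REG_XMM0,
   TCG_REG_XMM0, TCG_REG_XMM0) emits c5 f9 ef c0, i.e. "vpxor %xmm0, %xmm0,
   %xmm0": the two-byte VEX prefix (inverted R, vvvv = ~0, pp = 1 for 0x66),
   the 0xef opcode, and ModRM 0xc0.  */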
630
631/* Output an opcode with a full "rm + (index<<shift) + offset" address mode.
632   Either RM or INDEX may be omitted by passing a negative value.  In 64-bit
633   mode for absolute addresses, ~RM is the size of the immediate operand
634   that will follow the instruction.  */
635
636static void tcg_out_sib_offset(TCGContext *s, int r, int rm, int index,
637                               int shift, intptr_t offset)
638{
639    int mod, len;
640
641    if (index < 0 && rm < 0) {
642        if (TCG_TARGET_REG_BITS == 64) {
643            /* Try for a rip-relative addressing mode.  This has replaced
644               the 32-bit-mode absolute addressing encoding.  */
645            intptr_t pc = (intptr_t)s->code_ptr + 5 + ~rm;
646            intptr_t disp = offset - pc;
647            if (disp == (int32_t)disp) {
648                tcg_out8(s, (LOWREGMASK(r) << 3) | 5);
649                tcg_out32(s, disp);
650                return;
651            }
652
653            /* Try for an absolute address encoding.  This requires the
654               use of the MODRM+SIB encoding and is therefore larger than
655               rip-relative addressing.  */
656            if (offset == (int32_t)offset) {
657                tcg_out8(s, (LOWREGMASK(r) << 3) | 4);
658                tcg_out8(s, (4 << 3) | 5);
659                tcg_out32(s, offset);
660                return;
661            }
662
663            /* ??? The memory isn't directly addressable.  */
664            g_assert_not_reached();
665        } else {
666            /* Absolute address.  */
667            tcg_out8(s, (r << 3) | 5);
668            tcg_out32(s, offset);
669            return;
670        }
671    }
672
673    /* Find the length of the immediate addend.  Note that the encoding
674       that would be used for (%ebp) indicates absolute addressing.  */
675    if (rm < 0) {
676        mod = 0, len = 4, rm = 5;
677    } else if (offset == 0 && LOWREGMASK(rm) != TCG_REG_EBP) {
678        mod = 0, len = 0;
679    } else if (offset == (int8_t)offset) {
680        mod = 0x40, len = 1;
681    } else {
682        mod = 0x80, len = 4;
683    }
684
685    /* Use a single byte MODRM format if possible.  Note that the encoding
686       that would be used for %esp is the escape to the two byte form.  */
687    if (index < 0 && LOWREGMASK(rm) != TCG_REG_ESP) {
688        /* Single byte MODRM format.  */
689        tcg_out8(s, mod | (LOWREGMASK(r) << 3) | LOWREGMASK(rm));
690    } else {
691        /* Two byte MODRM+SIB format.  */
692
693        /* Note that the encoding that would place %esp into the index
694           field indicates no index register.  In 64-bit mode, the REX.X
695           bit counts, so %r12 can be used as the index.  */
696        if (index < 0) {
697            index = 4;
698        } else {
699            tcg_debug_assert(index != TCG_REG_ESP);
700        }
701
702        tcg_out8(s, mod | (LOWREGMASK(r) << 3) | 4);
703        tcg_out8(s, (shift << 6) | (LOWREGMASK(index) << 3) | LOWREGMASK(rm));
704    }
705
706    if (len == 1) {
707        tcg_out8(s, offset);
708    } else if (len == 4) {
709        tcg_out32(s, offset);
710    }
711}
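/* For example, r = %eax, rm = %ebx, no index, offset = 8 produces ModRM 0x43
   (mod = 1, reg = eax, rm = ebx) followed by the displacement byte 0x08,
   i.e. the operand 0x8(%ebx).  */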
712
713static void tcg_out_modrm_sib_offset(TCGContext *s, int opc, int r, int rm,
714                                     int index, int shift, intptr_t offset)
715{
716    tcg_out_opc(s, opc, r, rm < 0 ? 0 : rm, index < 0 ? 0 : index);
717    tcg_out_sib_offset(s, r, rm, index, shift, offset);
718}
719
720static void tcg_out_vex_modrm_sib_offset(TCGContext *s, int opc, int r, int v,
721                                         int rm, int index, int shift,
722                                         intptr_t offset)
723{
724    tcg_out_vex_opc(s, opc, r, v, rm < 0 ? 0 : rm, index < 0 ? 0 : index);
725    tcg_out_sib_offset(s, r, rm, index, shift, offset);
726}
727
728/* A simplification of the above with no index or shift.  */
729static inline void tcg_out_modrm_offset(TCGContext *s, int opc, int r,
730                                        int rm, intptr_t offset)
731{
732    tcg_out_modrm_sib_offset(s, opc, r, rm, -1, 0, offset);
733}
734
735static inline void tcg_out_vex_modrm_offset(TCGContext *s, int opc, int r,
736                                            int v, int rm, intptr_t offset)
737{
738    tcg_out_vex_modrm_sib_offset(s, opc, r, v, rm, -1, 0, offset);
739}
740
741/* Output an opcode with an expected reference to the constant pool.  */
742static inline void tcg_out_modrm_pool(TCGContext *s, int opc, int r)
743{
744    tcg_out_opc(s, opc, r, 0, 0);
745    /* Absolute for 32-bit, pc-relative for 64-bit.  */
746    tcg_out8(s, LOWREGMASK(r) << 3 | 5);
747    tcg_out32(s, 0);
748}
749
750/* Output an opcode with an expected reference to the constant pool.  */
751static inline void tcg_out_vex_modrm_pool(TCGContext *s, int opc, int r)
752{
753    tcg_out_vex_opc(s, opc, r, 0, 0, 0);
754    /* Absolute for 32-bit, pc-relative for 64-bit.  */
755    tcg_out8(s, LOWREGMASK(r) << 3 | 5);
756    tcg_out32(s, 0);
757}
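/* In both helpers above, the 32-bit zero is a placeholder: callers register
   it with new_pool_label so that patch_reloc fills in the real displacement
   once the constant's location in the pool is known.  */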
758
759/* Generate dest op= src.  Uses the same ARITH_* codes as tgen_arithi.  */
760static inline void tgen_arithr(TCGContext *s, int subop, int dest, int src)
761{
762    /* Propagate an opcode prefix, such as P_REXW.  */
763    int ext = subop & ~0x7;
764    subop &= 0x7;
765
766    tcg_out_modrm(s, OPC_ARITH_GvEv + (subop << 3) + ext, dest, src);
767}
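/* For example, subop ARITH_CMP turns OPC_ARITH_GvEv (0x03) into 0x3b, the
   register-register CMP opcode; adding P_REXW selects the 64-bit form.  */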
768
769static bool tcg_out_mov(TCGContext *s, TCGType type, TCGReg ret, TCGReg arg)
770{
771    int rexw = 0;
772
773    if (arg == ret) {
774        return true;
775    }
776    switch (type) {
777    case TCG_TYPE_I64:
778        rexw = P_REXW;
779        /* fallthru */
780    case TCG_TYPE_I32:
781        if (ret < 16) {
782            if (arg < 16) {
783                tcg_out_modrm(s, OPC_MOVL_GvEv + rexw, ret, arg);
784            } else {
785                tcg_out_vex_modrm(s, OPC_MOVD_EyVy + rexw, arg, 0, ret);
786            }
787        } else {
788            if (arg < 16) {
789                tcg_out_vex_modrm(s, OPC_MOVD_VyEy + rexw, ret, 0, arg);
790            } else {
791                tcg_out_vex_modrm(s, OPC_MOVQ_VqWq, ret, 0, arg);
792            }
793        }
794        break;
795
796    case TCG_TYPE_V64:
797        tcg_debug_assert(ret >= 16 && arg >= 16);
798        tcg_out_vex_modrm(s, OPC_MOVQ_VqWq, ret, 0, arg);
799        break;
800    case TCG_TYPE_V128:
801        tcg_debug_assert(ret >= 16 && arg >= 16);
802        tcg_out_vex_modrm(s, OPC_MOVDQA_VxWx, ret, 0, arg);
803        break;
804    case TCG_TYPE_V256:
805        tcg_debug_assert(ret >= 16 && arg >= 16);
806        tcg_out_vex_modrm(s, OPC_MOVDQA_VxWx | P_VEXL, ret, 0, arg);
807        break;
808
809    default:
810        g_assert_not_reached();
811    }
812    return true;
813}
814
815static const int avx2_dup_insn[4] = {
816    OPC_VPBROADCASTB, OPC_VPBROADCASTW,
817    OPC_VPBROADCASTD, OPC_VPBROADCASTQ,
818};
819
820static bool tcg_out_dup_vec(TCGContext *s, TCGType type, unsigned vece,
821                            TCGReg r, TCGReg a)
822{
823    if (have_avx2) {
824        int vex_l = (type == TCG_TYPE_V256 ? P_VEXL : 0);
825        tcg_out_vex_modrm(s, avx2_dup_insn[vece] + vex_l, r, 0, a);
826    } else {
827        switch (vece) {
828        case MO_8:
829            /* ??? With zero in a register, use PSHUFB.  */
830            tcg_out_vex_modrm(s, OPC_PUNPCKLBW, r, a, a);
831            a = r;
832            /* FALLTHRU */
833        case MO_16:
834            tcg_out_vex_modrm(s, OPC_PUNPCKLWD, r, a, a);
835            a = r;
836            /* FALLTHRU */
837        case MO_32:
838            tcg_out_vex_modrm(s, OPC_PSHUFD, r, 0, a);
839            /* imm8 operand: all output lanes selected from input lane 0.  */
840            tcg_out8(s, 0);
841            break;
842        case MO_64:
843            tcg_out_vex_modrm(s, OPC_PUNPCKLQDQ, r, a, a);
844            break;
845        default:
846            g_assert_not_reached();
847        }
848    }
849    return true;
850}
851
852static bool tcg_out_dupm_vec(TCGContext *s, TCGType type, unsigned vece,
853                             TCGReg r, TCGReg base, intptr_t offset)
854{
855    if (have_avx2) {
856        int vex_l = (type == TCG_TYPE_V256 ? P_VEXL : 0);
857        tcg_out_vex_modrm_offset(s, avx2_dup_insn[vece] + vex_l,
858                                 r, 0, base, offset);
859    } else {
860        switch (vece) {
861        case MO_64:
862            tcg_out_vex_modrm_offset(s, OPC_MOVDDUP, r, 0, base, offset);
863            break;
864        case MO_32:
865            tcg_out_vex_modrm_offset(s, OPC_VBROADCASTSS, r, 0, base, offset);
866            break;
867        case MO_16:
868            tcg_out_vex_modrm_offset(s, OPC_VPINSRW, r, r, base, offset);
869            tcg_out8(s, 0); /* imm8 */
870            tcg_out_dup_vec(s, type, vece, r, r);
871            break;
872        case MO_8:
873            tcg_out_vex_modrm_offset(s, OPC_VPINSRB, r, r, base, offset);
874            tcg_out8(s, 0); /* imm8 */
875            tcg_out_dup_vec(s, type, vece, r, r);
876            break;
877        default:
878            g_assert_not_reached();
879        }
880    }
881    return true;
882}
883
884static void tcg_out_dupi_vec(TCGContext *s, TCGType type, unsigned vece,
885                             TCGReg ret, int64_t arg)
886{
887    int vex_l = (type == TCG_TYPE_V256 ? P_VEXL : 0);
888
889    if (arg == 0) {
890        tcg_out_vex_modrm(s, OPC_PXOR, ret, ret, ret);
891        return;
892    }
893    if (arg == -1) {
894        tcg_out_vex_modrm(s, OPC_PCMPEQB + vex_l, ret, ret, ret);
895        return;
896    }
897
898    if (TCG_TARGET_REG_BITS == 32 && vece < MO_64) {
899        if (have_avx2) {
900            tcg_out_vex_modrm_pool(s, OPC_VPBROADCASTD + vex_l, ret);
901        } else {
902            tcg_out_vex_modrm_pool(s, OPC_VBROADCASTSS, ret);
903        }
904        new_pool_label(s, arg, R_386_32, s->code_ptr - 4, 0);
905    } else {
906        if (type == TCG_TYPE_V64) {
907            tcg_out_vex_modrm_pool(s, OPC_MOVQ_VqWq, ret);
908        } else if (have_avx2) {
909            tcg_out_vex_modrm_pool(s, OPC_VPBROADCASTQ + vex_l, ret);
910        } else {
911            tcg_out_vex_modrm_pool(s, OPC_MOVDDUP, ret);
912        }
913        if (TCG_TARGET_REG_BITS == 64) {
914            new_pool_label(s, arg, R_386_PC32, s->code_ptr - 4, -4);
915        } else {
916            new_pool_l2(s, R_386_32, s->code_ptr - 4, 0, arg, arg >> 32);
917        }
918    }
919}
920
921static void tcg_out_movi_vec(TCGContext *s, TCGType type,
922                             TCGReg ret, tcg_target_long arg)
923{
924    if (arg == 0) {
925        tcg_out_vex_modrm(s, OPC_PXOR, ret, ret, ret);
926        return;
927    }
928    if (arg == -1) {
929        tcg_out_vex_modrm(s, OPC_PCMPEQB, ret, ret, ret);
930        return;
931    }
932
933    int rexw = (type == TCG_TYPE_I32 ? 0 : P_REXW);
934    tcg_out_vex_modrm_pool(s, OPC_MOVD_VyEy + rexw, ret);
935    if (TCG_TARGET_REG_BITS == 64) {
936        new_pool_label(s, arg, R_386_PC32, s->code_ptr - 4, -4);
937    } else {
938        new_pool_label(s, arg, R_386_32, s->code_ptr - 4, 0);
939    }
940}
941
942static void tcg_out_movi_int(TCGContext *s, TCGType type,
943                             TCGReg ret, tcg_target_long arg)
944{
945    tcg_target_long diff;
946
947    if (arg == 0) {
948        tgen_arithr(s, ARITH_XOR, ret, ret);
949        return;
950    }
951    if (arg == (uint32_t)arg || type == TCG_TYPE_I32) {
952        tcg_out_opc(s, OPC_MOVL_Iv + LOWREGMASK(ret), 0, ret, 0);
953        tcg_out32(s, arg);
954        return;
955    }
956    if (arg == (int32_t)arg) {
957        tcg_out_modrm(s, OPC_MOVL_EvIz + P_REXW, 0, ret);
958        tcg_out32(s, arg);
959        return;
960    }
961
962    /* Try a 7 byte pc-relative lea before the 10 byte movq.  */
963    diff = tcg_pcrel_diff(s, (const void *)arg) - 7;
964    if (diff == (int32_t)diff) {
965        tcg_out_opc(s, OPC_LEA | P_REXW, ret, 0, 0);
966        tcg_out8(s, (LOWREGMASK(ret) << 3) | 5);
967        tcg_out32(s, diff);
968        return;
969    }
970
971    tcg_out_opc(s, OPC_MOVL_Iv + P_REXW + LOWREGMASK(ret), 0, ret, 0);
972    tcg_out64(s, arg);
973}
974
975static void tcg_out_movi(TCGContext *s, TCGType type,
976                         TCGReg ret, tcg_target_long arg)
977{
978    switch (type) {
979    case TCG_TYPE_I32:
980#if TCG_TARGET_REG_BITS == 64
981    case TCG_TYPE_I64:
982#endif
983        if (ret < 16) {
984            tcg_out_movi_int(s, type, ret, arg);
985        } else {
986            tcg_out_movi_vec(s, type, ret, arg);
987        }
988        break;
989    default:
990        g_assert_not_reached();
991    }
992}
993
994static inline void tcg_out_pushi(TCGContext *s, tcg_target_long val)
995{
996    if (val == (int8_t)val) {
997        tcg_out_opc(s, OPC_PUSH_Ib, 0, 0, 0);
998        tcg_out8(s, val);
999    } else if (val == (int32_t)val) {
1000        tcg_out_opc(s, OPC_PUSH_Iv, 0, 0, 0);
1001        tcg_out32(s, val);
1002    } else {
1003        tcg_abort();
1004    }
1005}
1006
1007static inline void tcg_out_mb(TCGContext *s, TCGArg a0)
1008{
1009    /* Given the strength of x86 memory ordering, we need only care about
1010       store-load ordering.  Experimentally, "lock orl $0,0(%esp)" is
1011       faster than "mfence", so don't bother with the SSE insn.  */
1012    if (a0 & TCG_MO_ST_LD) {
1013        tcg_out8(s, 0xf0);
1014        tcg_out_modrm_offset(s, OPC_ARITH_EvIb, ARITH_OR, TCG_REG_ESP, 0);
1015        tcg_out8(s, 0);
1016    }
1017}
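/* The sequence above assembles to f0 83 0c 24 00, i.e. "lock orl $0,(%esp)":
   the LOCK prefix, 0x83 /1 with an %esp-based SIB byte, and an imm8 of 0.  */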
1018
1019static inline void tcg_out_push(TCGContext *s, int reg)
1020{
1021    tcg_out_opc(s, OPC_PUSH_r32 + LOWREGMASK(reg), 0, reg, 0);
1022}
1023
1024static inline void tcg_out_pop(TCGContext *s, int reg)
1025{
1026    tcg_out_opc(s, OPC_POP_r32 + LOWREGMASK(reg), 0, reg, 0);
1027}
1028
1029static void tcg_out_ld(TCGContext *s, TCGType type, TCGReg ret,
1030                       TCGReg arg1, intptr_t arg2)
1031{
1032    switch (type) {
1033    case TCG_TYPE_I32:
1034        if (ret < 16) {
1035            tcg_out_modrm_offset(s, OPC_MOVL_GvEv, ret, arg1, arg2);
1036        } else {
1037            tcg_out_vex_modrm_offset(s, OPC_MOVD_VyEy, ret, 0, arg1, arg2);
1038        }
1039        break;
1040    case TCG_TYPE_I64:
1041        if (ret < 16) {
1042            tcg_out_modrm_offset(s, OPC_MOVL_GvEv | P_REXW, ret, arg1, arg2);
1043            break;
1044        }
1045        /* FALLTHRU */
1046    case TCG_TYPE_V64:
1047        /* There is no instruction that can validate 8-byte alignment.  */
1048        tcg_debug_assert(ret >= 16);
1049        tcg_out_vex_modrm_offset(s, OPC_MOVQ_VqWq, ret, 0, arg1, arg2);
1050        break;
1051    case TCG_TYPE_V128:
1052        /*
1053         * The gvec infrastructure asserts that v128 vector loads
1054         * and stores use a 16-byte aligned offset.  Validate that the
1055         * final pointer is aligned by using an insn that will SIGSEGV.
1056         */
1057        tcg_debug_assert(ret >= 16);
1058        tcg_out_vex_modrm_offset(s, OPC_MOVDQA_VxWx, ret, 0, arg1, arg2);
1059        break;
1060    case TCG_TYPE_V256:
1061        /*
1062         * The gvec infrastructure only requires 16-byte alignment,
1063         * so here we must use an unaligned load.
1064         */
1065        tcg_debug_assert(ret >= 16);
1066        tcg_out_vex_modrm_offset(s, OPC_MOVDQU_VxWx | P_VEXL,
1067                                 ret, 0, arg1, arg2);
1068        break;
1069    default:
1070        g_assert_not_reached();
1071    }
1072}
1073
1074static void tcg_out_st(TCGContext *s, TCGType type, TCGReg arg,
1075                       TCGReg arg1, intptr_t arg2)
1076{
1077    switch (type) {
1078    case TCG_TYPE_I32:
1079        if (arg < 16) {
1080            tcg_out_modrm_offset(s, OPC_MOVL_EvGv, arg, arg1, arg2);
1081        } else {
1082            tcg_out_vex_modrm_offset(s, OPC_MOVD_EyVy, arg, 0, arg1, arg2);
1083        }
1084        break;
1085    case TCG_TYPE_I64:
1086        if (arg < 16) {
1087            tcg_out_modrm_offset(s, OPC_MOVL_EvGv | P_REXW, arg, arg1, arg2);
1088            break;
1089        }
1090        /* FALLTHRU */
1091    case TCG_TYPE_V64:
1092        /* There is no instruction that can validate 8-byte alignment.  */
1093        tcg_debug_assert(arg >= 16);
1094        tcg_out_vex_modrm_offset(s, OPC_MOVQ_WqVq, arg, 0, arg1, arg2);
1095        break;
1096    case TCG_TYPE_V128:
1097        /*
1098         * The gvec infrastructure asserts that v128 vector loads
1099         * and stores use a 16-byte aligned offset.  Validate that the
1100         * final pointer is aligned by using an insn that will SIGSEGV.
1101         */
1102        tcg_debug_assert(arg >= 16);
1103        tcg_out_vex_modrm_offset(s, OPC_MOVDQA_WxVx, arg, 0, arg1, arg2);
1104        break;
1105    case TCG_TYPE_V256:
1106        /*
1107         * The gvec infrastructure only requires 16-byte alignment,
1108         * so here we must use an unaligned store.
1109         */
1110        tcg_debug_assert(arg >= 16);
1111        tcg_out_vex_modrm_offset(s, OPC_MOVDQU_WxVx | P_VEXL,
1112                                 arg, 0, arg1, arg2);
1113        break;
1114    default:
1115        g_assert_not_reached();
1116    }
1117}
1118
1119static bool tcg_out_sti(TCGContext *s, TCGType type, TCGArg val,
1120                        TCGReg base, intptr_t ofs)
1121{
1122    int rexw = 0;
1123    if (TCG_TARGET_REG_BITS == 64 && type == TCG_TYPE_I64) {
1124        if (val != (int32_t)val) {
1125            return false;
1126        }
1127        rexw = P_REXW;
1128    } else if (type != TCG_TYPE_I32) {
1129        return false;
1130    }
1131    tcg_out_modrm_offset(s, OPC_MOVL_EvIz | rexw, 0, base, ofs);
1132    tcg_out32(s, val);
1133    return true;
1134}
1135
1136static void tcg_out_shifti(TCGContext *s, int subopc, int reg, int count)
1137{
1138    /* Propagate an opcode prefix, such as P_DATA16.  */
1139    int ext = subopc & ~0x7;
1140    subopc &= 0x7;
1141
1142    if (count == 1) {
1143        tcg_out_modrm(s, OPC_SHIFT_1 + ext, subopc, reg);
1144    } else {
1145        tcg_out_modrm(s, OPC_SHIFT_Ib + ext, subopc, reg);
1146        tcg_out8(s, count);
1147    }
1148}
1149
1150static inline void tcg_out_bswap32(TCGContext *s, int reg)
1151{
1152    tcg_out_opc(s, OPC_BSWAP + LOWREGMASK(reg), 0, reg, 0);
1153}
1154
1155static inline void tcg_out_rolw_8(TCGContext *s, int reg)
1156{
1157    tcg_out_shifti(s, SHIFT_ROL + P_DATA16, reg, 8);
1158}
1159
1160static inline void tcg_out_ext8u(TCGContext *s, int dest, int src)
1161{
1162    /* movzbl */
1163    tcg_debug_assert(src < 4 || TCG_TARGET_REG_BITS == 64);
1164    tcg_out_modrm(s, OPC_MOVZBL + P_REXB_RM, dest, src);
1165}
1166
1167static void tcg_out_ext8s(TCGContext *s, int dest, int src, int rexw)
1168{
1169    /* movsbl */
1170    tcg_debug_assert(src < 4 || TCG_TARGET_REG_BITS == 64);
1171    tcg_out_modrm(s, OPC_MOVSBL + P_REXB_RM + rexw, dest, src);
1172}
1173
1174static inline void tcg_out_ext16u(TCGContext *s, int dest, int src)
1175{
1176    /* movzwl */
1177    tcg_out_modrm(s, OPC_MOVZWL, dest, src);
1178}
1179
1180static inline void tcg_out_ext16s(TCGContext *s, int dest, int src, int rexw)
1181{
1182    /* movsw[lq] */
1183    tcg_out_modrm(s, OPC_MOVSWL + rexw, dest, src);
1184}
1185
1186static inline void tcg_out_ext32u(TCGContext *s, int dest, int src)
1187{
1188    /* 32-bit mov zero extends.  */
1189    tcg_out_modrm(s, OPC_MOVL_GvEv, dest, src);
1190}
1191
1192static inline void tcg_out_ext32s(TCGContext *s, int dest, int src)
1193{
1194    tcg_out_modrm(s, OPC_MOVSLQ, dest, src);
1195}
1196
1197static inline void tcg_out_bswap64(TCGContext *s, int reg)
1198{
1199    tcg_out_opc(s, OPC_BSWAP + P_REXW + LOWREGMASK(reg), 0, reg, 0);
1200}
1201
1202static void tgen_arithi(TCGContext *s, int c, int r0,
1203                        tcg_target_long val, int cf)
1204{
1205    int rexw = 0;
1206
1207    if (TCG_TARGET_REG_BITS == 64) {
1208        rexw = c & -8;
1209        c &= 7;
1210    }
1211
1212    /* ??? While INC/DEC are 2 bytes shorter than ADDL/SUBL $1, they also
1213       induce partial flags update stalls on Pentium4 and are not
1214       recommended by current Intel optimization manuals.  */
1215    if (!cf && (c == ARITH_ADD || c == ARITH_SUB) && (val == 1 || val == -1)) {
1216        int is_inc = (c == ARITH_ADD) ^ (val < 0);
1217        if (TCG_TARGET_REG_BITS == 64) {
1218            /* The single-byte increment encodings are re-tasked as the
1219               REX prefixes.  Use the MODRM encoding.  */
1220            tcg_out_modrm(s, OPC_GRP5 + rexw,
1221                          (is_inc ? EXT5_INC_Ev : EXT5_DEC_Ev), r0);
1222        } else {
1223            tcg_out8(s, (is_inc ? OPC_INC_r32 : OPC_DEC_r32) + r0);
1224        }
1225        return;
1226    }
1227
1228    if (c == ARITH_AND) {
1229        if (TCG_TARGET_REG_BITS == 64) {
1230            if (val == 0xffffffffu) {
1231                tcg_out_ext32u(s, r0, r0);
1232                return;
1233            }
1234            if (val == (uint32_t)val) {
1235                /* AND with no high bits set can use a 32-bit operation.  */
1236                rexw = 0;
1237            }
1238        }
1239        if (val == 0xffu && (r0 < 4 || TCG_TARGET_REG_BITS == 64)) {
1240            tcg_out_ext8u(s, r0, r0);
1241            return;
1242        }
1243        if (val == 0xffffu) {
1244            tcg_out_ext16u(s, r0, r0);
1245            return;
1246        }
1247    }
1248
1249    if (val == (int8_t)val) {
1250        tcg_out_modrm(s, OPC_ARITH_EvIb + rexw, c, r0);
1251        tcg_out8(s, val);
1252        return;
1253    }
1254    if (rexw == 0 || val == (int32_t)val) {
1255        tcg_out_modrm(s, OPC_ARITH_EvIz + rexw, c, r0);
1256        tcg_out32(s, val);
1257        return;
1258    }
1259
1260    tcg_abort();
1261}
1262
1263static void tcg_out_addi(TCGContext *s, int reg, tcg_target_long val)
1264{
1265    if (val != 0) {
1266        tgen_arithi(s, ARITH_ADD + P_REXW, reg, val, 0);
1267    }
1268}
1269
1270/* Use SMALL != 0 to force a short forward branch.  */
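/* The constants subtracted from the pc-relative difference below (2, 5 and 6)
   are the encoded lengths of the short Jcc/JMP, long JMP and long Jcc forms,
   so the stored displacement is relative to the end of the branch insn.  */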
1271static void tcg_out_jxx(TCGContext *s, int opc, TCGLabel *l, int small)
1272{
1273    int32_t val, val1;
1274
1275    if (l->has_value) {
1276        val = tcg_pcrel_diff(s, l->u.value_ptr);
1277        val1 = val - 2;
1278        if ((int8_t)val1 == val1) {
1279            if (opc == -1) {
1280                tcg_out8(s, OPC_JMP_short);
1281            } else {
1282                tcg_out8(s, OPC_JCC_short + opc);
1283            }
1284            tcg_out8(s, val1);
1285        } else {
1286            if (small) {
1287                tcg_abort();
1288            }
1289            if (opc == -1) {
1290                tcg_out8(s, OPC_JMP_long);
1291                tcg_out32(s, val - 5);
1292            } else {
1293                tcg_out_opc(s, OPC_JCC_long + opc, 0, 0, 0);
1294                tcg_out32(s, val - 6);
1295            }
1296        }
1297    } else if (small) {
1298        if (opc == -1) {
1299            tcg_out8(s, OPC_JMP_short);
1300        } else {
1301            tcg_out8(s, OPC_JCC_short + opc);
1302        }
1303        tcg_out_reloc(s, s->code_ptr, R_386_PC8, l, -1);
1304        s->code_ptr += 1;
1305    } else {
1306        if (opc == -1) {
1307            tcg_out8(s, OPC_JMP_long);
1308        } else {
1309            tcg_out_opc(s, OPC_JCC_long + opc, 0, 0, 0);
1310        }
1311        tcg_out_reloc(s, s->code_ptr, R_386_PC32, l, -4);
1312        s->code_ptr += 4;
1313    }
1314}
1315
1316static void tcg_out_cmp(TCGContext *s, TCGArg arg1, TCGArg arg2,
1317                        int const_arg2, int rexw)
1318{
1319    if (const_arg2) {
1320        if (arg2 == 0) {
1321            /* test r, r */
1322            tcg_out_modrm(s, OPC_TESTL + rexw, arg1, arg1);
1323        } else {
1324            tgen_arithi(s, ARITH_CMP + rexw, arg1, arg2, 0);
1325        }
1326    } else {
1327        tgen_arithr(s, ARITH_CMP + rexw, arg1, arg2);
1328    }
1329}
1330
1331static void tcg_out_brcond32(TCGContext *s, TCGCond cond,
1332                             TCGArg arg1, TCGArg arg2, int const_arg2,
1333                             TCGLabel *label, int small)
1334{
1335    tcg_out_cmp(s, arg1, arg2, const_arg2, 0);
1336    tcg_out_jxx(s, tcg_cond_to_jcc[cond], label, small);
1337}
1338
1339#if TCG_TARGET_REG_BITS == 64
1340static void tcg_out_brcond64(TCGContext *s, TCGCond cond,
1341                             TCGArg arg1, TCGArg arg2, int const_arg2,
1342                             TCGLabel *label, int small)
1343{
1344    tcg_out_cmp(s, arg1, arg2, const_arg2, P_REXW);
1345    tcg_out_jxx(s, tcg_cond_to_jcc[cond], label, small);
1346}
1347#else
1348/* XXX: we implement it at the target level to avoid having to
1349   handle temporaries that live across basic blocks */
1350static void tcg_out_brcond2(TCGContext *s, const TCGArg *args,
1351                            const int *const_args, int small)
1352{
1353    TCGLabel *label_next = gen_new_label();
1354    TCGLabel *label_this = arg_label(args[5]);
1355
1356    switch (args[4]) {
1357    case TCG_COND_EQ:
1358        tcg_out_brcond32(s, TCG_COND_NE, args[0], args[2], const_args[2],
1359                         label_next, 1);
1360        tcg_out_brcond32(s, TCG_COND_EQ, args[1], args[3], const_args[3],
1361                         label_this, small);
1362        break;
1363    case TCG_COND_NE:
1364        tcg_out_brcond32(s, TCG_COND_NE, args[0], args[2], const_args[2],
1365                         label_this, small);
1366        tcg_out_brcond32(s, TCG_COND_NE, args[1], args[3], const_args[3],
1367                         label_this, small);
1368        break;
1369    case TCG_COND_LT:
1370        tcg_out_brcond32(s, TCG_COND_LT, args[1], args[3], const_args[3],
1371                         label_this, small);
1372        tcg_out_jxx(s, JCC_JNE, label_next, 1);
1373        tcg_out_brcond32(s, TCG_COND_LTU, args[0], args[2], const_args[2],
1374                         label_this, small);
1375        break;
1376    case TCG_COND_LE:
1377        tcg_out_brcond32(s, TCG_COND_LT, args[1], args[3], const_args[3],
1378                         label_this, small);
1379        tcg_out_jxx(s, JCC_JNE, label_next, 1);
1380        tcg_out_brcond32(s, TCG_COND_LEU, args[0], args[2], const_args[2],
1381                         label_this, small);
1382        break;
1383    case TCG_COND_GT:
1384        tcg_out_brcond32(s, TCG_COND_GT, args[1], args[3], const_args[3],
1385                         label_this, small);
1386        tcg_out_jxx(s, JCC_JNE, label_next, 1);
1387        tcg_out_brcond32(s, TCG_COND_GTU, args[0], args[2], const_args[2],
1388                         label_this, small);
1389        break;
1390    case TCG_COND_GE:
1391        tcg_out_brcond32(s, TCG_COND_GT, args[1], args[3], const_args[3],
1392                         label_this, small);
1393        tcg_out_jxx(s, JCC_JNE, label_next, 1);
1394        tcg_out_brcond32(s, TCG_COND_GEU, args[0], args[2], const_args[2],
1395                         label_this, small);
1396        break;
1397    case TCG_COND_LTU:
1398        tcg_out_brcond32(s, TCG_COND_LTU, args[1], args[3], const_args[3],
1399                         label_this, small);
1400        tcg_out_jxx(s, JCC_JNE, label_next, 1);
1401        tcg_out_brcond32(s, TCG_COND_LTU, args[0], args[2], const_args[2],
1402                         label_this, small);
1403        break;
1404    case TCG_COND_LEU:
1405        tcg_out_brcond32(s, TCG_COND_LTU, args[1], args[3], const_args[3],
1406                         label_this, small);
1407        tcg_out_jxx(s, JCC_JNE, label_next, 1);
1408        tcg_out_brcond32(s, TCG_COND_LEU, args[0], args[2], const_args[2],
1409                         label_this, small);
1410        break;
1411    case TCG_COND_GTU:
1412        tcg_out_brcond32(s, TCG_COND_GTU, args[1], args[3], const_args[3],
1413                         label_this, small);
1414        tcg_out_jxx(s, JCC_JNE, label_next, 1);
1415        tcg_out_brcond32(s, TCG_COND_GTU, args[0], args[2], const_args[2],
1416                         label_this, small);
1417        break;
1418    case TCG_COND_GEU:
1419        tcg_out_brcond32(s, TCG_COND_GTU, args[1], args[3], const_args[3],
1420                         label_this, small);
1421        tcg_out_jxx(s, JCC_JNE, label_next, 1);
1422        tcg_out_brcond32(s, TCG_COND_GEU, args[0], args[2], const_args[2],
1423                         label_this, small);
1424        break;
1425    default:
1426        tcg_abort();
1427    }
1428    tcg_out_label(s, label_next);
1429}
1430#endif
1431
1432static void tcg_out_setcond32(TCGContext *s, TCGCond cond, TCGArg dest,
1433                              TCGArg arg1, TCGArg arg2, int const_arg2)
1434{
1435    tcg_out_cmp(s, arg1, arg2, const_arg2, 0);
1436    tcg_out_modrm(s, OPC_SETCC | tcg_cond_to_jcc[cond], 0, dest);
1437    tcg_out_ext8u(s, dest, dest);
1438}
1439
1440#if TCG_TARGET_REG_BITS == 64
1441static void tcg_out_setcond64(TCGContext *s, TCGCond cond, TCGArg dest,
1442                              TCGArg arg1, TCGArg arg2, int const_arg2)
1443{
1444    tcg_out_cmp(s, arg1, arg2, const_arg2, P_REXW);
1445    tcg_out_modrm(s, OPC_SETCC | tcg_cond_to_jcc[cond], 0, dest);
1446    tcg_out_ext8u(s, dest, dest);
1447}
1448#else
1449static void tcg_out_setcond2(TCGContext *s, const TCGArg *args,
1450                             const int *const_args)
1451{
1452    TCGArg new_args[6];
1453    TCGLabel *label_true, *label_over;
1454
1455    memcpy(new_args, args+1, 5*sizeof(TCGArg));
1456
1457    if (args[0] == args[1] || args[0] == args[2]
1458        || (!const_args[3] && args[0] == args[3])
1459        || (!const_args[4] && args[0] == args[4])) {
1460        /* When the destination overlaps with one of the argument
1461           registers, don't do anything tricky.  */
1462        label_true = gen_new_label();
1463        label_over = gen_new_label();
1464
1465        new_args[5] = label_arg(label_true);
1466        tcg_out_brcond2(s, new_args, const_args+1, 1);
1467
1468        tcg_out_movi(s, TCG_TYPE_I32, args[0], 0);
1469        tcg_out_jxx(s, JCC_JMP, label_over, 1);
1470        tcg_out_label(s, label_true);
1471
1472        tcg_out_movi(s, TCG_TYPE_I32, args[0], 1);
1473        tcg_out_label(s, label_over);
1474    } else {
1475        /* When the destination does not overlap one of the arguments,
1476           clear the destination first, jump if cond false, and emit an
1477           increment in the true case.  This results in smaller code.  */
1478
1479        tcg_out_movi(s, TCG_TYPE_I32, args[0], 0);
1480
1481        label_over = gen_new_label();
1482        new_args[4] = tcg_invert_cond(new_args[4]);
1483        new_args[5] = label_arg(label_over);
1484        tcg_out_brcond2(s, new_args, const_args+1, 1);
1485
1486        tgen_arithi(s, ARITH_ADD, args[0], 1, 0);
1487        tcg_out_label(s, label_over);
1488    }
1489}
1490#endif
1491
1492static void tcg_out_cmov(TCGContext *s, TCGCond cond, int rexw,
1493                         TCGReg dest, TCGReg v1)
1494{
1495    if (have_cmov) {
1496        tcg_out_modrm(s, OPC_CMOVCC | tcg_cond_to_jcc[cond] | rexw, dest, v1);
1497    } else {
1498        TCGLabel *over = gen_new_label();
1499        tcg_out_jxx(s, tcg_cond_to_jcc[tcg_invert_cond(cond)], over, 1);
1500        tcg_out_mov(s, TCG_TYPE_I32, dest, v1);
1501        tcg_out_label(s, over);
1502    }
1503}
1504
1505static void tcg_out_movcond32(TCGContext *s, TCGCond cond, TCGReg dest,
1506                              TCGReg c1, TCGArg c2, int const_c2,
1507                              TCGReg v1)
1508{
1509    tcg_out_cmp(s, c1, c2, const_c2, 0);
1510    tcg_out_cmov(s, cond, 0, dest, v1);
1511}
1512
1513#if TCG_TARGET_REG_BITS == 64
1514static void tcg_out_movcond64(TCGContext *s, TCGCond cond, TCGReg dest,
1515                              TCGReg c1, TCGArg c2, int const_c2,
1516                              TCGReg v1)
1517{
1518    tcg_out_cmp(s, c1, c2, const_c2, P_REXW);
1519    tcg_out_cmov(s, cond, P_REXW, dest, v1);
1520}
1521#endif
1522
1523static void tcg_out_ctz(TCGContext *s, int rexw, TCGReg dest, TCGReg arg1,
1524                        TCGArg arg2, bool const_a2)
1525{
1526    if (have_bmi1) {
1527        tcg_out_modrm(s, OPC_TZCNT + rexw, dest, arg1);
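        /* TZCNT sets CF when the source is zero; when ARG2 is not constant,
           the LTU cmov below uses that flag to supply ARG2 instead.  */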
1528        if (const_a2) {
1529            tcg_debug_assert(arg2 == (rexw ? 64 : 32));
1530        } else {
1531            tcg_debug_assert(dest != arg2);
1532            tcg_out_cmov(s, TCG_COND_LTU, rexw, dest, arg2);
1533        }
1534    } else {
1535        tcg_debug_assert(dest != arg2);
1536        tcg_out_modrm(s, OPC_BSF + rexw, dest, arg1);
1537        tcg_out_cmov(s, TCG_COND_EQ, rexw, dest, arg2);
1538    }
1539}
1540
1541static void tcg_out_clz(TCGContext *s, int rexw, TCGReg dest, TCGReg arg1,
1542                        TCGArg arg2, bool const_a2)
1543{
1544    if (have_lzcnt) {
1545        tcg_out_modrm(s, OPC_LZCNT + rexw, dest, arg1);
1546        if (const_a2) {
1547            tcg_debug_assert(arg2 == (rexw ? 64 : 32));
1548        } else {
1549            tcg_debug_assert(dest != arg2);
1550            tcg_out_cmov(s, TCG_COND_LTU, rexw, dest, arg2);
1551        }
1552    } else {
1553        tcg_debug_assert(!const_a2);
1554        tcg_debug_assert(dest != arg1);
1555        tcg_debug_assert(dest != arg2);
1556
1557        /* Recall that the output of BSR is the index not the count.  */
1558        tcg_out_modrm(s, OPC_BSR + rexw, dest, arg1);
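        /* XOR with width-1 maps the BSR result to (width-1) - index, which is
           the leading-zero count, since the index is always in [0, width-1].  */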
1559        tgen_arithi(s, ARITH_XOR + rexw, dest, rexw ? 63 : 31, 0);
1560
1561        /* Since we have destroyed the flags from BSR, we have to re-test.  */
1562        tcg_out_cmp(s, arg1, 0, 1, rexw);
1563        tcg_out_cmov(s, TCG_COND_EQ, rexw, dest, arg2);
1564    }
1565}
1566
1567static void tcg_out_branch(TCGContext *s, int call, const tcg_insn_unit *dest)
1568{
1569    intptr_t disp = tcg_pcrel_diff(s, dest) - 5;
1570
1571    if (disp == (int32_t)disp) {
1572        tcg_out_opc(s, call ? OPC_CALL_Jz : OPC_JMP_long, 0, 0, 0);
1573        tcg_out32(s, disp);
1574    } else {
1575        /* rip-relative addressing into the constant pool.
1576           This is 6 + 8 = 14 bytes, as compared to using an
1577           immediate load, 10 + 6 = 16 bytes, plus we may
1578           be able to re-use the pool constant for more calls.  */
1579        tcg_out_opc(s, OPC_GRP5, 0, 0, 0);
1580        tcg_out8(s, (call ? EXT5_CALLN_Ev : EXT5_JMPN_Ev) << 3 | 5);
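        /* The ModRM byte above has mod=00, rm=101, i.e. RIP-relative
           addressing with a 32-bit displacement on x86-64 (the only case
           in which this path is reachable); the displacement is patched
           to point at an 8-byte pool entry holding DEST.  */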
1581        new_pool_label(s, (uintptr_t)dest, R_386_PC32, s->code_ptr, -4);
1582        tcg_out32(s, 0);
1583    }
1584}
1585
1586static inline void tcg_out_call(TCGContext *s, const tcg_insn_unit *dest)
1587{
1588    tcg_out_branch(s, 1, dest);
1589}
1590
1591static void tcg_out_jmp(TCGContext *s, const tcg_insn_unit *dest)
1592{
1593    tcg_out_branch(s, 0, dest);
1594}
1595
1596static void tcg_out_nopn(TCGContext *s, int n)
1597{
1598    int i;
1599    /* Emit n - 1 operand size prefixes in front of the standard one byte nop,
1600     * "xchg %eax,%eax", forming "xchg %ax,%ax". All cores accept the
1601     * duplicate prefix, and all of the interesting recent cores can
1602     * decode and discard the duplicates in a single cycle.
1603     */
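    /* For example, n == 3 emits the byte sequence 66 66 90.  */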
1604    tcg_debug_assert(n >= 1);
1605    for (i = 1; i < n; ++i) {
1606        tcg_out8(s, 0x66);
1607    }
1608    tcg_out8(s, 0x90);
1609}
1610
1611#if defined(CONFIG_SOFTMMU)
1612/* helper signature: helper_ret_ld_mmu(CPUState *env, target_ulong addr,
1613 *                                     MemOpIdx oi, uintptr_t ra)
1614 */
1615static void * const qemu_ld_helpers[(MO_SIZE | MO_BSWAP) + 1] = {
1616    [MO_UB]   = helper_ret_ldub_mmu,
1617    [MO_LEUW] = helper_le_lduw_mmu,
1618    [MO_LEUL] = helper_le_ldul_mmu,
1619    [MO_LEUQ] = helper_le_ldq_mmu,
1620    [MO_BEUW] = helper_be_lduw_mmu,
1621    [MO_BEUL] = helper_be_ldul_mmu,
1622    [MO_BEUQ] = helper_be_ldq_mmu,
1623};
1624
1625/* helper signature: helper_ret_st_mmu(CPUState *env, target_ulong addr,
1626 *                                     uintxx_t val, MemOpIdx oi, uintptr_t ra)
1627 */
1628static void * const qemu_st_helpers[(MO_SIZE | MO_BSWAP) + 1] = {
1629    [MO_UB]   = helper_ret_stb_mmu,
1630    [MO_LEUW] = helper_le_stw_mmu,
1631    [MO_LEUL] = helper_le_stl_mmu,
1632    [MO_LEUQ] = helper_le_stq_mmu,
1633    [MO_BEUW] = helper_be_stw_mmu,
1634    [MO_BEUL] = helper_be_stl_mmu,
1635    [MO_BEUQ] = helper_be_stq_mmu,
1636};
1637
1638/* Perform the TLB load and compare.
1639
1640   Inputs:
1641   ADDRLO and ADDRHI contain the low and high part of the address.
1642
1643   MEM_INDEX and OPC are the memory context and memop for the access.
1644
1645   WHICH is the offset into the CPUTLBEntry structure of the slot to read.
1646   This should be offsetof addr_read or addr_write.
1647
1648   Outputs:
1649   LABEL_PTRS is filled with 1 (32-bit addresses) or 2 (64-bit addresses)
1650   positions of the displacements of forward jumps to the TLB miss case.
1651
1652   Second argument register is loaded with the low part of the address.
1653   In the TLB hit case, it has been adjusted as indicated by the TLB
1654   and so is a host address.  In the TLB miss case, it continues to
1655   hold a guest address.
1656
1657   First argument register is clobbered.  */
1658
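/*
 * The fast-path sequence emitted below is roughly:
 *
 *    mov    addrlo, r0
 *    shr    $(TARGET_PAGE_BITS - CPU_TLB_ENTRY_BITS), r0
 *    and    mask(env), r0
 *    add    table(env), r0            (r0 = matching CPUTLBEntry)
 *    lea    s_mask-a_mask(addrlo), r1 (plain mov if a_bits >= s_bits)
 *    and    $(TARGET_PAGE_MASK | a_mask), r1
 *    cmp    WHICH(r0), r1
 *    mov    addrlo, r1
 *    jne    slow_path                 (plus a second cmp/jne for addrhi)
 *    add    addend(r0), r1            (r1 = host address)
 */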
1659static inline void tcg_out_tlb_load(TCGContext *s, TCGReg addrlo, TCGReg addrhi,
1660                                    int mem_index, MemOp opc,
1661                                    tcg_insn_unit **label_ptr, int which)
1662{
1663    const TCGReg r0 = TCG_REG_L0;
1664    const TCGReg r1 = TCG_REG_L1;
1665    TCGType ttype = TCG_TYPE_I32;
1666    TCGType tlbtype = TCG_TYPE_I32;
1667    int trexw = 0, hrexw = 0, tlbrexw = 0;
1668    unsigned a_bits = get_alignment_bits(opc);
1669    unsigned s_bits = opc & MO_SIZE;
1670    unsigned a_mask = (1 << a_bits) - 1;
1671    unsigned s_mask = (1 << s_bits) - 1;
1672    target_ulong tlb_mask;
1673
1674    if (TCG_TARGET_REG_BITS == 64) {
1675        if (TARGET_LONG_BITS == 64) {
1676            ttype = TCG_TYPE_I64;
1677            trexw = P_REXW;
1678        }
1679        if (TCG_TYPE_PTR == TCG_TYPE_I64) {
1680            hrexw = P_REXW;
1681            if (TARGET_PAGE_BITS + CPU_TLB_DYN_MAX_BITS > 32) {
1682                tlbtype = TCG_TYPE_I64;
1683                tlbrexw = P_REXW;
1684            }
1685        }
1686    }
1687
1688    tcg_out_mov(s, tlbtype, r0, addrlo);
1689    tcg_out_shifti(s, SHIFT_SHR + tlbrexw, r0,
1690                   TARGET_PAGE_BITS - CPU_TLB_ENTRY_BITS);
1691
1692    tcg_out_modrm_offset(s, OPC_AND_GvEv + trexw, r0, TCG_AREG0,
1693                         TLB_MASK_TABLE_OFS(mem_index) +
1694                         offsetof(CPUTLBDescFast, mask));
1695
1696    tcg_out_modrm_offset(s, OPC_ADD_GvEv + hrexw, r0, TCG_AREG0,
1697                         TLB_MASK_TABLE_OFS(mem_index) +
1698                         offsetof(CPUTLBDescFast, table));
1699
1700    /* If the required alignment is at least as large as the access, simply
1701       copy the address and mask.  For lesser alignments, check that we don't
1702       cross pages for the complete access.  */
1703    if (a_bits >= s_bits) {
1704        tcg_out_mov(s, ttype, r1, addrlo);
1705    } else {
1706        tcg_out_modrm_offset(s, OPC_LEA + trexw, r1, addrlo, s_mask - a_mask);
1707    }
1708    tlb_mask = (target_ulong)TARGET_PAGE_MASK | a_mask;
1709    tgen_arithi(s, ARITH_AND + trexw, r1, tlb_mask, 0);
1710
1711    /* cmp 0(r0), r1 */
1712    tcg_out_modrm_offset(s, OPC_CMP_GvEv + trexw, r1, r0, which);
1713
1714    /* Prepare for both the fast path add of the tlb addend, and the slow
1715       path function argument setup.  */
1716    tcg_out_mov(s, ttype, r1, addrlo);
1717
1718    /* jne slow_path */
1719    tcg_out_opc(s, OPC_JCC_long + JCC_JNE, 0, 0, 0);
1720    label_ptr[0] = s->code_ptr;
1721    s->code_ptr += 4;
1722
1723    if (TARGET_LONG_BITS > TCG_TARGET_REG_BITS) {
1724        /* cmp 4(r0), addrhi */
1725        tcg_out_modrm_offset(s, OPC_CMP_GvEv, addrhi, r0, which + 4);
1726
1727        /* jne slow_path */
1728        tcg_out_opc(s, OPC_JCC_long + JCC_JNE, 0, 0, 0);
1729        label_ptr[1] = s->code_ptr;
1730        s->code_ptr += 4;
1731    }
1732
1733    /* TLB Hit.  */
1734
1735    /* add addend(r0), r1 */
1736    tcg_out_modrm_offset(s, OPC_ADD_GvEv + hrexw, r1, r0,
1737                         offsetof(CPUTLBEntry, addend));
1738}
1739
1740/*
1741 * Record the context of a call to the out of line helper code for the slow path
1742 * for a load or store, so that we can later generate the correct helper code
1743 */
1744static void add_qemu_ldst_label(TCGContext *s, bool is_ld, bool is_64,
1745                                MemOpIdx oi,
1746                                TCGReg datalo, TCGReg datahi,
1747                                TCGReg addrlo, TCGReg addrhi,
1748                                tcg_insn_unit *raddr,
1749                                tcg_insn_unit **label_ptr)
1750{
1751    TCGLabelQemuLdst *label = new_ldst_label(s);
1752
1753    label->is_ld = is_ld;
1754    label->oi = oi;
1755    label->type = is_64 ? TCG_TYPE_I64 : TCG_TYPE_I32;
1756    label->datalo_reg = datalo;
1757    label->datahi_reg = datahi;
1758    label->addrlo_reg = addrlo;
1759    label->addrhi_reg = addrhi;
1760    label->raddr = tcg_splitwx_to_rx(raddr);
1761    label->label_ptr[0] = label_ptr[0];
1762    if (TARGET_LONG_BITS > TCG_TARGET_REG_BITS) {
1763        label->label_ptr[1] = label_ptr[1];
1764    }
1765}
1766
1767/*
1768 * Generate code for the slow path for a load at the end of block
1769 */
1770static bool tcg_out_qemu_ld_slow_path(TCGContext *s, TCGLabelQemuLdst *l)
1771{
1772    MemOpIdx oi = l->oi;
1773    MemOp opc = get_memop(oi);
1774    TCGReg data_reg;
1775    tcg_insn_unit **label_ptr = &l->label_ptr[0];
1776    int rexw = (l->type == TCG_TYPE_I64 ? P_REXW : 0);
1777
1778    /* resolve label address */
1779    tcg_patch32(label_ptr[0], s->code_ptr - label_ptr[0] - 4);
1780    if (TARGET_LONG_BITS > TCG_TARGET_REG_BITS) {
1781        tcg_patch32(label_ptr[1], s->code_ptr - label_ptr[1] - 4);
1782    }
1783
1784    if (TCG_TARGET_REG_BITS == 32) {
1785        int ofs = 0;
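        /* On i386 all helper arguments are passed on the stack:
           env, addrlo[, addrhi], oi and the return address, at
           increasing offsets from %esp.  */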
1786
1787        tcg_out_st(s, TCG_TYPE_PTR, TCG_AREG0, TCG_REG_ESP, ofs);
1788        ofs += 4;
1789
1790        tcg_out_st(s, TCG_TYPE_I32, l->addrlo_reg, TCG_REG_ESP, ofs);
1791        ofs += 4;
1792
1793        if (TARGET_LONG_BITS == 64) {
1794            tcg_out_st(s, TCG_TYPE_I32, l->addrhi_reg, TCG_REG_ESP, ofs);
1795            ofs += 4;
1796        }
1797
1798        tcg_out_sti(s, TCG_TYPE_I32, oi, TCG_REG_ESP, ofs);
1799        ofs += 4;
1800
1801        tcg_out_sti(s, TCG_TYPE_PTR, (uintptr_t)l->raddr, TCG_REG_ESP, ofs);
1802    } else {
1803        tcg_out_mov(s, TCG_TYPE_PTR, tcg_target_call_iarg_regs[0], TCG_AREG0);
1804        /* The second argument is already loaded with addrlo.  */
1805        tcg_out_movi(s, TCG_TYPE_I32, tcg_target_call_iarg_regs[2], oi);
1806        tcg_out_movi(s, TCG_TYPE_PTR, tcg_target_call_iarg_regs[3],
1807                     (uintptr_t)l->raddr);
1808    }
1809
1810    tcg_out_call(s, qemu_ld_helpers[opc & (MO_BSWAP | MO_SIZE)]);
1811
1812    data_reg = l->datalo_reg;
1813    switch (opc & MO_SSIZE) {
1814    case MO_SB:
1815        tcg_out_ext8s(s, data_reg, TCG_REG_EAX, rexw);
1816        break;
1817    case MO_SW:
1818        tcg_out_ext16s(s, data_reg, TCG_REG_EAX, rexw);
1819        break;
1820#if TCG_TARGET_REG_BITS == 64
1821    case MO_SL:
1822        tcg_out_ext32s(s, data_reg, TCG_REG_EAX);
1823        break;
1824#endif
1825    case MO_UB:
1826    case MO_UW:
1827        /* Note that the helpers have zero-extended to tcg_target_long.  */
1828    case MO_UL:
1829        tcg_out_mov(s, TCG_TYPE_I32, data_reg, TCG_REG_EAX);
1830        break;
1831    case MO_UQ:
1832        if (TCG_TARGET_REG_BITS == 64) {
1833            tcg_out_mov(s, TCG_TYPE_I64, data_reg, TCG_REG_RAX);
1834        } else if (data_reg == TCG_REG_EDX) {
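            /* The 64-bit result is returned in EDX:EAX.  When datalo is
               EDX, exchange the two halves first so that the high part is
               not overwritten before it is copied to datahi.  */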
1835            /* xchg %edx, %eax */
1836            tcg_out_opc(s, OPC_XCHG_ax_r32 + TCG_REG_EDX, 0, 0, 0);
1837            tcg_out_mov(s, TCG_TYPE_I32, l->datahi_reg, TCG_REG_EAX);
1838        } else {
1839            tcg_out_mov(s, TCG_TYPE_I32, data_reg, TCG_REG_EAX);
1840            tcg_out_mov(s, TCG_TYPE_I32, l->datahi_reg, TCG_REG_EDX);
1841        }
1842        break;
1843    default:
1844        tcg_abort();
1845    }
1846
1847    /* Jump to the code corresponding to the next IR of qemu_ld */
1848    tcg_out_jmp(s, l->raddr);
1849    return true;
1850}
1851
1852/*
1853 * Generate code for the slow path for a store at the end of block
1854 */
1855static bool tcg_out_qemu_st_slow_path(TCGContext *s, TCGLabelQemuLdst *l)
1856{
1857    MemOpIdx oi = l->oi;
1858    MemOp opc = get_memop(oi);
1859    MemOp s_bits = opc & MO_SIZE;
1860    tcg_insn_unit **label_ptr = &l->label_ptr[0];
1861    TCGReg retaddr;
1862
1863    /* resolve label address */
1864    tcg_patch32(label_ptr[0], s->code_ptr - label_ptr[0] - 4);
1865    if (TARGET_LONG_BITS > TCG_TARGET_REG_BITS) {
1866        tcg_patch32(label_ptr[1], s->code_ptr - label_ptr[1] - 4);
1867    }
1868
1869    if (TCG_TARGET_REG_BITS == 32) {
1870        int ofs = 0;
1871
1872        tcg_out_st(s, TCG_TYPE_PTR, TCG_AREG0, TCG_REG_ESP, ofs);
1873        ofs += 4;
1874
1875        tcg_out_st(s, TCG_TYPE_I32, l->addrlo_reg, TCG_REG_ESP, ofs);
1876        ofs += 4;
1877
1878        if (TARGET_LONG_BITS == 64) {
1879            tcg_out_st(s, TCG_TYPE_I32, l->addrhi_reg, TCG_REG_ESP, ofs);
1880            ofs += 4;
1881        }
1882
1883        tcg_out_st(s, TCG_TYPE_I32, l->datalo_reg, TCG_REG_ESP, ofs);
1884        ofs += 4;
1885
1886        if (s_bits == MO_64) {
1887            tcg_out_st(s, TCG_TYPE_I32, l->datahi_reg, TCG_REG_ESP, ofs);
1888            ofs += 4;
1889        }
1890
1891        tcg_out_sti(s, TCG_TYPE_I32, oi, TCG_REG_ESP, ofs);
1892        ofs += 4;
1893
1894        retaddr = TCG_REG_EAX;
1895        tcg_out_movi(s, TCG_TYPE_PTR, retaddr, (uintptr_t)l->raddr);
1896        tcg_out_st(s, TCG_TYPE_PTR, retaddr, TCG_REG_ESP, ofs);
1897    } else {
1898        tcg_out_mov(s, TCG_TYPE_PTR, tcg_target_call_iarg_regs[0], TCG_AREG0);
1899        /* The second argument is already loaded with addrlo.  */
1900        tcg_out_mov(s, (s_bits == MO_64 ? TCG_TYPE_I64 : TCG_TYPE_I32),
1901                    tcg_target_call_iarg_regs[2], l->datalo_reg);
1902        tcg_out_movi(s, TCG_TYPE_I32, tcg_target_call_iarg_regs[3], oi);
1903
1904        if (ARRAY_SIZE(tcg_target_call_iarg_regs) > 4) {
1905            retaddr = tcg_target_call_iarg_regs[4];
1906            tcg_out_movi(s, TCG_TYPE_PTR, retaddr, (uintptr_t)l->raddr);
1907        } else {
1908            retaddr = TCG_REG_RAX;
1909            tcg_out_movi(s, TCG_TYPE_PTR, retaddr, (uintptr_t)l->raddr);
1910            tcg_out_st(s, TCG_TYPE_PTR, retaddr, TCG_REG_ESP,
1911                       TCG_TARGET_CALL_STACK_OFFSET);
1912        }
1913    }
1914
1915    /* "Tail call" to the helper, with the return address back inline.  */
1916    tcg_out_push(s, retaddr);
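    /* The helper's own return then goes to l->raddr, resuming the
       generated code just past the store's fast path.  */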
1917    tcg_out_jmp(s, qemu_st_helpers[opc & (MO_BSWAP | MO_SIZE)]);
1918    return true;
1919}
1920#else
1921
1922static void tcg_out_test_alignment(TCGContext *s, bool is_ld, TCGReg addrlo,
1923                                   TCGReg addrhi, unsigned a_bits)
1924{
1925    unsigned a_mask = (1 << a_bits) - 1;
1926    TCGLabelQemuLdst *label;
1927
1928    /*
1929     * We are expecting a_bits to max out at 7, so we can usually use testb.
1930     * For i686, we have to use testl for %esi/%edi.
1931     */
1932    if (a_mask <= 0xff && (TCG_TARGET_REG_BITS == 64 || addrlo < 4)) {
1933        tcg_out_modrm(s, OPC_GRP3_Eb | P_REXB_RM, EXT3_TESTi, addrlo);
1934        tcg_out8(s, a_mask);
1935    } else {
1936        tcg_out_modrm(s, OPC_GRP3_Ev, EXT3_TESTi, addrlo);
1937        tcg_out32(s, a_mask);
1938    }
1939
1940    /* jne slow_path */
1941    tcg_out_opc(s, OPC_JCC_long + JCC_JNE, 0, 0, 0);
1942
1943    label = new_ldst_label(s);
1944    label->is_ld = is_ld;
1945    label->addrlo_reg = addrlo;
1946    label->addrhi_reg = addrhi;
1947    label->raddr = tcg_splitwx_to_rx(s->code_ptr + 4);
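    /* raddr points just past the 4-byte displacement reserved below,
       i.e. back to the inline fast-path code.  */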
1948    label->label_ptr[0] = s->code_ptr;
1949
1950    s->code_ptr += 4;
1951}
1952
1953static bool tcg_out_fail_alignment(TCGContext *s, TCGLabelQemuLdst *l)
1954{
1955    /* resolve label address */
1956    tcg_patch32(l->label_ptr[0], s->code_ptr - l->label_ptr[0] - 4);
1957
1958    if (TCG_TARGET_REG_BITS == 32) {
1959        int ofs = 0;
1960
1961        tcg_out_st(s, TCG_TYPE_PTR, TCG_AREG0, TCG_REG_ESP, ofs);
1962        ofs += 4;
1963
1964        tcg_out_st(s, TCG_TYPE_I32, l->addrlo_reg, TCG_REG_ESP, ofs);
1965        ofs += 4;
1966        if (TARGET_LONG_BITS == 64) {
1967            tcg_out_st(s, TCG_TYPE_I32, l->addrhi_reg, TCG_REG_ESP, ofs);
1968            ofs += 4;
1969        }
1970
1971        tcg_out_pushi(s, (uintptr_t)l->raddr);
1972    } else {
1973        tcg_out_mov(s, TCG_TYPE_TL, tcg_target_call_iarg_regs[1],
1974                    l->addrlo_reg);
1975        tcg_out_mov(s, TCG_TYPE_PTR, tcg_target_call_iarg_regs[0], TCG_AREG0);
1976
1977        tcg_out_movi(s, TCG_TYPE_PTR, TCG_REG_RAX, (uintptr_t)l->raddr);
1978        tcg_out_push(s, TCG_REG_RAX);
1979    }
1980
1981    /* "Tail call" to the helper, with the return address back inline. */
1982    tcg_out_jmp(s, (const void *)(l->is_ld ? helper_unaligned_ld
1983                                  : helper_unaligned_st));
1984    return true;
1985}
1986
1987static bool tcg_out_qemu_ld_slow_path(TCGContext *s, TCGLabelQemuLdst *l)
1988{
1989    return tcg_out_fail_alignment(s, l);
1990}
1991
1992static bool tcg_out_qemu_st_slow_path(TCGContext *s, TCGLabelQemuLdst *l)
1993{
1994    return tcg_out_fail_alignment(s, l);
1995}
1996
1997#if TCG_TARGET_REG_BITS == 32
1998# define x86_guest_base_seg     0
1999# define x86_guest_base_index   -1
2000# define x86_guest_base_offset  guest_base
2001#else
2002static int x86_guest_base_seg;
2003static int x86_guest_base_index = -1;
2004static int32_t x86_guest_base_offset;
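/*
 * If the host lets us put guest_base into a segment base register
 * (ARCH_SET_GS on Linux, AMD64_SET_GSBASE on FreeBSD), user-only guest
 * accesses are emitted with a %gs override instead of materializing
 * guest_base as an index register or displacement.
 */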
2005# if defined(__x86_64__) && defined(__linux__)
2006#  include <asm/prctl.h>
2007#  include <sys/prctl.h>
2008int arch_prctl(int code, unsigned long addr);
2009static inline int setup_guest_base_seg(void)
2010{
2011    if (arch_prctl(ARCH_SET_GS, guest_base) == 0) {
2012        return P_GS;
2013    }
2014    return 0;
2015}
2016# elif defined (__FreeBSD__) || defined (__FreeBSD_kernel__)
2017#  include <machine/sysarch.h>
2018static inline int setup_guest_base_seg(void)
2019{
2020    if (sysarch(AMD64_SET_GSBASE, &guest_base) == 0) {
2021        return P_GS;
2022    }
2023    return 0;
2024}
2025# else
2026static inline int setup_guest_base_seg(void)
2027{
2028    return 0;
2029}
2030# endif
2031#endif
2032#endif /* SOFTMMU */
2033
2034static void tcg_out_qemu_ld_direct(TCGContext *s, TCGReg datalo, TCGReg datahi,
2035                                   TCGReg base, int index, intptr_t ofs,
2036                                   int seg, bool is64, MemOp memop)
2037{
2038    bool use_movbe = false;
2039    int rexw = is64 * P_REXW;
2040    int movop = OPC_MOVL_GvEv;
2041
2042    /* Do big-endian loads with movbe.  */
2043    if (memop & MO_BSWAP) {
2044        tcg_debug_assert(have_movbe);
2045        use_movbe = true;
2046        movop = OPC_MOVBE_GyMy;
2047    }
2048
2049    switch (memop & MO_SSIZE) {
2050    case MO_UB:
2051        tcg_out_modrm_sib_offset(s, OPC_MOVZBL + seg, datalo,
2052                                 base, index, 0, ofs);
2053        break;
2054    case MO_SB:
2055        tcg_out_modrm_sib_offset(s, OPC_MOVSBL + rexw + seg, datalo,
2056                                 base, index, 0, ofs);
2057        break;
2058    case MO_UW:
2059        if (use_movbe) {
2060            /* There is no extending movbe; only the low 16 bits are modified.  */
2061            if (datalo != base && datalo != index) {
2062                /* XOR breaks dependency chains.  */
2063                tgen_arithr(s, ARITH_XOR, datalo, datalo);
2064                tcg_out_modrm_sib_offset(s, OPC_MOVBE_GyMy + P_DATA16 + seg,
2065                                         datalo, base, index, 0, ofs);
2066            } else {
2067                tcg_out_modrm_sib_offset(s, OPC_MOVBE_GyMy + P_DATA16 + seg,
2068                                         datalo, base, index, 0, ofs);
2069                tcg_out_ext16u(s, datalo, datalo);
2070            }
2071        } else {
2072            tcg_out_modrm_sib_offset(s, OPC_MOVZWL + seg, datalo,
2073                                     base, index, 0, ofs);
2074        }
2075        break;
2076    case MO_SW:
2077        if (use_movbe) {
2078            tcg_out_modrm_sib_offset(s, OPC_MOVBE_GyMy + P_DATA16 + seg,
2079                                     datalo, base, index, 0, ofs);
2080            tcg_out_ext16s(s, datalo, datalo, rexw);
2081        } else {
2082            tcg_out_modrm_sib_offset(s, OPC_MOVSWL + rexw + seg,
2083                                     datalo, base, index, 0, ofs);
2084        }
2085        break;
2086    case MO_UL:
2087        tcg_out_modrm_sib_offset(s, movop + seg, datalo, base, index, 0, ofs);
2088        break;
2089#if TCG_TARGET_REG_BITS == 64
2090    case MO_SL:
2091        if (use_movbe) {
2092            tcg_out_modrm_sib_offset(s, OPC_MOVBE_GyMy + seg, datalo,
2093                                     base, index, 0, ofs);
2094            tcg_out_ext32s(s, datalo, datalo);
2095        } else {
2096            tcg_out_modrm_sib_offset(s, OPC_MOVSLQ + seg, datalo,
2097                                     base, index, 0, ofs);
2098        }
2099        break;
2100#endif
2101    case MO_UQ:
2102        if (TCG_TARGET_REG_BITS == 64) {
2103            tcg_out_modrm_sib_offset(s, movop + P_REXW + seg, datalo,
2104                                     base, index, 0, ofs);
2105        } else {
2106            if (use_movbe) {
2107                TCGReg t = datalo;
2108                datalo = datahi;
2109                datahi = t;
2110            }
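            /* Order the two 32-bit loads so that the base register is not
               overwritten while the second load still needs it.  */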
2111            if (base != datalo) {
2112                tcg_out_modrm_sib_offset(s, movop + seg, datalo,
2113                                         base, index, 0, ofs);
2114                tcg_out_modrm_sib_offset(s, movop + seg, datahi,
2115                                         base, index, 0, ofs + 4);
2116            } else {
2117                tcg_out_modrm_sib_offset(s, movop + seg, datahi,
2118                                         base, index, 0, ofs + 4);
2119                tcg_out_modrm_sib_offset(s, movop + seg, datalo,
2120                                         base, index, 0, ofs);
2121            }
2122        }
2123        break;
2124    default:
2125        g_assert_not_reached();
2126    }
2127}
2128
2129/* XXX: qemu_ld and qemu_st could be modified to clobber only EDX and
2130   EAX.  It will be useful once fixed-register globals are less
2131   common. */
2132static void tcg_out_qemu_ld(TCGContext *s, const TCGArg *args, bool is64)
2133{
2134    TCGReg datalo, datahi, addrlo;
2135    TCGReg addrhi __attribute__((unused));
2136    MemOpIdx oi;
2137    MemOp opc;
2138#if defined(CONFIG_SOFTMMU)
2139    int mem_index;
2140    tcg_insn_unit *label_ptr[2];
2141#else
2142    unsigned a_bits;
2143#endif
2144
2145    datalo = *args++;
2146    datahi = (TCG_TARGET_REG_BITS == 32 && is64 ? *args++ : 0);
2147    addrlo = *args++;
2148    addrhi = (TARGET_LONG_BITS > TCG_TARGET_REG_BITS ? *args++ : 0);
2149    oi = *args++;
2150    opc = get_memop(oi);
2151
2152#if defined(CONFIG_SOFTMMU)
2153    mem_index = get_mmuidx(oi);
2154
2155    tcg_out_tlb_load(s, addrlo, addrhi, mem_index, opc,
2156                     label_ptr, offsetof(CPUTLBEntry, addr_read));
2157
2158    /* TLB Hit.  */
2159    tcg_out_qemu_ld_direct(s, datalo, datahi, TCG_REG_L1, -1, 0, 0, is64, opc);
2160
2161    /* Record the current context of a load into ldst label */
2162    add_qemu_ldst_label(s, true, is64, oi, datalo, datahi, addrlo, addrhi,
2163                        s->code_ptr, label_ptr);
2164#else
2165    a_bits = get_alignment_bits(opc);
2166    if (a_bits) {
2167        tcg_out_test_alignment(s, true, addrlo, addrhi, a_bits);
2168    }
2169
2170    tcg_out_qemu_ld_direct(s, datalo, datahi, addrlo, x86_guest_base_index,
2171                           x86_guest_base_offset, x86_guest_base_seg,
2172                           is64, opc);
2173#endif
2174}
2175
2176static void tcg_out_qemu_st_direct(TCGContext *s, TCGReg datalo, TCGReg datahi,
2177                                   TCGReg base, int index, intptr_t ofs,
2178                                   int seg, MemOp memop)
2179{
2180    bool use_movbe = false;
2181    int movop = OPC_MOVL_EvGv;
2182
2183    /*
2184     * Do big-endian stores with movbe or softmmu.
2185     * User-only without movbe will have its swapping done generically.
2186     */
2187    if (memop & MO_BSWAP) {
2188        tcg_debug_assert(have_movbe);
2189        use_movbe = true;
2190        movop = OPC_MOVBE_MyGy;
2191    }
2192
2193    switch (memop & MO_SIZE) {
2194    case MO_8:
2195        /* This is handled with constraints on INDEX_op_qemu_st8_i32. */
2196        tcg_debug_assert(TCG_TARGET_REG_BITS == 64 || datalo < 4);
2197        tcg_out_modrm_sib_offset(s, OPC_MOVB_EvGv + P_REXB_R + seg,
2198                                 datalo, base, index, 0, ofs);
2199        break;
2200    case MO_16:
2201        tcg_out_modrm_sib_offset(s, movop + P_DATA16 + seg, datalo,
2202                                 base, index, 0, ofs);
2203        break;
2204    case MO_32:
2205        tcg_out_modrm_sib_offset(s, movop + seg, datalo, base, index, 0, ofs);
2206        break;
2207    case MO_64:
2208        if (TCG_TARGET_REG_BITS == 64) {
2209            tcg_out_modrm_sib_offset(s, movop + P_REXW + seg, datalo,
2210                                     base, index, 0, ofs);
2211        } else {
2212            if (use_movbe) {
2213                TCGReg t = datalo;
2214                datalo = datahi;
2215                datahi = t;
2216            }
2217            tcg_out_modrm_sib_offset(s, movop + seg, datalo,
2218                                     base, index, 0, ofs);
2219            tcg_out_modrm_sib_offset(s, movop + seg, datahi,
2220                                     base, index, 0, ofs + 4);
2221        }
2222        break;
2223    default:
2224        g_assert_not_reached();
2225    }
2226}
2227
2228static void tcg_out_qemu_st(TCGContext *s, const TCGArg *args, bool is64)
2229{
2230    TCGReg datalo, datahi, addrlo;
2231    TCGReg addrhi __attribute__((unused));
2232    MemOpIdx oi;
2233    MemOp opc;
2234#if defined(CONFIG_SOFTMMU)
2235    int mem_index;
2236    tcg_insn_unit *label_ptr[2];
2237#else
2238    unsigned a_bits;
2239#endif
2240
2241    datalo = *args++;
2242    datahi = (TCG_TARGET_REG_BITS == 32 && is64 ? *args++ : 0);
2243    addrlo = *args++;
2244    addrhi = (TARGET_LONG_BITS > TCG_TARGET_REG_BITS ? *args++ : 0);
2245    oi = *args++;
2246    opc = get_memop(oi);
2247
2248#if defined(CONFIG_SOFTMMU)
2249    mem_index = get_mmuidx(oi);
2250
2251    tcg_out_tlb_load(s, addrlo, addrhi, mem_index, opc,
2252                     label_ptr, offsetof(CPUTLBEntry, addr_write));
2253
2254    /* TLB Hit.  */
2255    tcg_out_qemu_st_direct(s, datalo, datahi, TCG_REG_L1, -1, 0, 0, opc);
2256
2257    /* Record the current context of a store into ldst label */
2258    add_qemu_ldst_label(s, false, is64, oi, datalo, datahi, addrlo, addrhi,
2259                        s->code_ptr, label_ptr);
2260#else
2261    a_bits = get_alignment_bits(opc);
2262    if (a_bits) {
2263        tcg_out_test_alignment(s, false, addrlo, addrhi, a_bits);
2264    }
2265
2266    tcg_out_qemu_st_direct(s, datalo, datahi, addrlo, x86_guest_base_index,
2267                           x86_guest_base_offset, x86_guest_base_seg, opc);
2268#endif
2269}
2270
2271static inline void tcg_out_op(TCGContext *s, TCGOpcode opc,
2272                              const TCGArg args[TCG_MAX_OP_ARGS],
2273                              const int const_args[TCG_MAX_OP_ARGS])
2274{
2275    TCGArg a0, a1, a2;
2276    int c, const_a2, vexop, rexw = 0;
2277
2278#if TCG_TARGET_REG_BITS == 64
2279# define OP_32_64(x) \
2280        case glue(glue(INDEX_op_, x), _i64): \
2281            rexw = P_REXW; /* FALLTHRU */    \
2282        case glue(glue(INDEX_op_, x), _i32)
2283#else
2284# define OP_32_64(x) \
2285        case glue(glue(INDEX_op_, x), _i32)
2286#endif
2287
2288    /* Hoist the loads of the most common arguments.  */
2289    a0 = args[0];
2290    a1 = args[1];
2291    a2 = args[2];
2292    const_a2 = const_args[2];
2293
2294    switch (opc) {
2295    case INDEX_op_exit_tb:
2296        /* Reuse the zeroing that exists for goto_ptr.  */
2297        if (a0 == 0) {
2298            tcg_out_jmp(s, tcg_code_gen_epilogue);
2299        } else {
2300            tcg_out_movi(s, TCG_TYPE_PTR, TCG_REG_EAX, a0);
2301            tcg_out_jmp(s, tb_ret_addr);
2302        }
2303        break;
2304    case INDEX_op_goto_tb:
2305        if (s->tb_jmp_insn_offset) {
2306            /* direct jump method */
2307            int gap;
2308            /* jump displacement must be aligned for atomic patching;
2309             * see if we need to add extra nops before jump
2310             */
2311            gap = QEMU_ALIGN_PTR_UP(s->code_ptr + 1, 4) - s->code_ptr;
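            /* gap is between 1 and 4; the displacement starts one byte
               after the JMP opcode, so emit gap - 1 nop bytes to make
               that displacement 4-byte aligned.  */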
2312            if (gap != 1) {
2313                tcg_out_nopn(s, gap - 1);
2314            }
2315            tcg_out8(s, OPC_JMP_long); /* jmp im */
2316            s->tb_jmp_insn_offset[a0] = tcg_current_code_size(s);
2317            tcg_out32(s, 0);
2318        } else {
2319            /* indirect jump method */
2320            tcg_out_modrm_offset(s, OPC_GRP5, EXT5_JMPN_Ev, -1,
2321                                 (intptr_t)(s->tb_jmp_target_addr + a0));
2322        }
2323        set_jmp_reset_offset(s, a0);
2324        break;
2325    case INDEX_op_goto_ptr:
2326        /* jmp to the given host address (could be epilogue) */
2327        tcg_out_modrm(s, OPC_GRP5, EXT5_JMPN_Ev, a0);
2328        break;
2329    case INDEX_op_br:
2330        tcg_out_jxx(s, JCC_JMP, arg_label(a0), 0);
2331        break;
2332    OP_32_64(ld8u):
2333        /* Note that we can ignore REXW for the zero-extend to 64-bit.  */
2334        tcg_out_modrm_offset(s, OPC_MOVZBL, a0, a1, a2);
2335        break;
2336    OP_32_64(ld8s):
2337        tcg_out_modrm_offset(s, OPC_MOVSBL + rexw, a0, a1, a2);
2338        break;
2339    OP_32_64(ld16u):
2340        /* Note that we can ignore REXW for the zero-extend to 64-bit.  */
2341        tcg_out_modrm_offset(s, OPC_MOVZWL, a0, a1, a2);
2342        break;
2343    OP_32_64(ld16s):
2344        tcg_out_modrm_offset(s, OPC_MOVSWL + rexw, a0, a1, a2);
2345        break;
2346#if TCG_TARGET_REG_BITS == 64
2347    case INDEX_op_ld32u_i64:
2348#endif
2349    case INDEX_op_ld_i32:
2350        tcg_out_ld(s, TCG_TYPE_I32, a0, a1, a2);
2351        break;
2352
2353    OP_32_64(st8):
2354        if (const_args[0]) {
2355            tcg_out_modrm_offset(s, OPC_MOVB_EvIz, 0, a1, a2);
2356            tcg_out8(s, a0);
2357        } else {
2358            tcg_out_modrm_offset(s, OPC_MOVB_EvGv | P_REXB_R, a0, a1, a2);
2359        }
2360        break;
2361    OP_32_64(st16):
2362        if (const_args[0]) {
2363            tcg_out_modrm_offset(s, OPC_MOVL_EvIz | P_DATA16, 0, a1, a2);
2364            tcg_out16(s, a0);
2365        } else {
2366            tcg_out_modrm_offset(s, OPC_MOVL_EvGv | P_DATA16, a0, a1, a2);
2367        }
2368        break;
2369#if TCG_TARGET_REG_BITS == 64
2370    case INDEX_op_st32_i64:
2371#endif
2372    case INDEX_op_st_i32:
2373        if (const_args[0]) {
2374            tcg_out_modrm_offset(s, OPC_MOVL_EvIz, 0, a1, a2);
2375            tcg_out32(s, a0);
2376        } else {
2377            tcg_out_st(s, TCG_TYPE_I32, a0, a1, a2);
2378        }
2379        break;
2380
2381    OP_32_64(add):
2382        /* For 3-operand addition, use LEA.  */
2383        if (a0 != a1) {
2384            TCGArg c3 = 0;
2385            if (const_a2) {
2386                c3 = a2, a2 = -1;
2387            } else if (a0 == a2) {
2388                /* Watch out for dest = src + dest, since we've removed
2389                   the matching constraint on the add.  */
2390                tgen_arithr(s, ARITH_ADD + rexw, a0, a1);
2391                break;
2392            }
2393
2394            tcg_out_modrm_sib_offset(s, OPC_LEA + rexw, a0, a1, a2, 0, c3);
2395            break;
2396        }
2397        c = ARITH_ADD;
2398        goto gen_arith;
2399    OP_32_64(sub):
2400        c = ARITH_SUB;
2401        goto gen_arith;
2402    OP_32_64(and):
2403        c = ARITH_AND;
2404        goto gen_arith;
2405    OP_32_64(or):
2406        c = ARITH_OR;
2407        goto gen_arith;
2408    OP_32_64(xor):
2409        c = ARITH_XOR;
2410        goto gen_arith;
2411    gen_arith:
2412        if (const_a2) {
2413            tgen_arithi(s, c + rexw, a0, a2, 0);
2414        } else {
2415            tgen_arithr(s, c + rexw, a0, a2);
2416        }
2417        break;
2418
2419    OP_32_64(andc):
2420        if (const_a2) {
2421            tcg_out_mov(s, rexw ? TCG_TYPE_I64 : TCG_TYPE_I32, a0, a1);
2422            tgen_arithi(s, ARITH_AND + rexw, a0, ~a2, 0);
2423        } else {
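            /* BMI1 ANDN computes dest = ~src1 & src2, so pass the
               operands swapped to obtain a1 & ~a2.  */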
2424            tcg_out_vex_modrm(s, OPC_ANDN + rexw, a0, a2, a1);
2425        }
2426        break;
2427
2428    OP_32_64(mul):
2429        if (const_a2) {
2430            int32_t val;
2431            val = a2;
2432            if (val == (int8_t)val) {
2433                tcg_out_modrm(s, OPC_IMUL_GvEvIb + rexw, a0, a0);
2434                tcg_out8(s, val);
2435            } else {
2436                tcg_out_modrm(s, OPC_IMUL_GvEvIz + rexw, a0, a0);
2437                tcg_out32(s, val);
2438            }
2439        } else {
2440            tcg_out_modrm(s, OPC_IMUL_GvEv + rexw, a0, a2);
2441        }
2442        break;
2443
2444    OP_32_64(div2):
2445        tcg_out_modrm(s, OPC_GRP3_Ev + rexw, EXT3_IDIV, args[4]);
2446        break;
2447    OP_32_64(divu2):
2448        tcg_out_modrm(s, OPC_GRP3_Ev + rexw, EXT3_DIV, args[4]);
2449        break;
2450
2451    OP_32_64(shl):
2452        /* For small constant 3-operand shift, use LEA.  */
2453        if (const_a2 && a0 != a1 && (a2 - 1) < 3) {
2454            if (a2 - 1 == 0) {
2455                /* shl $1,a1,a0 -> lea (a1,a1),a0 */
2456                tcg_out_modrm_sib_offset(s, OPC_LEA + rexw, a0, a1, a1, 0, 0);
2457            } else {
2458                /* shl $n,a1,a0 -> lea 0(,a1,n),a0 */
2459                tcg_out_modrm_sib_offset(s, OPC_LEA + rexw, a0, -1, a1, a2, 0);
2460            }
2461            break;
2462        }
2463        c = SHIFT_SHL;
2464        vexop = OPC_SHLX;
2465        goto gen_shift_maybe_vex;
2466    OP_32_64(shr):
2467        c = SHIFT_SHR;
2468        vexop = OPC_SHRX;
2469        goto gen_shift_maybe_vex;
2470    OP_32_64(sar):
2471        c = SHIFT_SAR;
2472        vexop = OPC_SARX;
2473        goto gen_shift_maybe_vex;
2474    OP_32_64(rotl):
2475        c = SHIFT_ROL;
2476        goto gen_shift;
2477    OP_32_64(rotr):
2478        c = SHIFT_ROR;
2479        goto gen_shift;
2480    gen_shift_maybe_vex:
2481        if (have_bmi2) {
2482            if (!const_a2) {
2483                tcg_out_vex_modrm(s, vexop + rexw, a0, a2, a1);
2484                break;
2485            }
2486            tcg_out_mov(s, rexw ? TCG_TYPE_I64 : TCG_TYPE_I32, a0, a1);
2487        }
2488        /* FALLTHRU */
2489    gen_shift:
2490        if (const_a2) {
2491            tcg_out_shifti(s, c + rexw, a0, a2);
2492        } else {
2493            tcg_out_modrm(s, OPC_SHIFT_cl + rexw, c, a0);
2494        }
2495        break;
2496
2497    OP_32_64(ctz):
2498        tcg_out_ctz(s, rexw, args[0], args[1], args[2], const_args[2]);
2499        break;
2500    OP_32_64(clz):
2501        tcg_out_clz(s, rexw, args[0], args[1], args[2], const_args[2]);
2502        break;
2503    OP_32_64(ctpop):
2504        tcg_out_modrm(s, OPC_POPCNT + rexw, a0, a1);
2505        break;
2506
2507    case INDEX_op_brcond_i32:
2508        tcg_out_brcond32(s, a2, a0, a1, const_args[1], arg_label(args[3]), 0);
2509        break;
2510    case INDEX_op_setcond_i32:
2511        tcg_out_setcond32(s, args[3], a0, a1, a2, const_a2);
2512        break;
2513    case INDEX_op_movcond_i32:
2514        tcg_out_movcond32(s, args[5], a0, a1, a2, const_a2, args[3]);
2515        break;
2516
2517    OP_32_64(bswap16):
2518        if (a2 & TCG_BSWAP_OS) {
2519            /* Output must be sign-extended. */
2520            if (rexw) {
2521                tcg_out_bswap64(s, a0);
2522                tcg_out_shifti(s, SHIFT_SAR + rexw, a0, 48);
2523            } else {
2524                tcg_out_bswap32(s, a0);
2525                tcg_out_shifti(s, SHIFT_SAR, a0, 16);
2526            }
2527        } else if ((a2 & (TCG_BSWAP_IZ | TCG_BSWAP_OZ)) == TCG_BSWAP_OZ) {
2528            /* Output must be zero-extended, but input isn't. */
2529            tcg_out_bswap32(s, a0);
2530            tcg_out_shifti(s, SHIFT_SHR, a0, 16);
2531        } else {
2532            tcg_out_rolw_8(s, a0);
2533        }
2534        break;
2535    OP_32_64(bswap32):
2536        tcg_out_bswap32(s, a0);
2537        if (rexw && (a2 & TCG_BSWAP_OS)) {
2538            tcg_out_ext32s(s, a0, a0);
2539        }
2540        break;
2541
2542    OP_32_64(neg):
2543        tcg_out_modrm(s, OPC_GRP3_Ev + rexw, EXT3_NEG, a0);
2544        break;
2545    OP_32_64(not):
2546        tcg_out_modrm(s, OPC_GRP3_Ev + rexw, EXT3_NOT, a0);
2547        break;
2548
2549    OP_32_64(ext8s):
2550        tcg_out_ext8s(s, a0, a1, rexw);
2551        break;
2552    OP_32_64(ext16s):
2553        tcg_out_ext16s(s, a0, a1, rexw);
2554        break;
2555    OP_32_64(ext8u):
2556        tcg_out_ext8u(s, a0, a1);
2557        break;
2558    OP_32_64(ext16u):
2559        tcg_out_ext16u(s, a0, a1);
2560        break;
2561
2562    case INDEX_op_qemu_ld_i32:
2563        tcg_out_qemu_ld(s, args, 0);
2564        break;
2565    case INDEX_op_qemu_ld_i64:
2566        tcg_out_qemu_ld(s, args, 1);
2567        break;
2568    case INDEX_op_qemu_st_i32:
2569    case INDEX_op_qemu_st8_i32:
2570        tcg_out_qemu_st(s, args, 0);
2571        break;
2572    case INDEX_op_qemu_st_i64:
2573        tcg_out_qemu_st(s, args, 1);
2574        break;
2575
2576    OP_32_64(mulu2):
2577        tcg_out_modrm(s, OPC_GRP3_Ev + rexw, EXT3_MUL, args[3]);
2578        break;
2579    OP_32_64(muls2):
2580        tcg_out_modrm(s, OPC_GRP3_Ev + rexw, EXT3_IMUL, args[3]);
2581        break;
2582    OP_32_64(add2):
2583        if (const_args[4]) {
2584            tgen_arithi(s, ARITH_ADD + rexw, a0, args[4], 1);
2585        } else {
2586            tgen_arithr(s, ARITH_ADD + rexw, a0, args[4]);
2587        }
2588        if (const_args[5]) {
2589            tgen_arithi(s, ARITH_ADC + rexw, a1, args[5], 1);
2590        } else {
2591            tgen_arithr(s, ARITH_ADC + rexw, a1, args[5]);
2592        }
2593        break;
2594    OP_32_64(sub2):
2595        if (const_args[4]) {
2596            tgen_arithi(s, ARITH_SUB + rexw, a0, args[4], 1);
2597        } else {
2598            tgen_arithr(s, ARITH_SUB + rexw, a0, args[4]);
2599        }
2600        if (const_args[5]) {
2601            tgen_arithi(s, ARITH_SBB + rexw, a1, args[5], 1);
2602        } else {
2603            tgen_arithr(s, ARITH_SBB + rexw, a1, args[5]);
2604        }
2605        break;
2606
2607#if TCG_TARGET_REG_BITS == 32
2608    case INDEX_op_brcond2_i32:
2609        tcg_out_brcond2(s, args, const_args, 0);
2610        break;
2611    case INDEX_op_setcond2_i32:
2612        tcg_out_setcond2(s, args, const_args);
2613        break;
2614#else /* TCG_TARGET_REG_BITS == 64 */
2615    case INDEX_op_ld32s_i64:
2616        tcg_out_modrm_offset(s, OPC_MOVSLQ, a0, a1, a2);
2617        break;
2618    case INDEX_op_ld_i64:
2619        tcg_out_ld(s, TCG_TYPE_I64, a0, a1, a2);
2620        break;
2621    case INDEX_op_st_i64:
2622        if (const_args[0]) {
2623            tcg_out_modrm_offset(s, OPC_MOVL_EvIz | P_REXW, 0, a1, a2);
2624            tcg_out32(s, a0);
2625        } else {
2626            tcg_out_st(s, TCG_TYPE_I64, a0, a1, a2);
2627        }
2628        break;
2629
2630    case INDEX_op_brcond_i64:
2631        tcg_out_brcond64(s, a2, a0, a1, const_args[1], arg_label(args[3]), 0);
2632        break;
2633    case INDEX_op_setcond_i64:
2634        tcg_out_setcond64(s, args[3], a0, a1, a2, const_a2);
2635        break;
2636    case INDEX_op_movcond_i64:
2637        tcg_out_movcond64(s, args[5], a0, a1, a2, const_a2, args[3]);
2638        break;
2639
2640    case INDEX_op_bswap64_i64:
2641        tcg_out_bswap64(s, a0);
2642        break;
2643    case INDEX_op_extu_i32_i64:
2644    case INDEX_op_ext32u_i64:
2645    case INDEX_op_extrl_i64_i32:
2646        tcg_out_ext32u(s, a0, a1);
2647        break;
2648    case INDEX_op_ext_i32_i64:
2649    case INDEX_op_ext32s_i64:
2650        tcg_out_ext32s(s, a0, a1);
2651        break;
2652    case INDEX_op_extrh_i64_i32:
2653        tcg_out_shifti(s, SHIFT_SHR + P_REXW, a0, 32);
2654        break;
2655#endif
2656
2657    OP_32_64(deposit):
2658        if (args[3] == 0 && args[4] == 8) {
2659            /* load bits 0..7 */
2660            tcg_out_modrm(s, OPC_MOVB_EvGv | P_REXB_R | P_REXB_RM, a2, a0);
2661        } else if (args[3] == 8 && args[4] == 8) {
2662            /* load bits 8..15 */
2663            tcg_out_modrm(s, OPC_MOVB_EvGv, a2, a0 + 4);
2664        } else if (args[3] == 0 && args[4] == 16) {
2665            /* load bits 0..15 */
2666            tcg_out_modrm(s, OPC_MOVL_EvGv | P_DATA16, a2, a0);
2667        } else {
2668            tcg_abort();
2669        }
2670        break;
2671
2672    case INDEX_op_extract_i64:
2673        if (a2 + args[3] == 32) {
2674            /* This is a 32-bit zero-extending right shift.  */
2675            tcg_out_mov(s, TCG_TYPE_I32, a0, a1);
2676            tcg_out_shifti(s, SHIFT_SHR, a0, a2);
2677            break;
2678        }
2679        /* FALLTHRU */
2680    case INDEX_op_extract_i32:
2681        /* On the off-chance that we can use the high-byte registers, do so.
2682           Otherwise we emit the same ext16 + shift pattern that we
2683           would have gotten from the normal tcg-op.c expansion.  */
2684        tcg_debug_assert(a2 == 8 && args[3] == 8);
2685        if (a1 < 4 && a0 < 8) {
2686            tcg_out_modrm(s, OPC_MOVZBL, a0, a1 + 4);
2687        } else {
2688            tcg_out_ext16u(s, a0, a1);
2689            tcg_out_shifti(s, SHIFT_SHR, a0, 8);
2690        }
2691        break;
2692
2693    case INDEX_op_sextract_i32:
2694        /* We don't implement sextract_i64, as we cannot sign-extend to
2695           64-bits without using the REX prefix that explicitly excludes
2696           access to the high-byte registers.  */
2697        tcg_debug_assert(a2 == 8 && args[3] == 8);
2698        if (a1 < 4 && a0 < 8) {
2699            tcg_out_modrm(s, OPC_MOVSBL, a0, a1 + 4);
2700        } else {
2701            tcg_out_ext16s(s, a0, a1, 0);
2702            tcg_out_shifti(s, SHIFT_SAR, a0, 8);
2703        }
2704        break;
2705
2706    OP_32_64(extract2):
2707        /* Note that SHRD outputs to the r/m operand.  */
2708        tcg_out_modrm(s, OPC_SHRD_Ib + rexw, a2, a0);
2709        tcg_out8(s, args[3]);
2710        break;
2711
2712    case INDEX_op_mb:
2713        tcg_out_mb(s, a0);
2714        break;
2715    case INDEX_op_mov_i32:  /* Always emitted via tcg_out_mov.  */
2716    case INDEX_op_mov_i64:
2717    case INDEX_op_call:     /* Always emitted via tcg_out_call.  */
2718    default:
2719        tcg_abort();
2720    }
2721
2722#undef OP_32_64
2723}
2724
2725static void tcg_out_vec_op(TCGContext *s, TCGOpcode opc,
2726                           unsigned vecl, unsigned vece,
2727                           const TCGArg args[TCG_MAX_OP_ARGS],
2728                           const int const_args[TCG_MAX_OP_ARGS])
2729{
2730    static int const add_insn[4] = {
2731        OPC_PADDB, OPC_PADDW, OPC_PADDD, OPC_PADDQ
2732    };
2733    static int const ssadd_insn[4] = {
2734        OPC_PADDSB, OPC_PADDSW, OPC_UD2, OPC_UD2
2735    };
2736    static int const usadd_insn[4] = {
2737        OPC_PADDUB, OPC_PADDUW, OPC_UD2, OPC_UD2
2738    };
2739    static int const sub_insn[4] = {
2740        OPC_PSUBB, OPC_PSUBW, OPC_PSUBD, OPC_PSUBQ
2741    };
2742    static int const sssub_insn[4] = {
2743        OPC_PSUBSB, OPC_PSUBSW, OPC_UD2, OPC_UD2
2744    };
2745    static int const ussub_insn[4] = {
2746        OPC_PSUBUB, OPC_PSUBUW, OPC_UD2, OPC_UD2
2747    };
2748    static int const mul_insn[4] = {
2749        OPC_UD2, OPC_PMULLW, OPC_PMULLD, OPC_UD2
2750    };
2751    static int const shift_imm_insn[4] = {
2752        OPC_UD2, OPC_PSHIFTW_Ib, OPC_PSHIFTD_Ib, OPC_PSHIFTQ_Ib
2753    };
2754    static int const cmpeq_insn[4] = {
2755        OPC_PCMPEQB, OPC_PCMPEQW, OPC_PCMPEQD, OPC_PCMPEQQ
2756    };
2757    static int const cmpgt_insn[4] = {
2758        OPC_PCMPGTB, OPC_PCMPGTW, OPC_PCMPGTD, OPC_PCMPGTQ
2759    };
2760    static int const punpckl_insn[4] = {
2761        OPC_PUNPCKLBW, OPC_PUNPCKLWD, OPC_PUNPCKLDQ, OPC_PUNPCKLQDQ
2762    };
2763    static int const punpckh_insn[4] = {
2764        OPC_PUNPCKHBW, OPC_PUNPCKHWD, OPC_PUNPCKHDQ, OPC_PUNPCKHQDQ
2765    };
2766    static int const packss_insn[4] = {
2767        OPC_PACKSSWB, OPC_PACKSSDW, OPC_UD2, OPC_UD2
2768    };
2769    static int const packus_insn[4] = {
2770        OPC_PACKUSWB, OPC_PACKUSDW, OPC_UD2, OPC_UD2
2771    };
2772    static int const smin_insn[4] = {
2773        OPC_PMINSB, OPC_PMINSW, OPC_PMINSD, OPC_UD2
2774    };
2775    static int const smax_insn[4] = {
2776        OPC_PMAXSB, OPC_PMAXSW, OPC_PMAXSD, OPC_UD2
2777    };
2778    static int const umin_insn[4] = {
2779        OPC_PMINUB, OPC_PMINUW, OPC_PMINUD, OPC_UD2
2780    };
2781    static int const umax_insn[4] = {
2782        OPC_PMAXUB, OPC_PMAXUW, OPC_PMAXUD, OPC_UD2
2783    };
2784    static int const shlv_insn[4] = {
2785        /* TODO: AVX512 adds support for MO_16.  */
2786        OPC_UD2, OPC_UD2, OPC_VPSLLVD, OPC_VPSLLVQ
2787    };
2788    static int const shrv_insn[4] = {
2789        /* TODO: AVX512 adds support for MO_16.  */
2790        OPC_UD2, OPC_UD2, OPC_VPSRLVD, OPC_VPSRLVQ
2791    };
2792    static int const sarv_insn[4] = {
2793        /* TODO: AVX512 adds support for MO_16, MO_64.  */
2794        OPC_UD2, OPC_UD2, OPC_VPSRAVD, OPC_UD2
2795    };
2796    static int const shls_insn[4] = {
2797        OPC_UD2, OPC_PSLLW, OPC_PSLLD, OPC_PSLLQ
2798    };
2799    static int const shrs_insn[4] = {
2800        OPC_UD2, OPC_PSRLW, OPC_PSRLD, OPC_PSRLQ
2801    };
2802    static int const sars_insn[4] = {
2803        OPC_UD2, OPC_PSRAW, OPC_PSRAD, OPC_UD2
2804    };
2805    static int const abs_insn[4] = {
2806        /* TODO: AVX512 adds support for MO_64.  */
2807        OPC_PABSB, OPC_PABSW, OPC_PABSD, OPC_UD2
2808    };
2809
2810    TCGType type = vecl + TCG_TYPE_V64;
2811    int insn, sub;
2812    TCGArg a0, a1, a2;
2813
2814    a0 = args[0];
2815    a1 = args[1];
2816    a2 = args[2];
2817
2818    switch (opc) {
2819    case INDEX_op_add_vec:
2820        insn = add_insn[vece];
2821        goto gen_simd;
2822    case INDEX_op_ssadd_vec:
2823        insn = ssadd_insn[vece];
2824        goto gen_simd;
2825    case INDEX_op_usadd_vec:
2826        insn = usadd_insn[vece];
2827        goto gen_simd;
2828    case INDEX_op_sub_vec:
2829        insn = sub_insn[vece];
2830        goto gen_simd;
2831    case INDEX_op_sssub_vec:
2832        insn = sssub_insn[vece];
2833        goto gen_simd;
2834    case INDEX_op_ussub_vec:
2835        insn = ussub_insn[vece];
2836        goto gen_simd;
2837    case INDEX_op_mul_vec:
2838        insn = mul_insn[vece];
2839        goto gen_simd;
2840    case INDEX_op_and_vec:
2841        insn = OPC_PAND;
2842        goto gen_simd;
2843    case INDEX_op_or_vec:
2844        insn = OPC_POR;
2845        goto gen_simd;
2846    case INDEX_op_xor_vec:
2847        insn = OPC_PXOR;
2848        goto gen_simd;
2849    case INDEX_op_smin_vec:
2850        insn = smin_insn[vece];
2851        goto gen_simd;
2852    case INDEX_op_umin_vec:
2853        insn = umin_insn[vece];
2854        goto gen_simd;
2855    case INDEX_op_smax_vec:
2856        insn = smax_insn[vece];
2857        goto gen_simd;
2858    case INDEX_op_umax_vec:
2859        insn = umax_insn[vece];
2860        goto gen_simd;
2861    case INDEX_op_shlv_vec:
2862        insn = shlv_insn[vece];
2863        goto gen_simd;
2864    case INDEX_op_shrv_vec:
2865        insn = shrv_insn[vece];
2866        goto gen_simd;
2867    case INDEX_op_sarv_vec:
2868        insn = sarv_insn[vece];
2869        goto gen_simd;
2870    case INDEX_op_shls_vec:
2871        insn = shls_insn[vece];
2872        goto gen_simd;
2873    case INDEX_op_shrs_vec:
2874        insn = shrs_insn[vece];
2875        goto gen_simd;
2876    case INDEX_op_sars_vec:
2877        insn = sars_insn[vece];
2878        goto gen_simd;
2879    case INDEX_op_x86_punpckl_vec:
2880        insn = punpckl_insn[vece];
2881        goto gen_simd;
2882    case INDEX_op_x86_punpckh_vec:
2883        insn = punpckh_insn[vece];
2884        goto gen_simd;
2885    case INDEX_op_x86_packss_vec:
2886        insn = packss_insn[vece];
2887        goto gen_simd;
2888    case INDEX_op_x86_packus_vec:
2889        insn = packus_insn[vece];
2890        goto gen_simd;
2891#if TCG_TARGET_REG_BITS == 32
2892    case INDEX_op_dup2_vec:
2893        /* First merge the two 32-bit inputs to a single 64-bit element. */
2894        tcg_out_vex_modrm(s, OPC_PUNPCKLDQ, a0, a1, a2);
2895        /* Then replicate the 64-bit elements across the rest of the vector. */
2896        if (type != TCG_TYPE_V64) {
2897            tcg_out_dup_vec(s, type, MO_64, a0, a0);
2898        }
2899        break;
2900#endif
2901    case INDEX_op_abs_vec:
2902        insn = abs_insn[vece];
2903        a2 = a1;
2904        a1 = 0;
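        /* PABS* takes a single source operand: route it through the r/m
           slot and leave the VEX.vvvv operand (a1) unused.  */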
2905        goto gen_simd;
2906    gen_simd:
2907        tcg_debug_assert(insn != OPC_UD2);
2908        if (type == TCG_TYPE_V256) {
2909            insn |= P_VEXL;
2910        }
2911        tcg_out_vex_modrm(s, insn, a0, a1, a2);
2912        break;
2913
2914    case INDEX_op_cmp_vec:
2915        sub = args[3];
2916        if (sub == TCG_COND_EQ) {
2917            insn = cmpeq_insn[vece];
2918        } else if (sub == TCG_COND_GT) {
2919            insn = cmpgt_insn[vece];
2920        } else {
2921            g_assert_not_reached();
2922        }
2923        goto gen_simd;
2924
2925    case INDEX_op_andc_vec:
2926        insn = OPC_PANDN;
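        /* PANDN computes dest = ~src1 & src2; the operands are passed
           swapped below so the result is a1 & ~a2.  */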
2927        if (type == TCG_TYPE_V256) {
2928            insn |= P_VEXL;
2929        }
2930        tcg_out_vex_modrm(s, insn, a0, a2, a1);
2931        break;
2932
2933    case INDEX_op_shli_vec:
2934        sub = 6;
2935        goto gen_shift;
2936    case INDEX_op_shri_vec:
2937        sub = 2;
2938        goto gen_shift;
2939    case INDEX_op_sari_vec:
2940        tcg_debug_assert(vece != MO_64);
2941        sub = 4;
2942    gen_shift:
2943        tcg_debug_assert(vece != MO_8);
2944        insn = shift_imm_insn[vece];
2945        if (type == TCG_TYPE_V256) {
2946            insn |= P_VEXL;
2947        }
2948        tcg_out_vex_modrm(s, insn, sub, a0, a1);
2949        tcg_out8(s, a2);
2950        break;
2951
2952    case INDEX_op_ld_vec:
2953        tcg_out_ld(s, type, a0, a1, a2);
2954        break;
2955    case INDEX_op_st_vec:
2956        tcg_out_st(s, type, a0, a1, a2);
2957        break;
2958    case INDEX_op_dupm_vec:
2959        tcg_out_dupm_vec(s, type, vece, a0, a1, a2);
2960        break;
2961
2962    case INDEX_op_x86_shufps_vec:
2963        insn = OPC_SHUFPS;
2964        sub = args[3];
2965        goto gen_simd_imm8;
2966    case INDEX_op_x86_blend_vec:
2967        if (vece == MO_16) {
2968            insn = OPC_PBLENDW;
2969        } else if (vece == MO_32) {
2970            insn = (have_avx2 ? OPC_VPBLENDD : OPC_BLENDPS);
2971        } else {
2972            g_assert_not_reached();
2973        }
2974        sub = args[3];
2975        goto gen_simd_imm8;
2976    case INDEX_op_x86_vperm2i128_vec:
2977        insn = OPC_VPERM2I128;
2978        sub = args[3];
2979        goto gen_simd_imm8;
2980    gen_simd_imm8:
2981        if (type == TCG_TYPE_V256) {
2982            insn |= P_VEXL;
2983        }
2984        tcg_out_vex_modrm(s, insn, a0, a1, a2);
2985        tcg_out8(s, sub);
2986        break;
2987
2988    case INDEX_op_x86_vpblendvb_vec:
2989        insn = OPC_VPBLENDVB;
2990        if (type == TCG_TYPE_V256) {
2991            insn |= P_VEXL;
2992        }
2993        tcg_out_vex_modrm(s, insn, a0, a1, a2);
2994        tcg_out8(s, args[3] << 4);
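        /* The blend-control register (args[3]) is encoded in bits 7:4 of
           the trailing immediate byte.  */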
2995        break;
2996
2997    case INDEX_op_x86_psrldq_vec:
2998        tcg_out_vex_modrm(s, OPC_GRP14, 3, a0, a1);
2999        tcg_out8(s, a2);
3000        break;
3001
3002    case INDEX_op_mov_vec:  /* Always emitted via tcg_out_mov.  */
3003    case INDEX_op_dup_vec:  /* Always emitted via tcg_out_dup_vec.  */
3004    default:
3005        g_assert_not_reached();
3006    }
3007}
3008
3009static TCGConstraintSetIndex tcg_target_op_def(TCGOpcode op)
3010{
3011    switch (op) {
3012    case INDEX_op_goto_ptr:
3013        return C_O0_I1(r);
3014
3015    case INDEX_op_ld8u_i32:
3016    case INDEX_op_ld8u_i64:
3017    case INDEX_op_ld8s_i32:
3018    case INDEX_op_ld8s_i64:
3019    case INDEX_op_ld16u_i32:
3020    case INDEX_op_ld16u_i64:
3021    case INDEX_op_ld16s_i32:
3022    case INDEX_op_ld16s_i64:
3023    case INDEX_op_ld_i32:
3024    case INDEX_op_ld32u_i64:
3025    case INDEX_op_ld32s_i64:
3026    case INDEX_op_ld_i64:
3027        return C_O1_I1(r, r);
3028
3029    case INDEX_op_st8_i32:
3030    case INDEX_op_st8_i64:
3031        return C_O0_I2(qi, r);
3032
3033    case INDEX_op_st16_i32:
3034    case INDEX_op_st16_i64:
3035    case INDEX_op_st_i32:
3036    case INDEX_op_st32_i64:
3037        return C_O0_I2(ri, r);
3038
3039    case INDEX_op_st_i64:
3040        return C_O0_I2(re, r);
3041
3042    case INDEX_op_add_i32:
3043    case INDEX_op_add_i64:
3044        return C_O1_I2(r, r, re);
3045
3046    case INDEX_op_sub_i32:
3047    case INDEX_op_sub_i64:
3048    case INDEX_op_mul_i32:
3049    case INDEX_op_mul_i64:
3050    case INDEX_op_or_i32:
3051    case INDEX_op_or_i64:
3052    case INDEX_op_xor_i32:
3053    case INDEX_op_xor_i64:
3054        return C_O1_I2(r, 0, re);
3055
3056    case INDEX_op_and_i32:
3057    case INDEX_op_and_i64:
3058        return C_O1_I2(r, 0, reZ);
3059
3060    case INDEX_op_andc_i32:
3061    case INDEX_op_andc_i64:
3062        return C_O1_I2(r, r, rI);
3063
3064    case INDEX_op_shl_i32:
3065    case INDEX_op_shl_i64:
3066    case INDEX_op_shr_i32:
3067    case INDEX_op_shr_i64:
3068    case INDEX_op_sar_i32:
3069    case INDEX_op_sar_i64:
3070        return have_bmi2 ? C_O1_I2(r, r, ri) : C_O1_I2(r, 0, ci);
3071
3072    case INDEX_op_rotl_i32:
3073    case INDEX_op_rotl_i64:
3074    case INDEX_op_rotr_i32:
3075    case INDEX_op_rotr_i64:
3076        return C_O1_I2(r, 0, ci);
3077
3078    case INDEX_op_brcond_i32:
3079    case INDEX_op_brcond_i64:
3080        return C_O0_I2(r, re);
3081
3082    case INDEX_op_bswap16_i32:
3083    case INDEX_op_bswap16_i64:
3084    case INDEX_op_bswap32_i32:
3085    case INDEX_op_bswap32_i64:
3086    case INDEX_op_bswap64_i64:
3087    case INDEX_op_neg_i32:
3088    case INDEX_op_neg_i64:
3089    case INDEX_op_not_i32:
3090    case INDEX_op_not_i64:
3091    case INDEX_op_extrh_i64_i32:
3092        return C_O1_I1(r, 0);
3093
3094    case INDEX_op_ext8s_i32:
3095    case INDEX_op_ext8s_i64:
3096    case INDEX_op_ext8u_i32:
3097    case INDEX_op_ext8u_i64:
3098        return C_O1_I1(r, q);
3099
3100    case INDEX_op_ext16s_i32:
3101    case INDEX_op_ext16s_i64:
3102    case INDEX_op_ext16u_i32:
3103    case INDEX_op_ext16u_i64:
3104    case INDEX_op_ext32s_i64:
3105    case INDEX_op_ext32u_i64:
3106    case INDEX_op_ext_i32_i64:
3107    case INDEX_op_extu_i32_i64:
3108    case INDEX_op_extrl_i64_i32:
3109    case INDEX_op_extract_i32:
3110    case INDEX_op_extract_i64:
3111    case INDEX_op_sextract_i32:
3112    case INDEX_op_ctpop_i32:
3113    case INDEX_op_ctpop_i64:
3114        return C_O1_I1(r, r);
3115
3116    case INDEX_op_extract2_i32:
3117    case INDEX_op_extract2_i64:
3118        return C_O1_I2(r, 0, r);
3119
3120    case INDEX_op_deposit_i32:
3121    case INDEX_op_deposit_i64:
3122        return C_O1_I2(Q, 0, Q);
3123
3124    case INDEX_op_setcond_i32:
3125    case INDEX_op_setcond_i64:
3126        return C_O1_I2(q, r, re);
3127
3128    case INDEX_op_movcond_i32:
3129    case INDEX_op_movcond_i64:
3130        return C_O1_I4(r, r, re, r, 0);
3131
3132    case INDEX_op_div2_i32:
3133    case INDEX_op_div2_i64:
3134    case INDEX_op_divu2_i32:
3135    case INDEX_op_divu2_i64:
3136        return C_O2_I3(a, d, 0, 1, r);
3137
3138    case INDEX_op_mulu2_i32:
3139    case INDEX_op_mulu2_i64:
3140    case INDEX_op_muls2_i32:
3141    case INDEX_op_muls2_i64:
3142        return C_O2_I2(a, d, a, r);
3143
3144    case INDEX_op_add2_i32:
3145    case INDEX_op_add2_i64:
3146    case INDEX_op_sub2_i32:
3147    case INDEX_op_sub2_i64:
3148        return C_O2_I4(r, r, 0, 1, re, re);
3149
3150    case INDEX_op_ctz_i32:
3151    case INDEX_op_ctz_i64:
3152        return have_bmi1 ? C_N1_I2(r, r, rW) : C_N1_I2(r, r, r);
3153
3154    case INDEX_op_clz_i32:
3155    case INDEX_op_clz_i64:
3156        return have_lzcnt ? C_N1_I2(r, r, rW) : C_N1_I2(r, r, r);
3157
3158    case INDEX_op_qemu_ld_i32:
3159        return (TARGET_LONG_BITS <= TCG_TARGET_REG_BITS
3160                ? C_O1_I1(r, L) : C_O1_I2(r, L, L));
3161
3162    case INDEX_op_qemu_st_i32:
3163        return (TARGET_LONG_BITS <= TCG_TARGET_REG_BITS
3164                ? C_O0_I2(L, L) : C_O0_I3(L, L, L));
3165    case INDEX_op_qemu_st8_i32:
3166        return (TARGET_LONG_BITS <= TCG_TARGET_REG_BITS
3167                ? C_O0_I2(s, L) : C_O0_I3(s, L, L));
3168
3169    case INDEX_op_qemu_ld_i64:
3170        return (TCG_TARGET_REG_BITS == 64 ? C_O1_I1(r, L)
3171                : TARGET_LONG_BITS <= TCG_TARGET_REG_BITS ? C_O2_I1(r, r, L)
3172                : C_O2_I2(r, r, L, L));
3173
3174    case INDEX_op_qemu_st_i64:
3175        return (TCG_TARGET_REG_BITS == 64 ? C_O0_I2(L, L)
3176                : TARGET_LONG_BITS <= TCG_TARGET_REG_BITS ? C_O0_I3(L, L, L)
3177                : C_O0_I4(L, L, L, L));
3178
3179    case INDEX_op_brcond2_i32:
3180        return C_O0_I4(r, r, ri, ri);
3181
3182    case INDEX_op_setcond2_i32:
3183        return C_O1_I4(r, r, r, ri, ri);
3184
3185    case INDEX_op_ld_vec:
3186    case INDEX_op_dupm_vec:
3187        return C_O1_I1(x, r);
3188
3189    case INDEX_op_st_vec:
3190        return C_O0_I2(x, r);
3191
3192    case INDEX_op_add_vec:
3193    case INDEX_op_sub_vec:
3194    case INDEX_op_mul_vec:
3195    case INDEX_op_and_vec:
3196    case INDEX_op_or_vec:
3197    case INDEX_op_xor_vec:
3198    case INDEX_op_andc_vec:
3199    case INDEX_op_ssadd_vec:
3200    case INDEX_op_usadd_vec:
3201    case INDEX_op_sssub_vec:
3202    case INDEX_op_ussub_vec:
3203    case INDEX_op_smin_vec:
3204    case INDEX_op_umin_vec:
3205    case INDEX_op_smax_vec:
3206    case INDEX_op_umax_vec:
3207    case INDEX_op_shlv_vec:
3208    case INDEX_op_shrv_vec:
3209    case INDEX_op_sarv_vec:
3210    case INDEX_op_shls_vec:
3211    case INDEX_op_shrs_vec:
3212    case INDEX_op_sars_vec:
3213    case INDEX_op_rotls_vec:
3214    case INDEX_op_cmp_vec:
3215    case INDEX_op_x86_shufps_vec:
3216    case INDEX_op_x86_blend_vec:
3217    case INDEX_op_x86_packss_vec:
3218    case INDEX_op_x86_packus_vec:
3219    case INDEX_op_x86_vperm2i128_vec:
3220    case INDEX_op_x86_punpckl_vec:
3221    case INDEX_op_x86_punpckh_vec:
3222#if TCG_TARGET_REG_BITS == 32
3223    case INDEX_op_dup2_vec:
3224#endif
3225        return C_O1_I2(x, x, x);
3226
3227    case INDEX_op_abs_vec:
3228    case INDEX_op_dup_vec:
3229    case INDEX_op_shli_vec:
3230    case INDEX_op_shri_vec:
3231    case INDEX_op_sari_vec:
3232    case INDEX_op_x86_psrldq_vec:
3233        return C_O1_I1(x, x);
3234
3235    case INDEX_op_x86_vpblendvb_vec:
3236        return C_O1_I3(x, x, x, x);
3237
3238    default:
3239        g_assert_not_reached();
3240    }
3241}
3242
3243int tcg_can_emit_vec_op(TCGOpcode opc, TCGType type, unsigned vece)
3244{
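    /*
     * Return 1 if the operation can be emitted directly for this element
     * size, -1 if it can be implemented by expansion (tcg_expand_vec_op
     * below will be invoked to do so), and 0 if it is not supported.
     */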
3245    switch (opc) {
3246    case INDEX_op_add_vec:
3247    case INDEX_op_sub_vec:
3248    case INDEX_op_and_vec:
3249    case INDEX_op_or_vec:
3250    case INDEX_op_xor_vec:
3251    case INDEX_op_andc_vec:
3252        return 1;
3253    case INDEX_op_rotli_vec:
3254    case INDEX_op_cmp_vec:
3255    case INDEX_op_cmpsel_vec:
3256        return -1;
3257
3258    case INDEX_op_shli_vec:
3259    case INDEX_op_shri_vec:
3260        /* We must expand the operation for MO_8.  */
3261        return vece == MO_8 ? -1 : 1;
3262
3263    case INDEX_op_sari_vec:
3264        /* We must expand the operation for MO_8.  */
3265        if (vece == MO_8) {
3266            return -1;
3267        }
3268        /* We can emulate this for MO_64, but it does not pay off
3269           unless we're producing at least 4 values.  */
3270        if (vece == MO_64) {
3271            return type >= TCG_TYPE_V256 ? -1 : 0;
3272        }
3273        return 1;
3274
3275    case INDEX_op_shls_vec:
3276    case INDEX_op_shrs_vec:
3277        return vece >= MO_16;
3278    case INDEX_op_sars_vec:
3279        return vece >= MO_16 && vece <= MO_32;
3280    case INDEX_op_rotls_vec:
3281        return vece >= MO_16 ? -1 : 0;
3282
3283    case INDEX_op_shlv_vec:
3284    case INDEX_op_shrv_vec:
3285        return have_avx2 && vece >= MO_32;
3286    case INDEX_op_sarv_vec:
3287        return have_avx2 && vece == MO_32;
3288    case INDEX_op_rotlv_vec:
3289    case INDEX_op_rotrv_vec:
3290        return have_avx2 && vece >= MO_32 ? -1 : 0;
3291
3292    case INDEX_op_mul_vec:
3293        if (vece == MO_8) {
3294            /* We can expand the operation for MO_8.  */
3295            return -1;
3296        }
3297        if (vece == MO_64) {
3298            return 0;
3299        }
3300        return 1;
3301
3302    case INDEX_op_ssadd_vec:
3303    case INDEX_op_usadd_vec:
3304    case INDEX_op_sssub_vec:
3305    case INDEX_op_ussub_vec:
3306        return vece <= MO_16;
3307    case INDEX_op_smin_vec:
3308    case INDEX_op_smax_vec:
3309    case INDEX_op_umin_vec:
3310    case INDEX_op_umax_vec:
3311    case INDEX_op_abs_vec:
3312        return vece <= MO_32;
3313
3314    default:
3315        return 0;
3316    }
3317}
3318
3319static void expand_vec_shi(TCGType type, unsigned vece, TCGOpcode opc,
3320                           TCGv_vec v0, TCGv_vec v1, TCGArg imm)
3321{
3322    TCGv_vec t1, t2;
3323
3324    tcg_debug_assert(vece == MO_8);
3325
3326    t1 = tcg_temp_new_vec(type);
3327    t2 = tcg_temp_new_vec(type);
3328
3329    /*
3330     * Unpack to W, shift, and repack.  Tricky bits:
3331     * (1) Use punpck*bw x,x to produce DDCCBBAA,
3332     *     i.e. duplicate each byte into the other half of its 16-bit lane.
3333     * (2) For right-shift, add 8 so that the high half of the lane
3334     *     becomes zero.  For left-shift and left-rotate, we must
3335     *     shift up and down again.
3336     * (3) Step 2 leaves high half zero such that PACKUSWB
3337     *     (pack with unsigned saturation) does not modify
3338     *     the quantity.
3339     */
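    /*
     * For example, a logical right shift of byte A by n: the unpacked
     * 16-bit lane holds (A << 8) | A; shifting it right by n + 8 leaves
     * A >> n in the low byte with the high byte clear, and the
     * unsigned-saturating pack narrows it back to a byte unchanged.
     */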
3340    vec_gen_3(INDEX_op_x86_punpckl_vec, type, MO_8,
3341              tcgv_vec_arg(t1), tcgv_vec_arg(v1), tcgv_vec_arg(v1));
3342    vec_gen_3(INDEX_op_x86_punpckh_vec, type, MO_8,
3343              tcgv_vec_arg(t2), tcgv_vec_arg(v1), tcgv_vec_arg(v1));
3344
3345    if (opc != INDEX_op_rotli_vec) {
3346        imm += 8;
3347    }
3348    if (opc == INDEX_op_shri_vec) {
3349        tcg_gen_shri_vec(MO_16, t1, t1, imm);
3350        tcg_gen_shri_vec(MO_16, t2, t2, imm);
3351    } else {
3352        tcg_gen_shli_vec(MO_16, t1, t1, imm);
3353        tcg_gen_shli_vec(MO_16, t2, t2, imm);
3354        tcg_gen_shri_vec(MO_16, t1, t1, 8);
3355        tcg_gen_shri_vec(MO_16, t2, t2, 8);
3356    }
3357
3358    vec_gen_3(INDEX_op_x86_packus_vec, type, MO_8,
3359              tcgv_vec_arg(v0), tcgv_vec_arg(t1), tcgv_vec_arg(t2));
3360    tcg_temp_free_vec(t1);
3361    tcg_temp_free_vec(t2);
3362}
3363
3364static void expand_vec_sari(TCGType type, unsigned vece,
3365                            TCGv_vec v0, TCGv_vec v1, TCGArg imm)
3366{
3367    TCGv_vec t1, t2;
3368
3369    switch (vece) {
3370    case MO_8:
3371        /* Unpack to W, shift, and repack, as in expand_vec_shi.  */
3372        t1 = tcg_temp_new_vec(type);
3373        t2 = tcg_temp_new_vec(type);
3374        vec_gen_3(INDEX_op_x86_punpckl_vec, type, MO_8,
3375                  tcgv_vec_arg(t1), tcgv_vec_arg(v1), tcgv_vec_arg(v1));
3376        vec_gen_3(INDEX_op_x86_punpckh_vec, type, MO_8,
3377                  tcgv_vec_arg(t2), tcgv_vec_arg(v1), tcgv_vec_arg(v1));
3378        tcg_gen_sari_vec(MO_16, t1, t1, imm + 8);
3379        tcg_gen_sari_vec(MO_16, t2, t2, imm + 8);
3380        vec_gen_3(INDEX_op_x86_packss_vec, type, MO_8,
3381                  tcgv_vec_arg(v0), tcgv_vec_arg(t1), tcgv_vec_arg(t2));
3382        tcg_temp_free_vec(t1);
3383        tcg_temp_free_vec(t2);
3384        break;
3385
3386    case MO_64:
3387        if (imm <= 32) {
3388            /*
3389             * We can emulate a small sign extend by performing an arithmetic
3390             * 32-bit shift and overwriting the high half of a 64-bit logical
3391             * shift.  Note that the ISA says shift of 32 is valid, but TCG
3392             * does not, so we have to bound the smaller shift -- we get the
3393             * same result in the high half either way.
3394             */
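            /*
             * The 0xaa blend mask selects the odd 32-bit elements, i.e. the
             * high half of every 64-bit lane, from the arithmetically
             * shifted t1, while the even elements keep the low halves of
             * the logical shift already in v0.
             */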
3395            t1 = tcg_temp_new_vec(type);
3396            tcg_gen_sari_vec(MO_32, t1, v1, MIN(imm, 31));
3397            tcg_gen_shri_vec(MO_64, v0, v1, imm);
3398            vec_gen_4(INDEX_op_x86_blend_vec, type, MO_32,
3399                      tcgv_vec_arg(v0), tcgv_vec_arg(v0),
3400                      tcgv_vec_arg(t1), 0xaa);
3401            tcg_temp_free_vec(t1);
3402        } else {
3403            /* Otherwise we will need to use a compare vs 0 to produce
3404             * the sign-extend, shift and merge.
3405             */
3406            t1 = tcg_const_zeros_vec(type);
3407            tcg_gen_cmp_vec(TCG_COND_GT, MO_64, t1, t1, v1);
3408            tcg_gen_shri_vec(MO_64, v0, v1, imm);
3409            tcg_gen_shli_vec(MO_64, t1, t1, 64 - imm);
3410            tcg_gen_or_vec(MO_64, v0, v0, t1);
3411            tcg_temp_free_vec(t1);
3412        }
3413        break;
3414
3415    default:
3416        g_assert_not_reached();
3417    }
3418}
3419
3420static void expand_vec_rotli(TCGType type, unsigned vece,
3421                             TCGv_vec v0, TCGv_vec v1, TCGArg imm)
3422{
3423    TCGv_vec t;
3424
3425    if (vece == MO_8) {
3426        expand_vec_shi(type, vece, INDEX_op_rotli_vec, v0, v1, imm);
3427        return;
3428    }
3429
3430    t = tcg_temp_new_vec(type);
3431    tcg_gen_shli_vec(vece, t, v1, imm);
3432    tcg_gen_shri_vec(vece, v0, v1, (8 << vece) - imm);
3433    tcg_gen_or_vec(vece, v0, v0, t);
3434    tcg_temp_free_vec(t);
3435}
3436
3437static void expand_vec_rotls(TCGType type, unsigned vece,
3438                             TCGv_vec v0, TCGv_vec v1, TCGv_i32 lsh)
3439{
3440    TCGv_i32 rsh;
3441    TCGv_vec t;
3442
3443    tcg_debug_assert(vece != MO_8);
3444
3445    t = tcg_temp_new_vec(type);
3446    rsh = tcg_temp_new_i32();
3447
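    /*
     * The right-shift count is (-lsh) modulo the element width: width - lsh
     * for nonzero lsh, and 0 when lsh is 0, in which case both shifts leave
     * v1 unchanged and the OR still yields the correct rotation.
     */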
3448    tcg_gen_neg_i32(rsh, lsh);
3449    tcg_gen_andi_i32(rsh, rsh, (8 << vece) - 1);
3450    tcg_gen_shls_vec(vece, t, v1, lsh);
3451    tcg_gen_shrs_vec(vece, v0, v1, rsh);
3452    tcg_gen_or_vec(vece, v0, v0, t);
3453    tcg_temp_free_vec(t);
3454    tcg_temp_free_i32(rsh);
3455}
3456
3457static void expand_vec_rotv(TCGType type, unsigned vece, TCGv_vec v0,
3458                            TCGv_vec v1, TCGv_vec sh, bool right)
3459{
3460    TCGv_vec t = tcg_temp_new_vec(type);
3461
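    /*
     * Build the rotate from two variable shifts with complementary counts;
     * t receives the bits moved the opposite way by (width - sh).  A zero
     * count in sh makes the complementary count equal to the element width,
     * which the x86 variable shifts treat as producing zero, so the OR
     * still leaves that lane equal to v1.
     */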
3462    tcg_gen_dupi_vec(vece, t, 8 << vece);
3463    tcg_gen_sub_vec(vece, t, t, sh);
3464    if (right) {
3465        tcg_gen_shlv_vec(vece, t, v1, t);
3466        tcg_gen_shrv_vec(vece, v0, v1, sh);
3467    } else {
3468        tcg_gen_shrv_vec(vece, t, v1, t);
3469        tcg_gen_shlv_vec(vece, v0, v1, sh);
3470    }
3471    tcg_gen_or_vec(vece, v0, v0, t);
3472    tcg_temp_free_vec(t);
3473}
3474
3475static void expand_vec_mul(TCGType type, unsigned vece,
3476                           TCGv_vec v0, TCGv_vec v1, TCGv_vec v2)
3477{
3478    TCGv_vec t1, t2, t3, t4, zero;
3479
3480    tcg_debug_assert(vece == MO_8);
3481
3482    /*
3483     * Unpack v1 bytes to words, 0 | x.
3484     * Unpack v2 bytes to words, y | 0.
3485     * This leaves the 8-bit result, x * y, with 8 bits of right padding.
3486     * Shift logical right by 8 bits to clear the high byte of each word before
3487     * using an unsigned saturated pack.
3488     *
3489     * The difference between the V64, V128 and V256 cases is merely how
3490     * we distribute the expansion between temporaries.
3491     */
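    /*
     * Concretely, for bytes x and y: the unpacked words hold x and y << 8,
     * whose 16-bit product is (x * y) << 8; shifting right by 8 leaves the
     * low 8 bits of x * y in the low byte with a zero high byte, ready for
     * the unsigned-saturating pack.
     */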
3492    switch (type) {
3493    case TCG_TYPE_V64:
3494        t1 = tcg_temp_new_vec(TCG_TYPE_V128);
3495        t2 = tcg_temp_new_vec(TCG_TYPE_V128);
3496        zero = tcg_constant_vec(TCG_TYPE_V128, MO_8, 0);
3497        vec_gen_3(INDEX_op_x86_punpckl_vec, TCG_TYPE_V128, MO_8,
3498                  tcgv_vec_arg(t1), tcgv_vec_arg(v1), tcgv_vec_arg(zero));
3499        vec_gen_3(INDEX_op_x86_punpckl_vec, TCG_TYPE_V128, MO_8,
3500                  tcgv_vec_arg(t2), tcgv_vec_arg(zero), tcgv_vec_arg(v2));
3501        tcg_gen_mul_vec(MO_16, t1, t1, t2);
3502        tcg_gen_shri_vec(MO_16, t1, t1, 8);
3503        vec_gen_3(INDEX_op_x86_packus_vec, TCG_TYPE_V128, MO_8,
3504                  tcgv_vec_arg(v0), tcgv_vec_arg(t1), tcgv_vec_arg(t1));
3505        tcg_temp_free_vec(t1);
3506        tcg_temp_free_vec(t2);
3507        break;
3508
3509    case TCG_TYPE_V128:
3510    case TCG_TYPE_V256:
3511        t1 = tcg_temp_new_vec(type);
3512        t2 = tcg_temp_new_vec(type);
3513        t3 = tcg_temp_new_vec(type);
3514        t4 = tcg_temp_new_vec(type);
3515        zero = tcg_constant_vec(TCG_TYPE_V128, MO_8, 0);
3516        vec_gen_3(INDEX_op_x86_punpckl_vec, type, MO_8,
3517                  tcgv_vec_arg(t1), tcgv_vec_arg(v1), tcgv_vec_arg(zero));
3518        vec_gen_3(INDEX_op_x86_punpckl_vec, type, MO_8,
3519                  tcgv_vec_arg(t2), tcgv_vec_arg(zero), tcgv_vec_arg(v2));
3520        vec_gen_3(INDEX_op_x86_punpckh_vec, type, MO_8,
3521                  tcgv_vec_arg(t3), tcgv_vec_arg(v1), tcgv_vec_arg(zero));
3522        vec_gen_3(INDEX_op_x86_punpckh_vec, type, MO_8,
3523                  tcgv_vec_arg(t4), tcgv_vec_arg(zero), tcgv_vec_arg(v2));
3524        tcg_gen_mul_vec(MO_16, t1, t1, t2);
3525        tcg_gen_mul_vec(MO_16, t3, t3, t4);
3526        tcg_gen_shri_vec(MO_16, t1, t1, 8);
3527        tcg_gen_shri_vec(MO_16, t3, t3, 8);
3528        vec_gen_3(INDEX_op_x86_packus_vec, type, MO_8,
3529                  tcgv_vec_arg(v0), tcgv_vec_arg(t1), tcgv_vec_arg(t3));
3530        tcg_temp_free_vec(t1);
3531        tcg_temp_free_vec(t2);
3532        tcg_temp_free_vec(t3);
3533        tcg_temp_free_vec(t4);
3534        break;
3535
3536    default:
3537        g_assert_not_reached();
3538    }
3539}
3540
3541static bool expand_vec_cmp_noinv(TCGType type, unsigned vece, TCGv_vec v0,
3542                                 TCGv_vec v1, TCGv_vec v2, TCGCond cond)
3543{
3544    enum {
3545        NEED_INV  = 1,
3546        NEED_SWAP = 2,
3547        NEED_BIAS = 4,
3548        NEED_UMIN = 8,
3549        NEED_UMAX = 16,
3550    };
3551    TCGv_vec t1, t2, t3;
3552    uint8_t fixup;
3553
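    /*
     * SSE/AVX provide only equality (PCMPEQ*) and signed greater-than
     * (PCMPGT*) compares, so every other condition is reduced to one of
     * those: invert the result, swap the operands, turn an unsigned compare
     * into a signed one by subtracting the sign bit from both sides, or,
     * for the smaller element sizes, use unsigned min/max plus an equality
     * test.  The return value tells the caller whether a final inversion
     * is still required.
     */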
3554    switch (cond) {
3555    case TCG_COND_EQ:
3556    case TCG_COND_GT:
3557        fixup = 0;
3558        break;
3559    case TCG_COND_NE:
3560    case TCG_COND_LE:
3561        fixup = NEED_INV;
3562        break;
3563    case TCG_COND_LT:
3564        fixup = NEED_SWAP;
3565        break;
3566    case TCG_COND_GE:
3567        fixup = NEED_SWAP | NEED_INV;
3568        break;
3569    case TCG_COND_LEU:
3570        if (vece <= MO_32) {
3571            fixup = NEED_UMIN;
3572        } else {
3573            fixup = NEED_BIAS | NEED_INV;
3574        }
3575        break;
3576    case TCG_COND_GTU:
3577        if (vece <= MO_32) {
3578            fixup = NEED_UMIN | NEED_INV;
3579        } else {
3580            fixup = NEED_BIAS;
3581        }
3582        break;
3583    case TCG_COND_GEU:
3584        if (vece <= MO_32) {
3585            fixup = NEED_UMAX;
3586        } else {
3587            fixup = NEED_BIAS | NEED_SWAP | NEED_INV;
3588        }
3589        break;
3590    case TCG_COND_LTU:
3591        if (vece <= MO_32) {
3592            fixup = NEED_UMAX | NEED_INV;
3593        } else {
3594            fixup = NEED_BIAS | NEED_SWAP;
3595        }
3596        break;
3597    default:
3598        g_assert_not_reached();
3599    }
3600
3601    if (fixup & NEED_INV) {
3602        cond = tcg_invert_cond(cond);
3603    }
3604    if (fixup & NEED_SWAP) {
3605        t1 = v1, v1 = v2, v2 = t1;
3606        cond = tcg_swap_cond(cond);
3607    }
3608
3609    t1 = t2 = NULL;
3610    if (fixup & (NEED_UMIN | NEED_UMAX)) {
3611        t1 = tcg_temp_new_vec(type);
3612        if (fixup & NEED_UMIN) {
3613            tcg_gen_umin_vec(vece, t1, v1, v2);
3614        } else {
3615            tcg_gen_umax_vec(vece, t1, v1, v2);
3616        }
3617        v2 = t1;
3618        cond = TCG_COND_EQ;
3619    } else if (fixup & NEED_BIAS) {
3620        t1 = tcg_temp_new_vec(type);
3621        t2 = tcg_temp_new_vec(type);
3622        t3 = tcg_constant_vec(type, vece, 1ull << ((8 << vece) - 1));
3623        tcg_gen_sub_vec(vece, t1, v1, t3);
3624        tcg_gen_sub_vec(vece, t2, v2, t3);
3625        v1 = t1;
3626        v2 = t2;
3627        cond = tcg_signed_cond(cond);
3628    }
3629
3630    tcg_debug_assert(cond == TCG_COND_EQ || cond == TCG_COND_GT);
3631    /* Expand directly; do not recurse.  */
3632    vec_gen_4(INDEX_op_cmp_vec, type, vece,
3633              tcgv_vec_arg(v0), tcgv_vec_arg(v1), tcgv_vec_arg(v2), cond);
3634
3635    if (t1) {
3636        tcg_temp_free_vec(t1);
3637        if (t2) {
3638            tcg_temp_free_vec(t2);
3639        }
3640    }
3641    return fixup & NEED_INV;
3642}
3643
3644static void expand_vec_cmp(TCGType type, unsigned vece, TCGv_vec v0,
3645                           TCGv_vec v1, TCGv_vec v2, TCGCond cond)
3646{
3647    if (expand_vec_cmp_noinv(type, vece, v0, v1, v2, cond)) {
3648        tcg_gen_not_vec(vece, v0, v0);
3649    }
3650}
3651
3652static void expand_vec_cmpsel(TCGType type, unsigned vece, TCGv_vec v0,
3653                              TCGv_vec c1, TCGv_vec c2,
3654                              TCGv_vec v3, TCGv_vec v4, TCGCond cond)
3655{
3656    TCGv_vec t = tcg_temp_new_vec(type);
3657
3658    if (expand_vec_cmp_noinv(type, vece, t, c1, c2, cond)) {
3659        /* Invert the sense of the compare by swapping arguments.  */
3660        TCGv_vec x;
3661        x = v3, v3 = v4, v4 = x;
3662    }
3663    vec_gen_4(INDEX_op_x86_vpblendvb_vec, type, vece,
3664              tcgv_vec_arg(v0), tcgv_vec_arg(v4),
3665              tcgv_vec_arg(v3), tcgv_vec_arg(t));
3666    tcg_temp_free_vec(t);
3667}
3668
3669void tcg_expand_vec_op(TCGOpcode opc, TCGType type, unsigned vece,
3670                       TCGArg a0, ...)
3671{
3672    va_list va;
3673    TCGArg a2;
3674    TCGv_vec v0, v1, v2, v3, v4;
3675
3676    va_start(va, a0);
3677    v0 = temp_tcgv_vec(arg_temp(a0));
3678    v1 = temp_tcgv_vec(arg_temp(va_arg(va, TCGArg)));
3679    a2 = va_arg(va, TCGArg);
3680
3681    switch (opc) {
3682    case INDEX_op_shli_vec:
3683    case INDEX_op_shri_vec:
3684        expand_vec_shi(type, vece, opc, v0, v1, a2);
3685        break;
3686
3687    case INDEX_op_sari_vec:
3688        expand_vec_sari(type, vece, v0, v1, a2);
3689        break;
3690
3691    case INDEX_op_rotli_vec:
3692        expand_vec_rotli(type, vece, v0, v1, a2);
3693        break;
3694
3695    case INDEX_op_rotls_vec:
3696        expand_vec_rotls(type, vece, v0, v1, temp_tcgv_i32(arg_temp(a2)));
3697        break;
3698
3699    case INDEX_op_rotlv_vec:
3700        v2 = temp_tcgv_vec(arg_temp(a2));
3701        expand_vec_rotv(type, vece, v0, v1, v2, false);
3702        break;
3703    case INDEX_op_rotrv_vec:
3704        v2 = temp_tcgv_vec(arg_temp(a2));
3705        expand_vec_rotv(type, vece, v0, v1, v2, true);
3706        break;
3707
3708    case INDEX_op_mul_vec:
3709        v2 = temp_tcgv_vec(arg_temp(a2));
3710        expand_vec_mul(type, vece, v0, v1, v2);
3711        break;
3712
3713    case INDEX_op_cmp_vec:
3714        v2 = temp_tcgv_vec(arg_temp(a2));
3715        expand_vec_cmp(type, vece, v0, v1, v2, va_arg(va, TCGArg));
3716        break;
3717
3718    case INDEX_op_cmpsel_vec:
3719        v2 = temp_tcgv_vec(arg_temp(a2));
3720        v3 = temp_tcgv_vec(arg_temp(va_arg(va, TCGArg)));
3721        v4 = temp_tcgv_vec(arg_temp(va_arg(va, TCGArg)));
3722        expand_vec_cmpsel(type, vece, v0, v1, v2, v3, v4, va_arg(va, TCGArg));
3723        break;
3724
3725    default:
3726        break;
3727    }
3728
3729    va_end(va);
3730}
3731
3732static const int tcg_target_callee_save_regs[] = {
3733#if TCG_TARGET_REG_BITS == 64
3734    TCG_REG_RBP,
3735    TCG_REG_RBX,
3736#if defined(_WIN64)
3737    TCG_REG_RDI,
3738    TCG_REG_RSI,
3739#endif
3740    TCG_REG_R12,
3741    TCG_REG_R13,
3742    TCG_REG_R14, /* Currently used for the global env. */
3743    TCG_REG_R15,
3744#else
3745    TCG_REG_EBP, /* Currently used for the global env. */
3746    TCG_REG_EBX,
3747    TCG_REG_ESI,
3748    TCG_REG_EDI,
3749#endif
3750};
3751
3752/* Compute frame size via macros, to share between tcg_target_qemu_prologue
3753   and tcg_register_jit.  */
3754
3755#define PUSH_SIZE \
3756    ((1 + ARRAY_SIZE(tcg_target_callee_save_regs)) \
3757     * (TCG_TARGET_REG_BITS / 8))
3758
3759#define FRAME_SIZE \
3760    ((PUSH_SIZE \
3761      + TCG_STATIC_CALL_ARGS_SIZE \
3762      + CPU_TEMP_BUF_NLONGS * sizeof(long) \
3763      + TCG_TARGET_STACK_ALIGN - 1) \
3764     & ~(TCG_TARGET_STACK_ALIGN - 1))
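
/*
 * PUSH_SIZE counts the return address (the "1 +") plus the callee-saved
 * registers pushed by the prologue; FRAME_SIZE additionally reserves the
 * static call-argument area and the TCG temporary buffer, rounded up to
 * the stack alignment, so that the unwind information below agrees with
 * the prologue.
 */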
3765
3766/* Generate global QEMU prologue and epilogue code */
3767static void tcg_target_qemu_prologue(TCGContext *s)
3768{
3769    int i, stack_addend;
3770
3771    /* TB prologue */
3772
3773    /* Reserve some stack space, also for TCG temps.  */
3774    stack_addend = FRAME_SIZE - PUSH_SIZE;
3775    tcg_set_frame(s, TCG_REG_CALL_STACK, TCG_STATIC_CALL_ARGS_SIZE,
3776                  CPU_TEMP_BUF_NLONGS * sizeof(long));
3777
3778    /* Save all callee saved registers.  */
3779    for (i = 0; i < ARRAY_SIZE(tcg_target_callee_save_regs); i++) {
3780        tcg_out_push(s, tcg_target_callee_save_regs[i]);
3781    }
3782
3783#if TCG_TARGET_REG_BITS == 32
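    /*
     * With the stack-based 32-bit calling convention, the first argument
     * (env) sits just above the return address and the registers pushed
     * above, i.e. at (nr_saved + 1) * 4 from %esp; the second argument, the
     * TB pointer we tail-jump to, is one slot higher still, adjusted for
     * the stack space just reserved.
     */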
3784    tcg_out_ld(s, TCG_TYPE_PTR, TCG_AREG0, TCG_REG_ESP,
3785               (ARRAY_SIZE(tcg_target_callee_save_regs) + 1) * 4);
3786    tcg_out_addi(s, TCG_REG_ESP, -stack_addend);
3787    /* jmp *tb.  */
3788    tcg_out_modrm_offset(s, OPC_GRP5, EXT5_JMPN_Ev, TCG_REG_ESP,
3789                         (ARRAY_SIZE(tcg_target_callee_save_regs) + 2) * 4
3790                         + stack_addend);
3791#else
3792# if !defined(CONFIG_SOFTMMU) && TCG_TARGET_REG_BITS == 64
3793    if (guest_base) {
3794        int seg = setup_guest_base_seg();
3795        if (seg != 0) {
3796            x86_guest_base_seg = seg;
3797        } else if (guest_base == (int32_t)guest_base) {
3798            x86_guest_base_offset = guest_base;
3799        } else {
3800            /* Choose R12 because, as a base, it requires a SIB byte. */
3801            x86_guest_base_index = TCG_REG_R12;
3802            tcg_out_movi(s, TCG_TYPE_PTR, x86_guest_base_index, guest_base);
3803            tcg_regset_set_reg(s->reserved_regs, x86_guest_base_index);
3804        }
3805    }
3806# endif
3807    tcg_out_mov(s, TCG_TYPE_PTR, TCG_AREG0, tcg_target_call_iarg_regs[0]);
3808    tcg_out_addi(s, TCG_REG_ESP, -stack_addend);
3809    /* jmp *tb.  */
3810    tcg_out_modrm(s, OPC_GRP5, EXT5_JMPN_Ev, tcg_target_call_iarg_regs[1]);
3811#endif
3812
3813    /*
3814     * Return path for goto_ptr. Set return value to 0, a-la exit_tb,
3815     * and fall through to the rest of the epilogue.
3816     */
3817    tcg_code_gen_epilogue = tcg_splitwx_to_rx(s->code_ptr);
3818    tcg_out_movi(s, TCG_TYPE_REG, TCG_REG_EAX, 0);
3819
3820    /* TB epilogue */
3821    tb_ret_addr = tcg_splitwx_to_rx(s->code_ptr);
3822
3823    tcg_out_addi(s, TCG_REG_CALL_STACK, stack_addend);
3824
3825    if (have_avx2) {
3826        tcg_out_vex_opc(s, OPC_VZEROUPPER, 0, 0, 0, 0);
3827    }
3828    for (i = ARRAY_SIZE(tcg_target_callee_save_regs) - 1; i >= 0; i--) {
3829        tcg_out_pop(s, tcg_target_callee_save_regs[i]);
3830    }
3831    tcg_out_opc(s, OPC_RET, 0, 0, 0);
3832}
3833
3834static void tcg_out_nop_fill(tcg_insn_unit *p, int count)
3835{
3836    memset(p, 0x90, count);
3837}
3838
3839static void tcg_target_init(TCGContext *s)
3840{
3841#ifdef CONFIG_CPUID_H
3842    unsigned a, b, c, d, b7 = 0;
3843    unsigned max = __get_cpuid_max(0, 0);
3844
3845    if (max >= 7) {
3846        /* BMI1 first appeared with AMD Piledriver and Intel Haswell CPUs.  */
3847        __cpuid_count(7, 0, a, b7, c, d);
3848        have_bmi1 = (b7 & bit_BMI) != 0;
3849        have_bmi2 = (b7 & bit_BMI2) != 0;
3850    }
3851
3852    if (max >= 1) {
3853        __cpuid(1, a, b, c, d);
3854#ifndef have_cmov
3855        /* For 32-bit hosts, we are almost certainly running on hardware
3856           that supports cmov, but we still need to check.  If cmov is not
3857           available, we use a small forward branch instead.  */
3858        have_cmov = (d & bit_CMOV) != 0;
3859#endif
3860
3861        /* MOVBE is not universally available (it first appeared on Intel
3862           Atom and Haswell CPUs), so we need to probe for it.  */
3863        have_movbe = (c & bit_MOVBE) != 0;
3864        have_popcnt = (c & bit_POPCNT) != 0;
3865
3866        /* There are a number of things we must check before we can be
3867           sure that AVX insns will not raise #UD (invalid opcode).  */
3868        if (c & bit_OSXSAVE) {
3869            unsigned xcrl, xcrh;
3870            /* The xgetbv instruction is not available to older versions of
3871             * the assembler, so we encode the instruction manually.
3872             */
3873            asm(".byte 0x0f, 0x01, 0xd0" : "=a" (xcrl), "=d" (xcrh) : "c" (0));
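            /*
             * XCR0 bit 1 is SSE (XMM) state and bit 2 is AVX (YMM) state;
             * both must be enabled by the OS before AVX instructions can
             * execute without faulting.
             */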
3874            if ((xcrl & 6) == 6) {
3875                have_avx1 = (c & bit_AVX) != 0;
3876                have_avx2 = (b7 & bit_AVX2) != 0;
3877            }
3878        }
3879    }
3880
3881    max = __get_cpuid_max(0x80000000, 0);
3882    if (max >= 0x80000001) {
3883        __cpuid(0x80000001, a, b, c, d);
3884        /* LZCNT was introduced with AMD Barcelona and Intel Haswell CPUs.  */
3885        have_lzcnt = (c & bit_LZCNT) != 0;
3886    }
3887#endif /* CONFIG_CPUID_H */
3888
3889    tcg_target_available_regs[TCG_TYPE_I32] = ALL_GENERAL_REGS;
3890    if (TCG_TARGET_REG_BITS == 64) {
3891        tcg_target_available_regs[TCG_TYPE_I64] = ALL_GENERAL_REGS;
3892    }
3893    if (have_avx1) {
3894        tcg_target_available_regs[TCG_TYPE_V64] = ALL_VECTOR_REGS;
3895        tcg_target_available_regs[TCG_TYPE_V128] = ALL_VECTOR_REGS;
3896    }
3897    if (have_avx2) {
3898        tcg_target_available_regs[TCG_TYPE_V256] = ALL_VECTOR_REGS;
3899    }
3900
3901    tcg_target_call_clobber_regs = ALL_VECTOR_REGS;
3902    tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_EAX);
3903    tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_EDX);
3904    tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_ECX);
3905    if (TCG_TARGET_REG_BITS == 64) {
3906#if !defined(_WIN64)
3907        tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_RDI);
3908        tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_RSI);
3909#endif
3910        tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_R8);
3911        tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_R9);
3912        tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_R10);
3913        tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_R11);
3914    }
3915
3916    s->reserved_regs = 0;
3917    tcg_regset_set_reg(s->reserved_regs, TCG_REG_CALL_STACK);
3918}
3919
3920typedef struct {
3921    DebugFrameHeader h;
3922    uint8_t fde_def_cfa[4];
3923    uint8_t fde_reg_ofs[14];
3924} DebugFrame;
3925
3926/* We're expecting a 2 byte uleb128 encoded value.  */
3927QEMU_BUILD_BUG_ON(FRAME_SIZE >= (1 << 14));
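/*
 * In that two-byte form the first byte carries the low 7 bits with the
 * continuation bit set and the second byte the remaining bits; e.g. a
 * FRAME_SIZE of 0x1a8 would encode as 0xa8, 0x03.
 */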
3928
3929#if !defined(__ELF__)
3930    /* Host machine without ELF. */
3931#elif TCG_TARGET_REG_BITS == 64
3932#define ELF_HOST_MACHINE EM_X86_64
3933static const DebugFrame debug_frame = {
3934    .h.cie.len = sizeof(DebugFrameCIE)-4, /* length after .len member */
3935    .h.cie.id = -1,
3936    .h.cie.version = 1,
3937    .h.cie.code_align = 1,
3938    .h.cie.data_align = 0x78,             /* sleb128 -8 */
3939    .h.cie.return_column = 16,
3940
3941    /* Total FDE size does not include the "len" member.  */
3942    .h.fde.len = sizeof(DebugFrame) - offsetof(DebugFrame, h.fde.cie_offset),
3943
3944    .fde_def_cfa = {
3945        12, 7,                          /* DW_CFA_def_cfa %rsp, ... */
3946        (FRAME_SIZE & 0x7f) | 0x80,     /* ... uleb128 FRAME_SIZE */
3947        (FRAME_SIZE >> 7)
3948    },
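    /*
     * Each DW_CFA_offset entry below is 0x80 | dwarf-register-number,
     * followed by a uleb128 factored offset that is multiplied by
     * data_align (-8) to give the save slot relative to the CFA; e.g.
     * 0x86, 2 places %rbp at CFA-16.
     */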
3949    .fde_reg_ofs = {
3950        0x90, 1,                        /* DW_CFA_offset, %rip, -8 */
3951        /* The following ordering must match tcg_target_callee_save_regs.  */
3952        0x86, 2,                        /* DW_CFA_offset, %rbp, -16 */
3953        0x83, 3,                        /* DW_CFA_offset, %rbx, -24 */
3954        0x8c, 4,                        /* DW_CFA_offset, %r12, -32 */
3955        0x8d, 5,                        /* DW_CFA_offset, %r13, -40 */
3956        0x8e, 6,                        /* DW_CFA_offset, %r14, -48 */
3957        0x8f, 7,                        /* DW_CFA_offset, %r15, -56 */
3958    }
3959};
3960#else
3961#define ELF_HOST_MACHINE EM_386
3962static const DebugFrame debug_frame = {
3963    .h.cie.len = sizeof(DebugFrameCIE)-4, /* length after .len member */
3964    .h.cie.id = -1,
3965    .h.cie.version = 1,
3966    .h.cie.code_align = 1,
3967    .h.cie.data_align = 0x7c,             /* sleb128 -4 */
3968    .h.cie.return_column = 8,
3969
3970    /* Total FDE size does not include the "len" member.  */
3971    .h.fde.len = sizeof(DebugFrame) - offsetof(DebugFrame, h.fde.cie_offset),
3972
3973    .fde_def_cfa = {
3974        12, 4,                          /* DW_CFA_def_cfa %esp, ... */
3975        (FRAME_SIZE & 0x7f) | 0x80,     /* ... uleb128 FRAME_SIZE */
3976        (FRAME_SIZE >> 7)
3977    },
3978    .fde_reg_ofs = {
3979        0x88, 1,                        /* DW_CFA_offset, %eip, -4 */
3980        /* The following ordering must match tcg_target_callee_save_regs.  */
3981        0x85, 2,                        /* DW_CFA_offset, %ebp, -8 */
3982        0x83, 3,                        /* DW_CFA_offset, %ebx, -12 */
3983        0x86, 4,                        /* DW_CFA_offset, %esi, -16 */
3984        0x87, 5,                        /* DW_CFA_offset, %edi, -20 */
3985    }
3986};
3987#endif
3988
3989#if defined(ELF_HOST_MACHINE)
3990void tcg_register_jit(const void *buf, size_t buf_size)
3991{
3992    tcg_register_jit_int(buf, buf_size, &debug_frame, sizeof(debug_frame));
3993}
3994#endif
3995