/* xref: /qemu/tcg/i386/tcg-target.c.inc (revision d0fb9657) */
/*
 * Tiny Code Generator for QEMU
 *
 * Copyright (c) 2008 Fabrice Bellard
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 * THE SOFTWARE.
 */
24
25#include "../tcg-pool.c.inc"
26
27#ifdef CONFIG_DEBUG_TCG
/* Human-readable register names for debug dumps, indexed by TCGReg.  */
static const char * const tcg_target_reg_names[TCG_TARGET_NB_REGS] = {
#if TCG_TARGET_REG_BITS == 64
    "%rax", "%rcx", "%rdx", "%rbx", "%rsp", "%rbp", "%rsi", "%rdi",
#else
    "%eax", "%ecx", "%edx", "%ebx", "%esp", "%ebp", "%esi", "%edi",
#endif
    "%r8",  "%r9",  "%r10", "%r11", "%r12", "%r13", "%r14", "%r15",
    "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7",
#if TCG_TARGET_REG_BITS == 64
    "%xmm8", "%xmm9", "%xmm10", "%xmm11",
    "%xmm12", "%xmm13", "%xmm14", "%xmm15",
#endif
};
41#endif
42
/* Preference order for the register allocator.  */
static const int tcg_target_reg_alloc_order[] = {
#if TCG_TARGET_REG_BITS == 64
    TCG_REG_RBP,
    TCG_REG_RBX,
    TCG_REG_R12,
    TCG_REG_R13,
    TCG_REG_R14,
    TCG_REG_R15,
    TCG_REG_R10,
    TCG_REG_R11,
    TCG_REG_R9,
    TCG_REG_R8,
    TCG_REG_RCX,
    TCG_REG_RDX,
    TCG_REG_RSI,
    TCG_REG_RDI,
    TCG_REG_RAX,
#else
    TCG_REG_EBX,
    TCG_REG_ESI,
    TCG_REG_EDI,
    TCG_REG_EBP,
    TCG_REG_ECX,
    TCG_REG_EDX,
    TCG_REG_EAX,
#endif
    TCG_REG_XMM0,
    TCG_REG_XMM1,
    TCG_REG_XMM2,
    TCG_REG_XMM3,
    TCG_REG_XMM4,
    TCG_REG_XMM5,
#ifndef _WIN64
    /* The Win64 ABI has xmm6-xmm15 as caller-saves, and we do not save
       any of them.  Therefore only allow xmm0-xmm5 to be allocated.  */
    TCG_REG_XMM6,
    TCG_REG_XMM7,
#if TCG_TARGET_REG_BITS == 64
    TCG_REG_XMM8,
    TCG_REG_XMM9,
    TCG_REG_XMM10,
    TCG_REG_XMM11,
    TCG_REG_XMM12,
    TCG_REG_XMM13,
    TCG_REG_XMM14,
    TCG_REG_XMM15,
#endif
#endif
};
92
/* Registers used to pass integer call arguments, in host-ABI order
   (Win64 vs SysV differ in the first four slots).  */
static const int tcg_target_call_iarg_regs[] = {
#if TCG_TARGET_REG_BITS == 64
#if defined(_WIN64)
    TCG_REG_RCX,
    TCG_REG_RDX,
#else
    TCG_REG_RDI,
    TCG_REG_RSI,
    TCG_REG_RDX,
    TCG_REG_RCX,
#endif
    TCG_REG_R8,
    TCG_REG_R9,
#else
    /* 32 bit mode uses stack based calling convention (GCC default). */
#endif
};
110
/* Registers holding call return values.  EDX is included only for
   32-bit hosts, where 64-bit results need a second register.  */
static const int tcg_target_call_oarg_regs[] = {
    TCG_REG_EAX,
#if TCG_TARGET_REG_BITS == 32
    TCG_REG_EDX
#endif
};
117
118/* Constants we accept.  */
119#define TCG_CT_CONST_S32 0x100
120#define TCG_CT_CONST_U32 0x200
121#define TCG_CT_CONST_I32 0x400
122#define TCG_CT_CONST_WSZ 0x800
123
124/* Registers used with L constraint, which are the first argument
125   registers on x86_64, and two random call clobbered registers on
126   i386. */
127#if TCG_TARGET_REG_BITS == 64
128# define TCG_REG_L0 tcg_target_call_iarg_regs[0]
129# define TCG_REG_L1 tcg_target_call_iarg_regs[1]
130#else
131# define TCG_REG_L0 TCG_REG_EAX
132# define TCG_REG_L1 TCG_REG_EDX
133#endif
134
135#define ALL_BYTEH_REGS         0x0000000fu
136#if TCG_TARGET_REG_BITS == 64
137# define ALL_GENERAL_REGS      0x0000ffffu
138# define ALL_VECTOR_REGS       0xffff0000u
139# define ALL_BYTEL_REGS        ALL_GENERAL_REGS
140#else
141# define ALL_GENERAL_REGS      0x000000ffu
142# define ALL_VECTOR_REGS       0x00ff0000u
143# define ALL_BYTEL_REGS        ALL_BYTEH_REGS
144#endif
145#ifdef CONFIG_SOFTMMU
146# define SOFTMMU_RESERVE_REGS  ((1 << TCG_REG_L0) | (1 << TCG_REG_L1))
147#else
148# define SOFTMMU_RESERVE_REGS  0
149#endif
150
151/* The host compiler should supply <cpuid.h> to enable runtime features
152   detection, as we're not going to go so far as our own inline assembly.
153   If not available, default values will be assumed.  */
154#if defined(CONFIG_CPUID_H)
155#include "qemu/cpuid.h"
156#endif
157
/* For 64-bit, we always know that CMOV is available.  */
#if TCG_TARGET_REG_BITS == 64
# define have_cmov 1
#elif defined(CONFIG_CPUID_H)
static bool have_cmov;
#else
# define have_cmov 0
#endif

/* We need these symbols in tcg-target.h, and we can't properly conditionalize
   it there.  Therefore we always define the variable.  */
bool have_bmi1;
bool have_popcnt;
bool have_avx1;
bool have_avx2;
bool have_movbe;

/* Feature flags used only within this file; forced to constant false
   when <cpuid.h> is unavailable for runtime detection.  */
#ifdef CONFIG_CPUID_H
static bool have_bmi2;
static bool have_lzcnt;
#else
# define have_bmi2 0
# define have_lzcnt 0
#endif

/* NOTE(review): assigned outside this chunk; per its name, the address
   that generated code returns to -- confirm against the prologue code.  */
static const tcg_insn_unit *tb_ret_addr;
184
185static bool patch_reloc(tcg_insn_unit *code_ptr, int type,
186                        intptr_t value, intptr_t addend)
187{
188    value += addend;
189    switch(type) {
190    case R_386_PC32:
191        value -= (uintptr_t)tcg_splitwx_to_rx(code_ptr);
192        if (value != (int32_t)value) {
193            return false;
194        }
195        /* FALLTHRU */
196    case R_386_32:
197        tcg_patch32(code_ptr, value);
198        break;
199    case R_386_PC8:
200        value -= (uintptr_t)tcg_splitwx_to_rx(code_ptr);
201        if (value != (int8_t)value) {
202            return false;
203        }
204        tcg_patch8(code_ptr, value);
205        break;
206    default:
207        tcg_abort();
208    }
209    return true;
210}
211
212/* test if a constant matches the constraint */
213static inline int tcg_target_const_match(tcg_target_long val, TCGType type,
214                                         const TCGArgConstraint *arg_ct)
215{
216    int ct = arg_ct->ct;
217    if (ct & TCG_CT_CONST) {
218        return 1;
219    }
220    if (type == TCG_TYPE_I32) {
221        if (ct & (TCG_CT_CONST_S32 | TCG_CT_CONST_U32 | TCG_CT_CONST_I32)) {
222            return 1;
223        }
224    } else {
225        if ((ct & TCG_CT_CONST_S32) && val == (int32_t)val) {
226            return 1;
227        }
228        if ((ct & TCG_CT_CONST_U32) && val == (uint32_t)val) {
229            return 1;
230        }
231        if ((ct & TCG_CT_CONST_I32) && ~val == (int32_t)~val) {
232            return 1;
233        }
234    }
235    if ((ct & TCG_CT_CONST_WSZ) && val == (type == TCG_TYPE_I32 ? 32 : 64)) {
236        return 1;
237    }
238    return 0;
239}
240
241# define LOWREGMASK(x)	((x) & 7)
242
243#define P_EXT		0x100		/* 0x0f opcode prefix */
244#define P_EXT38         0x200           /* 0x0f 0x38 opcode prefix */
245#define P_DATA16        0x400           /* 0x66 opcode prefix */
246#if TCG_TARGET_REG_BITS == 64
247# define P_REXW         0x1000          /* Set REX.W = 1 */
248# define P_REXB_R       0x2000          /* REG field as byte register */
249# define P_REXB_RM      0x4000          /* R/M field as byte register */
250# define P_GS           0x8000          /* gs segment override */
251#else
252# define P_REXW		0
253# define P_REXB_R	0
254# define P_REXB_RM	0
255# define P_GS           0
256#endif
257#define P_EXT3A         0x10000         /* 0x0f 0x3a opcode prefix */
258#define P_SIMDF3        0x20000         /* 0xf3 opcode prefix */
259#define P_SIMDF2        0x40000         /* 0xf2 opcode prefix */
260#define P_VEXL          0x80000         /* Set VEX.L = 1 */
261
262#define OPC_ARITH_EvIz	(0x81)
263#define OPC_ARITH_EvIb	(0x83)
264#define OPC_ARITH_GvEv	(0x03)		/* ... plus (ARITH_FOO << 3) */
265#define OPC_ANDN        (0xf2 | P_EXT38)
266#define OPC_ADD_GvEv	(OPC_ARITH_GvEv | (ARITH_ADD << 3))
267#define OPC_AND_GvEv    (OPC_ARITH_GvEv | (ARITH_AND << 3))
268#define OPC_BLENDPS     (0x0c | P_EXT3A | P_DATA16)
269#define OPC_BSF         (0xbc | P_EXT)
270#define OPC_BSR         (0xbd | P_EXT)
271#define OPC_BSWAP	(0xc8 | P_EXT)
272#define OPC_CALL_Jz	(0xe8)
273#define OPC_CMOVCC      (0x40 | P_EXT)  /* ... plus condition code */
274#define OPC_CMP_GvEv	(OPC_ARITH_GvEv | (ARITH_CMP << 3))
275#define OPC_DEC_r32	(0x48)
276#define OPC_IMUL_GvEv	(0xaf | P_EXT)
277#define OPC_IMUL_GvEvIb	(0x6b)
278#define OPC_IMUL_GvEvIz	(0x69)
279#define OPC_INC_r32	(0x40)
280#define OPC_JCC_long	(0x80 | P_EXT)	/* ... plus condition code */
281#define OPC_JCC_short	(0x70)		/* ... plus condition code */
282#define OPC_JMP_long	(0xe9)
283#define OPC_JMP_short	(0xeb)
284#define OPC_LEA         (0x8d)
285#define OPC_LZCNT       (0xbd | P_EXT | P_SIMDF3)
286#define OPC_MOVB_EvGv	(0x88)		/* stores, more or less */
287#define OPC_MOVL_EvGv	(0x89)		/* stores, more or less */
288#define OPC_MOVL_GvEv	(0x8b)		/* loads, more or less */
289#define OPC_MOVB_EvIz   (0xc6)
290#define OPC_MOVL_EvIz	(0xc7)
291#define OPC_MOVL_Iv     (0xb8)
292#define OPC_MOVBE_GyMy  (0xf0 | P_EXT38)
293#define OPC_MOVBE_MyGy  (0xf1 | P_EXT38)
294#define OPC_MOVD_VyEy   (0x6e | P_EXT | P_DATA16)
295#define OPC_MOVD_EyVy   (0x7e | P_EXT | P_DATA16)
296#define OPC_MOVDDUP     (0x12 | P_EXT | P_SIMDF2)
297#define OPC_MOVDQA_VxWx (0x6f | P_EXT | P_DATA16)
298#define OPC_MOVDQA_WxVx (0x7f | P_EXT | P_DATA16)
299#define OPC_MOVDQU_VxWx (0x6f | P_EXT | P_SIMDF3)
300#define OPC_MOVDQU_WxVx (0x7f | P_EXT | P_SIMDF3)
301#define OPC_MOVQ_VqWq   (0x7e | P_EXT | P_SIMDF3)
302#define OPC_MOVQ_WqVq   (0xd6 | P_EXT | P_DATA16)
303#define OPC_MOVSBL	(0xbe | P_EXT)
304#define OPC_MOVSWL	(0xbf | P_EXT)
305#define OPC_MOVSLQ	(0x63 | P_REXW)
306#define OPC_MOVZBL	(0xb6 | P_EXT)
307#define OPC_MOVZWL	(0xb7 | P_EXT)
308#define OPC_PABSB       (0x1c | P_EXT38 | P_DATA16)
309#define OPC_PABSW       (0x1d | P_EXT38 | P_DATA16)
310#define OPC_PABSD       (0x1e | P_EXT38 | P_DATA16)
311#define OPC_PACKSSDW    (0x6b | P_EXT | P_DATA16)
312#define OPC_PACKSSWB    (0x63 | P_EXT | P_DATA16)
313#define OPC_PACKUSDW    (0x2b | P_EXT38 | P_DATA16)
314#define OPC_PACKUSWB    (0x67 | P_EXT | P_DATA16)
315#define OPC_PADDB       (0xfc | P_EXT | P_DATA16)
316#define OPC_PADDW       (0xfd | P_EXT | P_DATA16)
317#define OPC_PADDD       (0xfe | P_EXT | P_DATA16)
318#define OPC_PADDQ       (0xd4 | P_EXT | P_DATA16)
319#define OPC_PADDSB      (0xec | P_EXT | P_DATA16)
320#define OPC_PADDSW      (0xed | P_EXT | P_DATA16)
321#define OPC_PADDUB      (0xdc | P_EXT | P_DATA16)
322#define OPC_PADDUW      (0xdd | P_EXT | P_DATA16)
323#define OPC_PAND        (0xdb | P_EXT | P_DATA16)
324#define OPC_PANDN       (0xdf | P_EXT | P_DATA16)
325#define OPC_PBLENDW     (0x0e | P_EXT3A | P_DATA16)
326#define OPC_PCMPEQB     (0x74 | P_EXT | P_DATA16)
327#define OPC_PCMPEQW     (0x75 | P_EXT | P_DATA16)
328#define OPC_PCMPEQD     (0x76 | P_EXT | P_DATA16)
329#define OPC_PCMPEQQ     (0x29 | P_EXT38 | P_DATA16)
330#define OPC_PCMPGTB     (0x64 | P_EXT | P_DATA16)
331#define OPC_PCMPGTW     (0x65 | P_EXT | P_DATA16)
332#define OPC_PCMPGTD     (0x66 | P_EXT | P_DATA16)
333#define OPC_PCMPGTQ     (0x37 | P_EXT38 | P_DATA16)
334#define OPC_PMAXSB      (0x3c | P_EXT38 | P_DATA16)
335#define OPC_PMAXSW      (0xee | P_EXT | P_DATA16)
336#define OPC_PMAXSD      (0x3d | P_EXT38 | P_DATA16)
337#define OPC_PMAXUB      (0xde | P_EXT | P_DATA16)
338#define OPC_PMAXUW      (0x3e | P_EXT38 | P_DATA16)
339#define OPC_PMAXUD      (0x3f | P_EXT38 | P_DATA16)
340#define OPC_PMINSB      (0x38 | P_EXT38 | P_DATA16)
341#define OPC_PMINSW      (0xea | P_EXT | P_DATA16)
342#define OPC_PMINSD      (0x39 | P_EXT38 | P_DATA16)
343#define OPC_PMINUB      (0xda | P_EXT | P_DATA16)
344#define OPC_PMINUW      (0x3a | P_EXT38 | P_DATA16)
345#define OPC_PMINUD      (0x3b | P_EXT38 | P_DATA16)
346#define OPC_PMOVSXBW    (0x20 | P_EXT38 | P_DATA16)
347#define OPC_PMOVSXWD    (0x23 | P_EXT38 | P_DATA16)
348#define OPC_PMOVSXDQ    (0x25 | P_EXT38 | P_DATA16)
349#define OPC_PMOVZXBW    (0x30 | P_EXT38 | P_DATA16)
350#define OPC_PMOVZXWD    (0x33 | P_EXT38 | P_DATA16)
351#define OPC_PMOVZXDQ    (0x35 | P_EXT38 | P_DATA16)
352#define OPC_PMULLW      (0xd5 | P_EXT | P_DATA16)
353#define OPC_PMULLD      (0x40 | P_EXT38 | P_DATA16)
354#define OPC_POR         (0xeb | P_EXT | P_DATA16)
355#define OPC_PSHUFB      (0x00 | P_EXT38 | P_DATA16)
356#define OPC_PSHUFD      (0x70 | P_EXT | P_DATA16)
357#define OPC_PSHUFLW     (0x70 | P_EXT | P_SIMDF2)
358#define OPC_PSHUFHW     (0x70 | P_EXT | P_SIMDF3)
359#define OPC_PSHIFTW_Ib  (0x71 | P_EXT | P_DATA16) /* /2 /6 /4 */
360#define OPC_PSHIFTD_Ib  (0x72 | P_EXT | P_DATA16) /* /2 /6 /4 */
361#define OPC_PSHIFTQ_Ib  (0x73 | P_EXT | P_DATA16) /* /2 /6 /4 */
362#define OPC_PSLLW       (0xf1 | P_EXT | P_DATA16)
363#define OPC_PSLLD       (0xf2 | P_EXT | P_DATA16)
364#define OPC_PSLLQ       (0xf3 | P_EXT | P_DATA16)
365#define OPC_PSRAW       (0xe1 | P_EXT | P_DATA16)
366#define OPC_PSRAD       (0xe2 | P_EXT | P_DATA16)
367#define OPC_PSRLW       (0xd1 | P_EXT | P_DATA16)
368#define OPC_PSRLD       (0xd2 | P_EXT | P_DATA16)
369#define OPC_PSRLQ       (0xd3 | P_EXT | P_DATA16)
370#define OPC_PSUBB       (0xf8 | P_EXT | P_DATA16)
371#define OPC_PSUBW       (0xf9 | P_EXT | P_DATA16)
372#define OPC_PSUBD       (0xfa | P_EXT | P_DATA16)
373#define OPC_PSUBQ       (0xfb | P_EXT | P_DATA16)
374#define OPC_PSUBSB      (0xe8 | P_EXT | P_DATA16)
375#define OPC_PSUBSW      (0xe9 | P_EXT | P_DATA16)
376#define OPC_PSUBUB      (0xd8 | P_EXT | P_DATA16)
377#define OPC_PSUBUW      (0xd9 | P_EXT | P_DATA16)
378#define OPC_PUNPCKLBW   (0x60 | P_EXT | P_DATA16)
379#define OPC_PUNPCKLWD   (0x61 | P_EXT | P_DATA16)
380#define OPC_PUNPCKLDQ   (0x62 | P_EXT | P_DATA16)
381#define OPC_PUNPCKLQDQ  (0x6c | P_EXT | P_DATA16)
382#define OPC_PUNPCKHBW   (0x68 | P_EXT | P_DATA16)
383#define OPC_PUNPCKHWD   (0x69 | P_EXT | P_DATA16)
384#define OPC_PUNPCKHDQ   (0x6a | P_EXT | P_DATA16)
385#define OPC_PUNPCKHQDQ  (0x6d | P_EXT | P_DATA16)
386#define OPC_PXOR        (0xef | P_EXT | P_DATA16)
387#define OPC_POP_r32	(0x58)
388#define OPC_POPCNT      (0xb8 | P_EXT | P_SIMDF3)
389#define OPC_PUSH_r32	(0x50)
390#define OPC_PUSH_Iv	(0x68)
391#define OPC_PUSH_Ib	(0x6a)
392#define OPC_RET		(0xc3)
393#define OPC_SETCC	(0x90 | P_EXT | P_REXB_RM) /* ... plus cc */
394#define OPC_SHIFT_1	(0xd1)
395#define OPC_SHIFT_Ib	(0xc1)
396#define OPC_SHIFT_cl	(0xd3)
397#define OPC_SARX        (0xf7 | P_EXT38 | P_SIMDF3)
398#define OPC_SHUFPS      (0xc6 | P_EXT)
399#define OPC_SHLX        (0xf7 | P_EXT38 | P_DATA16)
400#define OPC_SHRX        (0xf7 | P_EXT38 | P_SIMDF2)
401#define OPC_SHRD_Ib     (0xac | P_EXT)
402#define OPC_TESTL	(0x85)
403#define OPC_TZCNT       (0xbc | P_EXT | P_SIMDF3)
404#define OPC_UD2         (0x0b | P_EXT)
405#define OPC_VPBLENDD    (0x02 | P_EXT3A | P_DATA16)
406#define OPC_VPBLENDVB   (0x4c | P_EXT3A | P_DATA16)
407#define OPC_VPINSRB     (0x20 | P_EXT3A | P_DATA16)
408#define OPC_VPINSRW     (0xc4 | P_EXT | P_DATA16)
409#define OPC_VBROADCASTSS (0x18 | P_EXT38 | P_DATA16)
410#define OPC_VBROADCASTSD (0x19 | P_EXT38 | P_DATA16)
411#define OPC_VPBROADCASTB (0x78 | P_EXT38 | P_DATA16)
412#define OPC_VPBROADCASTW (0x79 | P_EXT38 | P_DATA16)
413#define OPC_VPBROADCASTD (0x58 | P_EXT38 | P_DATA16)
414#define OPC_VPBROADCASTQ (0x59 | P_EXT38 | P_DATA16)
415#define OPC_VPERMQ      (0x00 | P_EXT3A | P_DATA16 | P_REXW)
416#define OPC_VPERM2I128  (0x46 | P_EXT3A | P_DATA16 | P_VEXL)
417#define OPC_VPSLLVD     (0x47 | P_EXT38 | P_DATA16)
418#define OPC_VPSLLVQ     (0x47 | P_EXT38 | P_DATA16 | P_REXW)
419#define OPC_VPSRAVD     (0x46 | P_EXT38 | P_DATA16)
420#define OPC_VPSRLVD     (0x45 | P_EXT38 | P_DATA16)
421#define OPC_VPSRLVQ     (0x45 | P_EXT38 | P_DATA16 | P_REXW)
422#define OPC_VZEROUPPER  (0x77 | P_EXT)
423#define OPC_XCHG_ax_r32	(0x90)
424
425#define OPC_GRP3_Ev	(0xf7)
426#define OPC_GRP5	(0xff)
427#define OPC_GRP14       (0x73 | P_EXT | P_DATA16)
428
429/* Group 1 opcode extensions for 0x80-0x83.
430   These are also used as modifiers for OPC_ARITH.  */
431#define ARITH_ADD 0
432#define ARITH_OR  1
433#define ARITH_ADC 2
434#define ARITH_SBB 3
435#define ARITH_AND 4
436#define ARITH_SUB 5
437#define ARITH_XOR 6
438#define ARITH_CMP 7
439
440/* Group 2 opcode extensions for 0xc0, 0xc1, 0xd0-0xd3.  */
441#define SHIFT_ROL 0
442#define SHIFT_ROR 1
443#define SHIFT_SHL 4
444#define SHIFT_SHR 5
445#define SHIFT_SAR 7
446
447/* Group 3 opcode extensions for 0xf6, 0xf7.  To be used with OPC_GRP3.  */
448#define EXT3_NOT   2
449#define EXT3_NEG   3
450#define EXT3_MUL   4
451#define EXT3_IMUL  5
452#define EXT3_DIV   6
453#define EXT3_IDIV  7
454
455/* Group 5 opcode extensions for 0xff.  To be used with OPC_GRP5.  */
456#define EXT5_INC_Ev	0
457#define EXT5_DEC_Ev	1
458#define EXT5_CALLN_Ev	2
459#define EXT5_JMPN_Ev	4
460
461/* Condition codes to be added to OPC_JCC_{long,short}.  */
462#define JCC_JMP (-1)
463#define JCC_JO  0x0
464#define JCC_JNO 0x1
465#define JCC_JB  0x2
466#define JCC_JAE 0x3
467#define JCC_JE  0x4
468#define JCC_JNE 0x5
469#define JCC_JBE 0x6
470#define JCC_JA  0x7
471#define JCC_JS  0x8
472#define JCC_JNS 0x9
473#define JCC_JP  0xa
474#define JCC_JNP 0xb
475#define JCC_JL  0xc
476#define JCC_JGE 0xd
477#define JCC_JLE 0xe
478#define JCC_JG  0xf
479
/* Map TCG comparison conditions to x86 JCC condition-code nibbles.  */
static const uint8_t tcg_cond_to_jcc[] = {
    [TCG_COND_EQ] = JCC_JE,
    [TCG_COND_NE] = JCC_JNE,
    [TCG_COND_LT] = JCC_JL,
    [TCG_COND_GE] = JCC_JGE,
    [TCG_COND_LE] = JCC_JLE,
    [TCG_COND_GT] = JCC_JG,
    [TCG_COND_LTU] = JCC_JB,
    [TCG_COND_GEU] = JCC_JAE,
    [TCG_COND_LEU] = JCC_JBE,
    [TCG_COND_GTU] = JCC_JA,
};
492
493#if TCG_TARGET_REG_BITS == 64
/* Emit legacy prefixes, an optional REX byte, and the (possibly escaped)
   opcode byte(s) for OPC.  R, RM and X supply the high bits for REX.R,
   REX.B and REX.X respectively; their low three bits are encoded later
   in the ModRM/SIB bytes by the caller.  */
static void tcg_out_opc(TCGContext *s, int opc, int r, int rm, int x)
{
    int rex;

    if (opc & P_GS) {
        /* %gs segment override prefix.  */
        tcg_out8(s, 0x65);
    }
    if (opc & P_DATA16) {
        /* We should never be asking for both 16 and 64-bit operation.  */
        tcg_debug_assert((opc & P_REXW) == 0);
        tcg_out8(s, 0x66);
    }
    if (opc & P_SIMDF3) {
        tcg_out8(s, 0xf3);
    } else if (opc & P_SIMDF2) {
        tcg_out8(s, 0xf2);
    }

    rex = 0;
    rex |= (opc & P_REXW) ? 0x8 : 0x0;  /* REX.W */
    rex |= (r & 8) >> 1;                /* REX.R */
    rex |= (x & 8) >> 2;                /* REX.X */
    rex |= (rm & 8) >> 3;               /* REX.B */

    /* P_REXB_{R,RM} indicates that the given register is the low byte.
       For %[abcd]l we need no REX prefix, but for %{si,di,bp,sp}l we do,
       as otherwise the encoding indicates %[abcd]h.  Note that the values
       that are ORed in merely indicate that the REX byte must be present;
       those bits get discarded in output.  */
    rex |= opc & (r >= 4 ? P_REXB_R : 0);
    rex |= opc & (rm >= 4 ? P_REXB_RM : 0);

    if (rex) {
        tcg_out8(s, (uint8_t)(rex | 0x40));
    }

    /* 0x0f escape, optionally followed by a 0x38/0x3a second escape.  */
    if (opc & (P_EXT | P_EXT38 | P_EXT3A)) {
        tcg_out8(s, 0x0f);
        if (opc & P_EXT38) {
            tcg_out8(s, 0x38);
        } else if (opc & P_EXT3A) {
            tcg_out8(s, 0x3a);
        }
    }

    tcg_out8(s, opc);
}
541#else
/* 32-bit host variant: no REX prefix exists, so only the legacy
   prefixes and 0x0f escape bytes are emitted before the opcode.  */
static void tcg_out_opc(TCGContext *s, int opc)
{
    if (opc & P_DATA16) {
        tcg_out8(s, 0x66);
    }
    if (opc & P_SIMDF3) {
        tcg_out8(s, 0xf3);
    } else if (opc & P_SIMDF2) {
        tcg_out8(s, 0xf2);
    }
    if (opc & (P_EXT | P_EXT38 | P_EXT3A)) {
        tcg_out8(s, 0x0f);
        if (opc & P_EXT38) {
            tcg_out8(s, 0x38);
        } else if (opc & P_EXT3A) {
            tcg_out8(s, 0x3a);
        }
    }
    tcg_out8(s, opc);
}
/* Discard the register arguments to tcg_out_opc early, so as not to penalize
   the 32-bit compilation paths.  This method works with all versions of gcc,
   whereas relying on optimization may not be able to exclude them.  */
#define tcg_out_opc(s, opc, r, rm, x)  (tcg_out_opc)(s, opc)
566#endif
567
568static void tcg_out_modrm(TCGContext *s, int opc, int r, int rm)
569{
570    tcg_out_opc(s, opc, r, rm, 0);
571    tcg_out8(s, 0xc0 | (LOWREGMASK(r) << 3) | LOWREGMASK(rm));
572}
573
/* Emit a VEX prefix (two- or three-byte form) followed by the opcode
   byte for OPC.  R feeds VEX.R, V feeds VEX.vvvv (the second source
   operand), RM feeds VEX.B and INDEX feeds VEX.X.  Note VEX stores
   these register bits inverted.  */
static void tcg_out_vex_opc(TCGContext *s, int opc, int r, int v,
                            int rm, int index)
{
    int tmp;

    /* Use the two byte form if possible, which cannot encode
       VEX.W, VEX.B, VEX.X, or an m-mmmm field other than P_EXT.  */
    if ((opc & (P_EXT | P_EXT38 | P_EXT3A | P_REXW)) == P_EXT
        && ((rm | index) & 8) == 0) {
        /* Two byte VEX prefix.  */
        tcg_out8(s, 0xc5);

        tmp = (r & 8 ? 0 : 0x80);              /* VEX.R */
    } else {
        /* Three byte VEX prefix.  */
        tcg_out8(s, 0xc4);

        /* VEX.m-mmmm */
        if (opc & P_EXT3A) {
            tmp = 3;
        } else if (opc & P_EXT38) {
            tmp = 2;
        } else if (opc & P_EXT) {
            tmp = 1;
        } else {
            g_assert_not_reached();
        }
        tmp |= (r & 8 ? 0 : 0x80);             /* VEX.R */
        tmp |= (index & 8 ? 0 : 0x40);         /* VEX.X */
        tmp |= (rm & 8 ? 0 : 0x20);            /* VEX.B */
        tcg_out8(s, tmp);

        tmp = (opc & P_REXW ? 0x80 : 0);       /* VEX.W */
    }

    tmp |= (opc & P_VEXL ? 0x04 : 0);      /* VEX.L */
    /* VEX.pp */
    if (opc & P_DATA16) {
        tmp |= 1;                          /* 0x66 */
    } else if (opc & P_SIMDF3) {
        tmp |= 2;                          /* 0xf3 */
    } else if (opc & P_SIMDF2) {
        tmp |= 3;                          /* 0xf2 */
    }
    tmp |= (~v & 15) << 3;                 /* VEX.vvvv */
    tcg_out8(s, tmp);
    tcg_out8(s, opc);
}
622
623static void tcg_out_vex_modrm(TCGContext *s, int opc, int r, int v, int rm)
624{
625    tcg_out_vex_opc(s, opc, r, v, rm, 0);
626    tcg_out8(s, 0xc0 | (LOWREGMASK(r) << 3) | LOWREGMASK(rm));
627}
628
629/* Output an opcode with a full "rm + (index<<shift) + offset" address mode.
630   We handle either RM and INDEX missing with a negative value.  In 64-bit
631   mode for absolute addresses, ~RM is the size of the immediate operand
632   that will follow the instruction.  */
633
static void tcg_out_sib_offset(TCGContext *s, int r, int rm, int index,
                               int shift, intptr_t offset)
{
    int mod, len;

    if (index < 0 && rm < 0) {
        if (TCG_TARGET_REG_BITS == 64) {
            /* Try for a rip-relative addressing mode.  This has replaced
               the 32-bit-mode absolute addressing encoding.  */
            /* The displacement is relative to the end of the instruction:
               1 modrm byte + 4 displacement bytes emitted below, plus
               ~rm bytes of trailing immediate (see comment above).  */
            intptr_t pc = (intptr_t)s->code_ptr + 5 + ~rm;
            intptr_t disp = offset - pc;
            if (disp == (int32_t)disp) {
                tcg_out8(s, (LOWREGMASK(r) << 3) | 5);
                tcg_out32(s, disp);
                return;
            }

            /* Try for an absolute address encoding.  This requires the
               use of the MODRM+SIB encoding and is therefore larger than
               rip-relative addressing.  */
            if (offset == (int32_t)offset) {
                tcg_out8(s, (LOWREGMASK(r) << 3) | 4);
                tcg_out8(s, (4 << 3) | 5);
                tcg_out32(s, offset);
                return;
            }

            /* ??? The memory isn't directly addressable.  */
            g_assert_not_reached();
        } else {
            /* Absolute address.  */
            tcg_out8(s, (r << 3) | 5);
            tcg_out32(s, offset);
            return;
        }
    }

    /* Find the length of the immediate addend.  Note that the encoding
       that would be used for (%ebp) indicates absolute addressing.  */
    if (rm < 0) {
        mod = 0, len = 4, rm = 5;
    } else if (offset == 0 && LOWREGMASK(rm) != TCG_REG_EBP) {
        mod = 0, len = 0;
    } else if (offset == (int8_t)offset) {
        mod = 0x40, len = 1;            /* disp8 form */
    } else {
        mod = 0x80, len = 4;            /* disp32 form */
    }

    /* Use a single byte MODRM format if possible.  Note that the encoding
       that would be used for %esp is the escape to the two byte form.  */
    if (index < 0 && LOWREGMASK(rm) != TCG_REG_ESP) {
        /* Single byte MODRM format.  */
        tcg_out8(s, mod | (LOWREGMASK(r) << 3) | LOWREGMASK(rm));
    } else {
        /* Two byte MODRM+SIB format.  */

        /* Note that the encoding that would place %esp into the index
           field indicates no index register.  In 64-bit mode, the REX.X
           bit counts, so %r12 can be used as the index.  */
        if (index < 0) {
            index = 4;
        } else {
            tcg_debug_assert(index != TCG_REG_ESP);
        }

        tcg_out8(s, mod | (LOWREGMASK(r) << 3) | 4);
        tcg_out8(s, (shift << 6) | (LOWREGMASK(index) << 3) | LOWREGMASK(rm));
    }

    if (len == 1) {
        tcg_out8(s, offset);
    } else if (len == 4) {
        tcg_out32(s, offset);
    }
}
710
711static void tcg_out_modrm_sib_offset(TCGContext *s, int opc, int r, int rm,
712                                     int index, int shift, intptr_t offset)
713{
714    tcg_out_opc(s, opc, r, rm < 0 ? 0 : rm, index < 0 ? 0 : index);
715    tcg_out_sib_offset(s, r, rm, index, shift, offset);
716}
717
718static void tcg_out_vex_modrm_sib_offset(TCGContext *s, int opc, int r, int v,
719                                         int rm, int index, int shift,
720                                         intptr_t offset)
721{
722    tcg_out_vex_opc(s, opc, r, v, rm < 0 ? 0 : rm, index < 0 ? 0 : index);
723    tcg_out_sib_offset(s, r, rm, index, shift, offset);
724}
725
/* A simplification of the above with no index or shift.  */
static inline void tcg_out_modrm_offset(TCGContext *s, int opc, int r,
                                        int rm, intptr_t offset)
{
    /* index = -1 selects "no index register" in tcg_out_sib_offset.  */
    tcg_out_modrm_sib_offset(s, opc, r, rm, -1, 0, offset);
}
732
/* VEX-prefixed form of the no-index, no-shift address mode.  */
static inline void tcg_out_vex_modrm_offset(TCGContext *s, int opc, int r,
                                            int v, int rm, intptr_t offset)
{
    tcg_out_vex_modrm_sib_offset(s, opc, r, v, rm, -1, 0, offset);
}
738
/* Output an opcode with an expected reference to the constant pool.  */
static inline void tcg_out_modrm_pool(TCGContext *s, int opc, int r)
{
    tcg_out_opc(s, opc, r, 0, 0);
    /* Absolute for 32-bit, pc-relative for 64-bit.  */
    tcg_out8(s, LOWREGMASK(r) << 3 | 5);
    /* Placeholder displacement; callers register a pool label over it.  */
    tcg_out32(s, 0);
}
747
/* Output an opcode with an expected reference to the constant pool.  */
static inline void tcg_out_vex_modrm_pool(TCGContext *s, int opc, int r)
{
    tcg_out_vex_opc(s, opc, r, 0, 0, 0);
    /* Absolute for 32-bit, pc-relative for 64-bit.  */
    tcg_out8(s, LOWREGMASK(r) << 3 | 5);
    /* Placeholder displacement; callers register a pool label over it.  */
    tcg_out32(s, 0);
}
756
757/* Generate dest op= src.  Uses the same ARITH_* codes as tgen_arithi.  */
758static inline void tgen_arithr(TCGContext *s, int subop, int dest, int src)
759{
760    /* Propagate an opcode prefix, such as P_REXW.  */
761    int ext = subop & ~0x7;
762    subop &= 0x7;
763
764    tcg_out_modrm(s, OPC_ARITH_GvEv + (subop << 3) + ext, dest, src);
765}
766
/* Register-to-register move of TYPE, possibly crossing between the
   general and xmm register files (register numbers >= 16 are xmm).
   Always succeeds and returns true.  */
static bool tcg_out_mov(TCGContext *s, TCGType type, TCGReg ret, TCGReg arg)
{
    int rexw = 0;

    if (arg == ret) {
        return true;
    }
    switch (type) {
    case TCG_TYPE_I64:
        rexw = P_REXW;
        /* fallthru */
    case TCG_TYPE_I32:
        if (ret < 16) {
            if (arg < 16) {
                /* GPR <- GPR */
                tcg_out_modrm(s, OPC_MOVL_GvEv + rexw, ret, arg);
            } else {
                /* GPR <- xmm */
                tcg_out_vex_modrm(s, OPC_MOVD_EyVy + rexw, arg, 0, ret);
            }
        } else {
            if (arg < 16) {
                /* xmm <- GPR */
                tcg_out_vex_modrm(s, OPC_MOVD_VyEy + rexw, ret, 0, arg);
            } else {
                /* xmm <- xmm */
                tcg_out_vex_modrm(s, OPC_MOVQ_VqWq, ret, 0, arg);
            }
        }
        break;

    case TCG_TYPE_V64:
        tcg_debug_assert(ret >= 16 && arg >= 16);
        tcg_out_vex_modrm(s, OPC_MOVQ_VqWq, ret, 0, arg);
        break;
    case TCG_TYPE_V128:
        tcg_debug_assert(ret >= 16 && arg >= 16);
        tcg_out_vex_modrm(s, OPC_MOVDQA_VxWx, ret, 0, arg);
        break;
    case TCG_TYPE_V256:
        tcg_debug_assert(ret >= 16 && arg >= 16);
        /* P_VEXL selects the 256-bit form.  */
        tcg_out_vex_modrm(s, OPC_MOVDQA_VxWx | P_VEXL, ret, 0, arg);
        break;

    default:
        g_assert_not_reached();
    }
    return true;
}
812
/* AVX2 broadcast opcodes, indexed by element size (MO_8 .. MO_64).  */
static const int avx2_dup_insn[4] = {
    OPC_VPBROADCASTB, OPC_VPBROADCASTW,
    OPC_VPBROADCASTD, OPC_VPBROADCASTQ,
};
817
/* Duplicate the low element of A (element size log2 = VECE) into all
   lanes of vector register R.  Always succeeds and returns true.
   Without AVX2 the cases deliberately fall through, widening the
   duplicated element step by step before the final PSHUFD.  */
static bool tcg_out_dup_vec(TCGContext *s, TCGType type, unsigned vece,
                            TCGReg r, TCGReg a)
{
    if (have_avx2) {
        int vex_l = (type == TCG_TYPE_V256 ? P_VEXL : 0);
        tcg_out_vex_modrm(s, avx2_dup_insn[vece] + vex_l, r, 0, a);
    } else {
        switch (vece) {
        case MO_8:
            /* ??? With zero in a register, use PSHUFB.  */
            tcg_out_vex_modrm(s, OPC_PUNPCKLBW, r, a, a);
            a = r;
            /* FALLTHRU */
        case MO_16:
            tcg_out_vex_modrm(s, OPC_PUNPCKLWD, r, a, a);
            a = r;
            /* FALLTHRU */
        case MO_32:
            tcg_out_vex_modrm(s, OPC_PSHUFD, r, 0, a);
            /* imm8 operand: all output lanes selected from input lane 0.  */
            tcg_out8(s, 0);
            break;
        case MO_64:
            tcg_out_vex_modrm(s, OPC_PUNPCKLQDQ, r, a, a);
            break;
        default:
            g_assert_not_reached();
        }
    }
    return true;
}
849
/* Duplicate the VECE-sized element at memory BASE+OFFSET into all lanes
   of vector register R.  Always succeeds and returns true.  */
static bool tcg_out_dupm_vec(TCGContext *s, TCGType type, unsigned vece,
                             TCGReg r, TCGReg base, intptr_t offset)
{
    if (have_avx2) {
        int vex_l = (type == TCG_TYPE_V256 ? P_VEXL : 0);
        tcg_out_vex_modrm_offset(s, avx2_dup_insn[vece] + vex_l,
                                 r, 0, base, offset);
    } else {
        switch (vece) {
        case MO_64:
            tcg_out_vex_modrm_offset(s, OPC_MOVDDUP, r, 0, base, offset);
            break;
        case MO_32:
            tcg_out_vex_modrm_offset(s, OPC_VBROADCASTSS, r, 0, base, offset);
            break;
        case MO_16:
            /* Insert the element into lane 0, then broadcast from there.  */
            tcg_out_vex_modrm_offset(s, OPC_VPINSRW, r, r, base, offset);
            tcg_out8(s, 0); /* imm8 */
            tcg_out_dup_vec(s, type, vece, r, r);
            break;
        case MO_8:
            tcg_out_vex_modrm_offset(s, OPC_VPINSRB, r, r, base, offset);
            tcg_out8(s, 0); /* imm8 */
            tcg_out_dup_vec(s, type, vece, r, r);
            break;
        default:
            g_assert_not_reached();
        }
    }
    return true;
}
881
/* Load constant ARG, replicated per VECE, into vector register RET,
   using the constant pool for the general case.  */
static void tcg_out_dupi_vec(TCGContext *s, TCGType type, unsigned vece,
                             TCGReg ret, int64_t arg)
{
    int vex_l = (type == TCG_TYPE_V256 ? P_VEXL : 0);

    if (arg == 0) {
        /* All-zeros: xor the register with itself.  */
        tcg_out_vex_modrm(s, OPC_PXOR, ret, ret, ret);
        return;
    }
    if (arg == -1) {
        /* All-ones: compare the register equal to itself.  */
        tcg_out_vex_modrm(s, OPC_PCMPEQB + vex_l, ret, ret, ret);
        return;
    }

    if (TCG_TARGET_REG_BITS == 32 && vece < MO_64) {
        /* Element fits in 32 bits: broadcast one word from the pool.  */
        if (have_avx2) {
            tcg_out_vex_modrm_pool(s, OPC_VPBROADCASTD + vex_l, ret);
        } else {
            tcg_out_vex_modrm_pool(s, OPC_VBROADCASTSS, ret);
        }
        new_pool_label(s, arg, R_386_32, s->code_ptr - 4, 0);
    } else {
        if (type == TCG_TYPE_V64) {
            tcg_out_vex_modrm_pool(s, OPC_MOVQ_VqWq, ret);
        } else if (have_avx2) {
            tcg_out_vex_modrm_pool(s, OPC_VPBROADCASTQ + vex_l, ret);
        } else {
            tcg_out_vex_modrm_pool(s, OPC_MOVDDUP, ret);
        }
        if (TCG_TARGET_REG_BITS == 64) {
            /* The pool reference is rip-relative on 64-bit hosts.  */
            new_pool_label(s, arg, R_386_PC32, s->code_ptr - 4, -4);
        } else {
            /* 32-bit host: store the 64-bit constant as two pool words.  */
            new_pool_l2(s, R_386_32, s->code_ptr - 4, 0, arg, arg >> 32);
        }
    }
}
918
919static void tcg_out_movi_vec(TCGContext *s, TCGType type,
920                             TCGReg ret, tcg_target_long arg)
921{
922    if (arg == 0) {
923        tcg_out_vex_modrm(s, OPC_PXOR, ret, ret, ret);
924        return;
925    }
926    if (arg == -1) {
927        tcg_out_vex_modrm(s, OPC_PCMPEQB, ret, ret, ret);
928        return;
929    }
930
931    int rexw = (type == TCG_TYPE_I32 ? 0 : P_REXW);
932    tcg_out_vex_modrm_pool(s, OPC_MOVD_VyEy + rexw, ret);
933    if (TCG_TARGET_REG_BITS == 64) {
934        new_pool_label(s, arg, R_386_PC32, s->code_ptr - 4, -4);
935    } else {
936        new_pool_label(s, arg, R_386_32, s->code_ptr - 4, 0);
937    }
938}
939
/* Load the scalar constant ARG into integer register RET, choosing the
   shortest available encoding.  */
static void tcg_out_movi_int(TCGContext *s, TCGType type,
                             TCGReg ret, tcg_target_long arg)
{
    tcg_target_long diff;

    if (arg == 0) {
        /* xor r,r: shortest form, but note it clobbers the flags.  */
        tgen_arithr(s, ARITH_XOR, ret, ret);
        return;
    }
    if (arg == (uint32_t)arg || type == TCG_TYPE_I32) {
        /* movl $imm32,%r32; a 32-bit write zero-extends to 64 bits.  */
        tcg_out_opc(s, OPC_MOVL_Iv + LOWREGMASK(ret), 0, ret, 0);
        tcg_out32(s, arg);
        return;
    }
    if (arg == (int32_t)arg) {
        /* movq with sign-extended imm32.  */
        tcg_out_modrm(s, OPC_MOVL_EvIz + P_REXW, 0, ret);
        tcg_out32(s, arg);
        return;
    }

    /* Try a 7 byte pc-relative lea before the 10 byte movq.  */
    diff = tcg_pcrel_diff(s, (const void *)arg) - 7;
    if (diff == (int32_t)diff) {
        tcg_out_opc(s, OPC_LEA | P_REXW, ret, 0, 0);
        /* modrm: reg = ret, rm = 101b selects rip-relative disp32.  */
        tcg_out8(s, (LOWREGMASK(ret) << 3) | 5);
        tcg_out32(s, diff);
        return;
    }

    /* Full 10-byte movabs with a 64-bit immediate.  */
    tcg_out_opc(s, OPC_MOVL_Iv + P_REXW + LOWREGMASK(ret), 0, ret, 0);
    tcg_out64(s, arg);
}
972
973static void tcg_out_movi(TCGContext *s, TCGType type,
974                         TCGReg ret, tcg_target_long arg)
975{
976    switch (type) {
977    case TCG_TYPE_I32:
978#if TCG_TARGET_REG_BITS == 64
979    case TCG_TYPE_I64:
980#endif
981        if (ret < 16) {
982            tcg_out_movi_int(s, type, ret, arg);
983        } else {
984            tcg_out_movi_vec(s, type, ret, arg);
985        }
986        break;
987    default:
988        g_assert_not_reached();
989    }
990}
991
992static inline void tcg_out_pushi(TCGContext *s, tcg_target_long val)
993{
994    if (val == (int8_t)val) {
995        tcg_out_opc(s, OPC_PUSH_Ib, 0, 0, 0);
996        tcg_out8(s, val);
997    } else if (val == (int32_t)val) {
998        tcg_out_opc(s, OPC_PUSH_Iv, 0, 0, 0);
999        tcg_out32(s, val);
1000    } else {
1001        tcg_abort();
1002    }
1003}
1004
1005static inline void tcg_out_mb(TCGContext *s, TCGArg a0)
1006{
1007    /* Given the strength of x86 memory ordering, we only need care for
1008       store-load ordering.  Experimentally, "lock orl $0,0(%esp)" is
1009       faster than "mfence", so don't bother with the sse insn.  */
1010    if (a0 & TCG_MO_ST_LD) {
1011        tcg_out8(s, 0xf0);
1012        tcg_out_modrm_offset(s, OPC_ARITH_EvIb, ARITH_OR, TCG_REG_ESP, 0);
1013        tcg_out8(s, 0);
1014    }
1015}
1016
1017static inline void tcg_out_push(TCGContext *s, int reg)
1018{
1019    tcg_out_opc(s, OPC_PUSH_r32 + LOWREGMASK(reg), 0, reg, 0);
1020}
1021
1022static inline void tcg_out_pop(TCGContext *s, int reg)
1023{
1024    tcg_out_opc(s, OPC_POP_r32 + LOWREGMASK(reg), 0, reg, 0);
1025}
1026
/* Load a value of the given TYPE from ARG1+ARG2 into register RET,
   which may be either a general register (< 16) or an xmm register.  */
static void tcg_out_ld(TCGContext *s, TCGType type, TCGReg ret,
                       TCGReg arg1, intptr_t arg2)
{
    switch (type) {
    case TCG_TYPE_I32:
        if (ret < 16) {
            tcg_out_modrm_offset(s, OPC_MOVL_GvEv, ret, arg1, arg2);
        } else {
            /* 32-bit load into an xmm register.  */
            tcg_out_vex_modrm_offset(s, OPC_MOVD_VyEy, ret, 0, arg1, arg2);
        }
        break;
    case TCG_TYPE_I64:
        if (ret < 16) {
            tcg_out_modrm_offset(s, OPC_MOVL_GvEv | P_REXW, ret, arg1, arg2);
            break;
        }
        /* FALLTHRU */
    case TCG_TYPE_V64:
        /* There is no instruction that can validate 8-byte alignment.  */
        tcg_debug_assert(ret >= 16);
        tcg_out_vex_modrm_offset(s, OPC_MOVQ_VqWq, ret, 0, arg1, arg2);
        break;
    case TCG_TYPE_V128:
        /*
         * The gvec infrastructure asserts that v128 vector loads
         * and stores use a 16-byte aligned offset.  Validate that the
         * final pointer is aligned by using an insn that will SIGSEGV.
         */
        tcg_debug_assert(ret >= 16);
        tcg_out_vex_modrm_offset(s, OPC_MOVDQA_VxWx, ret, 0, arg1, arg2);
        break;
    case TCG_TYPE_V256:
        /*
         * The gvec infrastructure only requires 16-byte alignment,
         * so here we must use an unaligned load.
         */
        tcg_debug_assert(ret >= 16);
        tcg_out_vex_modrm_offset(s, OPC_MOVDQU_VxWx | P_VEXL,
                                 ret, 0, arg1, arg2);
        break;
    default:
        g_assert_not_reached();
    }
}
1071
/* Store register ARG (general or xmm) of the given TYPE to ARG1+ARG2.  */
static void tcg_out_st(TCGContext *s, TCGType type, TCGReg arg,
                       TCGReg arg1, intptr_t arg2)
{
    switch (type) {
    case TCG_TYPE_I32:
        if (arg < 16) {
            tcg_out_modrm_offset(s, OPC_MOVL_EvGv, arg, arg1, arg2);
        } else {
            /* 32-bit store from an xmm register.  */
            tcg_out_vex_modrm_offset(s, OPC_MOVD_EyVy, arg, 0, arg1, arg2);
        }
        break;
    case TCG_TYPE_I64:
        if (arg < 16) {
            tcg_out_modrm_offset(s, OPC_MOVL_EvGv | P_REXW, arg, arg1, arg2);
            break;
        }
        /* FALLTHRU */
    case TCG_TYPE_V64:
        /* There is no instruction that can validate 8-byte alignment.  */
        tcg_debug_assert(arg >= 16);
        tcg_out_vex_modrm_offset(s, OPC_MOVQ_WqVq, arg, 0, arg1, arg2);
        break;
    case TCG_TYPE_V128:
        /*
         * The gvec infrastructure asserts that v128 vector loads
         * and stores use a 16-byte aligned offset.  Validate that the
         * final pointer is aligned by using an insn that will SIGSEGV.
         */
        tcg_debug_assert(arg >= 16);
        tcg_out_vex_modrm_offset(s, OPC_MOVDQA_WxVx, arg, 0, arg1, arg2);
        break;
    case TCG_TYPE_V256:
        /*
         * The gvec infrastructure only requires 16-byte alignment,
         * so here we must use an unaligned store.
         */
        tcg_debug_assert(arg >= 16);
        tcg_out_vex_modrm_offset(s, OPC_MOVDQU_WxVx | P_VEXL,
                                 arg, 0, arg1, arg2);
        break;
    default:
        g_assert_not_reached();
    }
}
1116
1117static bool tcg_out_sti(TCGContext *s, TCGType type, TCGArg val,
1118                        TCGReg base, intptr_t ofs)
1119{
1120    int rexw = 0;
1121    if (TCG_TARGET_REG_BITS == 64 && type == TCG_TYPE_I64) {
1122        if (val != (int32_t)val) {
1123            return false;
1124        }
1125        rexw = P_REXW;
1126    } else if (type != TCG_TYPE_I32) {
1127        return false;
1128    }
1129    tcg_out_modrm_offset(s, OPC_MOVL_EvIz | rexw, 0, base, ofs);
1130    tcg_out32(s, val);
1131    return true;
1132}
1133
1134static void tcg_out_shifti(TCGContext *s, int subopc, int reg, int count)
1135{
1136    /* Propagate an opcode prefix, such as P_DATA16.  */
1137    int ext = subopc & ~0x7;
1138    subopc &= 0x7;
1139
1140    if (count == 1) {
1141        tcg_out_modrm(s, OPC_SHIFT_1 + ext, subopc, reg);
1142    } else {
1143        tcg_out_modrm(s, OPC_SHIFT_Ib + ext, subopc, reg);
1144        tcg_out8(s, count);
1145    }
1146}
1147
/* Emit "bswap %reg" (32-bit); the opcode encodes the register itself.  */
static inline void tcg_out_bswap32(TCGContext *s, int reg)
{
    tcg_out_opc(s, OPC_BSWAP + LOWREGMASK(reg), 0, reg, 0);
}
1152
/* Byte-swap the low 16 bits of REG via "rolw $8, %reg".  */
static inline void tcg_out_rolw_8(TCGContext *s, int reg)
{
    tcg_out_shifti(s, SHIFT_ROL + P_DATA16, reg, 8);
}
1157
/* Zero-extend the low byte of SRC into DEST.  */
static inline void tcg_out_ext8u(TCGContext *s, int dest, int src)
{
    /* movzbl; on i386 only %eax..%ebx have byte subregisters.  */
    tcg_debug_assert(src < 4 || TCG_TARGET_REG_BITS == 64);
    tcg_out_modrm(s, OPC_MOVZBL + P_REXB_RM, dest, src);
}
1164
/* Sign-extend the low byte of SRC into DEST (64-bit wide if REXW).  */
static void tcg_out_ext8s(TCGContext *s, int dest, int src, int rexw)
{
    /* movsbl; on i386 only %eax..%ebx have byte subregisters.  */
    tcg_debug_assert(src < 4 || TCG_TARGET_REG_BITS == 64);
    tcg_out_modrm(s, OPC_MOVSBL + P_REXB_RM + rexw, dest, src);
}
1171
/* Zero-extend the low 16 bits of SRC into DEST.  */
static inline void tcg_out_ext16u(TCGContext *s, int dest, int src)
{
    /* movzwl */
    tcg_out_modrm(s, OPC_MOVZWL, dest, src);
}
1177
/* Sign-extend the low 16 bits of SRC into DEST (64-bit wide if REXW).  */
static inline void tcg_out_ext16s(TCGContext *s, int dest, int src, int rexw)
{
    /* movsw[lq] */
    tcg_out_modrm(s, OPC_MOVSWL + rexw, dest, src);
}
1183
/* Zero-extend the low 32 bits of SRC into DEST.  */
static inline void tcg_out_ext32u(TCGContext *s, int dest, int src)
{
    /* 32-bit mov zero extends.  */
    tcg_out_modrm(s, OPC_MOVL_GvEv, dest, src);
}
1189
/* Sign-extend the low 32 bits of SRC into 64-bit DEST via movslq.  */
static inline void tcg_out_ext32s(TCGContext *s, int dest, int src)
{
    tcg_out_modrm(s, OPC_MOVSLQ, dest, src);
}
1194
/* Emit the 64-bit "bswap %reg" (REX.W form).  */
static inline void tcg_out_bswap64(TCGContext *s, int reg)
{
    tcg_out_opc(s, OPC_BSWAP + P_REXW + LOWREGMASK(reg), 0, reg, 0);
}
1199
/* Emit group-1 arithmetic op C (ARITH_ADD et al, possibly with P_REXW
   folded in) with immediate VAL applied to register R0.  If CF is set,
   the carry flag produced by the operation is live, which rules out the
   INC/DEC shortcut (inc/dec leave CF untouched).  */
static void tgen_arithi(TCGContext *s, int c, int r0,
                        tcg_target_long val, int cf)
{
    int rexw = 0;

    if (TCG_TARGET_REG_BITS == 64) {
        /* Separate the REX.W prefix bits from the operation proper.  */
        rexw = c & -8;
        c &= 7;
    }

    /* ??? While INC is 2 bytes shorter than ADDL $1, they also induce
       partial flags update stalls on Pentium4 and are not recommended
       by current Intel optimization manuals.  */
    if (!cf && (c == ARITH_ADD || c == ARITH_SUB) && (val == 1 || val == -1)) {
        int is_inc = (c == ARITH_ADD) ^ (val < 0);
        if (TCG_TARGET_REG_BITS == 64) {
            /* The single-byte increment encodings are re-tasked as the
               REX prefixes.  Use the MODRM encoding.  */
            tcg_out_modrm(s, OPC_GRP5 + rexw,
                          (is_inc ? EXT5_INC_Ev : EXT5_DEC_Ev), r0);
        } else {
            tcg_out8(s, (is_inc ? OPC_INC_r32 : OPC_DEC_r32) + r0);
        }
        return;
    }

    if (c == ARITH_AND) {
        if (TCG_TARGET_REG_BITS == 64) {
            if (val == 0xffffffffu) {
                /* AND with 2^32-1 is exactly a 32-bit zero-extension.  */
                tcg_out_ext32u(s, r0, r0);
                return;
            }
            if (val == (uint32_t)val) {
                /* AND with no high bits set can use a 32-bit operation.  */
                rexw = 0;
            }
        }
        if (val == 0xffu && (r0 < 4 || TCG_TARGET_REG_BITS == 64)) {
            /* movzbl; needs a byte-addressable register on i386.  */
            tcg_out_ext8u(s, r0, r0);
            return;
        }
        if (val == 0xffffu) {
            /* movzwl */
            tcg_out_ext16u(s, r0, r0);
            return;
        }
    }

    if (val == (int8_t)val) {
        /* Sign-extended 8-bit immediate form.  */
        tcg_out_modrm(s, OPC_ARITH_EvIb + rexw, c, r0);
        tcg_out8(s, val);
        return;
    }
    if (rexw == 0 || val == (int32_t)val) {
        /* 32-bit immediate, sign-extended to 64 bits under REX.W.  */
        tcg_out_modrm(s, OPC_ARITH_EvIz + rexw, c, r0);
        tcg_out32(s, val);
        return;
    }

    /* No single insn encodes a 64-bit op with a 64-bit immediate.  */
    tcg_abort();
}
1260
1261static void tcg_out_addi(TCGContext *s, int reg, tcg_target_long val)
1262{
1263    if (val != 0) {
1264        tgen_arithi(s, ARITH_ADD + P_REXW, reg, val, 0);
1265    }
1266}
1267
1268/* Use SMALL != 0 to force a short forward branch.  */
/* Emit a jump to label L: unconditional when OPC == -1, otherwise a jcc
   with the given condition-code.  Chooses the 8-bit displacement form
   when possible; SMALL forces it for forward branches.  */
static void tcg_out_jxx(TCGContext *s, int opc, TCGLabel *l, int small)
{
    int32_t val, val1;

    if (l->has_value) {
        /* The target address is already known; pick the encoding.  */
        val = tcg_pcrel_diff(s, l->u.value_ptr);
        val1 = val - 2;  /* displacement relative to end of 2-byte insn */
        if ((int8_t)val1 == val1) {
            if (opc == -1) {
                tcg_out8(s, OPC_JMP_short);
            } else {
                tcg_out8(s, OPC_JCC_short + opc);
            }
            tcg_out8(s, val1);
        } else {
            if (small) {
                /* Caller required a short branch but it doesn't reach.  */
                tcg_abort();
            }
            if (opc == -1) {
                /* 5-byte jmp rel32 */
                tcg_out8(s, OPC_JMP_long);
                tcg_out32(s, val - 5);
            } else {
                /* 6-byte 0x0f-prefixed jcc rel32 */
                tcg_out_opc(s, OPC_JCC_long + opc, 0, 0, 0);
                tcg_out32(s, val - 6);
            }
        }
    } else if (small) {
        /* Forward short branch: reloc fills the 8-bit displacement.  */
        if (opc == -1) {
            tcg_out8(s, OPC_JMP_short);
        } else {
            tcg_out8(s, OPC_JCC_short + opc);
        }
        tcg_out_reloc(s, s->code_ptr, R_386_PC8, l, -1);
        s->code_ptr += 1;
    } else {
        /* Forward long branch: reloc fills the 32-bit displacement.  */
        if (opc == -1) {
            tcg_out8(s, OPC_JMP_long);
        } else {
            tcg_out_opc(s, OPC_JCC_long + opc, 0, 0, 0);
        }
        tcg_out_reloc(s, s->code_ptr, R_386_PC32, l, -4);
        s->code_ptr += 4;
    }
}
1313
1314static void tcg_out_cmp(TCGContext *s, TCGArg arg1, TCGArg arg2,
1315                        int const_arg2, int rexw)
1316{
1317    if (const_arg2) {
1318        if (arg2 == 0) {
1319            /* test r, r */
1320            tcg_out_modrm(s, OPC_TESTL + rexw, arg1, arg1);
1321        } else {
1322            tgen_arithi(s, ARITH_CMP + rexw, arg1, arg2, 0);
1323        }
1324    } else {
1325        tgen_arithr(s, ARITH_CMP + rexw, arg1, arg2);
1326    }
1327}
1328
/* Emit a 32-bit compare of ARG1 and ARG2 followed by a conditional
   branch to LABEL; SMALL forces the short branch encoding.  */
static void tcg_out_brcond32(TCGContext *s, TCGCond cond,
                             TCGArg arg1, TCGArg arg2, int const_arg2,
                             TCGLabel *label, int small)
{
    tcg_out_cmp(s, arg1, arg2, const_arg2, 0);
    tcg_out_jxx(s, tcg_cond_to_jcc[cond], label, small);
}
1336
1337#if TCG_TARGET_REG_BITS == 64
/* Emit a 64-bit compare of ARG1 and ARG2 followed by a conditional
   branch to LABEL; SMALL forces the short branch encoding.  */
static void tcg_out_brcond64(TCGContext *s, TCGCond cond,
                             TCGArg arg1, TCGArg arg2, int const_arg2,
                             TCGLabel *label, int small)
{
    tcg_out_cmp(s, arg1, arg2, const_arg2, P_REXW);
    tcg_out_jxx(s, tcg_cond_to_jcc[cond], label, small);
}
1345#else
1346/* XXX: we implement it at the target level to avoid having to
1347   handle cross basic blocks temporaries */
/* Branch on a 64-bit comparison split into 32-bit halves on a 32-bit
   host.  args = { al, ah, bl, bh, cond, label }.  For the orderings,
   first compare the high words; if strictly ordered, branch to the
   target; if unequal (flags still live from that cmp), skip to
   label_next; otherwise decide on an unsigned compare of the low words.  */
static void tcg_out_brcond2(TCGContext *s, const TCGArg *args,
                            const int *const_args, int small)
{
    TCGLabel *label_next = gen_new_label();
    TCGLabel *label_this = arg_label(args[5]);

    switch(args[4]) {
    case TCG_COND_EQ:
        /* Equal only if both halves are equal.  */
        tcg_out_brcond32(s, TCG_COND_NE, args[0], args[2], const_args[2],
                         label_next, 1);
        tcg_out_brcond32(s, TCG_COND_EQ, args[1], args[3], const_args[3],
                         label_this, small);
        break;
    case TCG_COND_NE:
        /* Unequal if either half differs.  */
        tcg_out_brcond32(s, TCG_COND_NE, args[0], args[2], const_args[2],
                         label_this, small);
        tcg_out_brcond32(s, TCG_COND_NE, args[1], args[3], const_args[3],
                         label_this, small);
        break;
    case TCG_COND_LT:
        tcg_out_brcond32(s, TCG_COND_LT, args[1], args[3], const_args[3],
                         label_this, small);
        tcg_out_jxx(s, JCC_JNE, label_next, 1);
        tcg_out_brcond32(s, TCG_COND_LTU, args[0], args[2], const_args[2],
                         label_this, small);
        break;
    case TCG_COND_LE:
        tcg_out_brcond32(s, TCG_COND_LT, args[1], args[3], const_args[3],
                         label_this, small);
        tcg_out_jxx(s, JCC_JNE, label_next, 1);
        tcg_out_brcond32(s, TCG_COND_LEU, args[0], args[2], const_args[2],
                         label_this, small);
        break;
    case TCG_COND_GT:
        tcg_out_brcond32(s, TCG_COND_GT, args[1], args[3], const_args[3],
                         label_this, small);
        tcg_out_jxx(s, JCC_JNE, label_next, 1);
        tcg_out_brcond32(s, TCG_COND_GTU, args[0], args[2], const_args[2],
                         label_this, small);
        break;
    case TCG_COND_GE:
        tcg_out_brcond32(s, TCG_COND_GT, args[1], args[3], const_args[3],
                         label_this, small);
        tcg_out_jxx(s, JCC_JNE, label_next, 1);
        tcg_out_brcond32(s, TCG_COND_GEU, args[0], args[2], const_args[2],
                         label_this, small);
        break;
    case TCG_COND_LTU:
        tcg_out_brcond32(s, TCG_COND_LTU, args[1], args[3], const_args[3],
                         label_this, small);
        tcg_out_jxx(s, JCC_JNE, label_next, 1);
        tcg_out_brcond32(s, TCG_COND_LTU, args[0], args[2], const_args[2],
                         label_this, small);
        break;
    case TCG_COND_LEU:
        tcg_out_brcond32(s, TCG_COND_LTU, args[1], args[3], const_args[3],
                         label_this, small);
        tcg_out_jxx(s, JCC_JNE, label_next, 1);
        tcg_out_brcond32(s, TCG_COND_LEU, args[0], args[2], const_args[2],
                         label_this, small);
        break;
    case TCG_COND_GTU:
        tcg_out_brcond32(s, TCG_COND_GTU, args[1], args[3], const_args[3],
                         label_this, small);
        tcg_out_jxx(s, JCC_JNE, label_next, 1);
        tcg_out_brcond32(s, TCG_COND_GTU, args[0], args[2], const_args[2],
                         label_this, small);
        break;
    case TCG_COND_GEU:
        tcg_out_brcond32(s, TCG_COND_GTU, args[1], args[3], const_args[3],
                         label_this, small);
        tcg_out_jxx(s, JCC_JNE, label_next, 1);
        tcg_out_brcond32(s, TCG_COND_GEU, args[0], args[2], const_args[2],
                         label_this, small);
        break;
    default:
        tcg_abort();
    }
    tcg_out_label(s, label_next);
}
1428#endif
1429
/* Set DEST to the 0/1 result of the 32-bit comparison "ARG1 cond ARG2".  */
static void tcg_out_setcond32(TCGContext *s, TCGCond cond, TCGArg dest,
                              TCGArg arg1, TCGArg arg2, int const_arg2)
{
    tcg_out_cmp(s, arg1, arg2, const_arg2, 0);
    tcg_out_modrm(s, OPC_SETCC | tcg_cond_to_jcc[cond], 0, dest);
    /* setcc only writes the low byte; zero-extend to the full word.  */
    tcg_out_ext8u(s, dest, dest);
}
1437
1438#if TCG_TARGET_REG_BITS == 64
/* Set DEST to the 0/1 result of the 64-bit comparison "ARG1 cond ARG2".  */
static void tcg_out_setcond64(TCGContext *s, TCGCond cond, TCGArg dest,
                              TCGArg arg1, TCGArg arg2, int const_arg2)
{
    tcg_out_cmp(s, arg1, arg2, const_arg2, P_REXW);
    tcg_out_modrm(s, OPC_SETCC | tcg_cond_to_jcc[cond], 0, dest);
    /* setcc only writes the low byte; zero-extend to the full word.  */
    tcg_out_ext8u(s, dest, dest);
}
1446#else
/* Compute a 0/1 setcond result for a 64-bit comparison on a 32-bit host.
   args = { dest, al, ah, bl, bh, cond }; const_args parallels args.
   Implemented on top of tcg_out_brcond2.  */
static void tcg_out_setcond2(TCGContext *s, const TCGArg *args,
                             const int *const_args)
{
    TCGArg new_args[6];
    TCGLabel *label_true, *label_over;

    /* Copy al..cond so a branch target can be appended as element 5.  */
    memcpy(new_args, args+1, 5*sizeof(TCGArg));

    if (args[0] == args[1] || args[0] == args[2]
        || (!const_args[3] && args[0] == args[3])
        || (!const_args[4] && args[0] == args[4])) {
        /* When the destination overlaps with one of the argument
           registers, don't do anything tricky.  */
        label_true = gen_new_label();
        label_over = gen_new_label();

        new_args[5] = label_arg(label_true);
        tcg_out_brcond2(s, new_args, const_args+1, 1);

        tcg_out_movi(s, TCG_TYPE_I32, args[0], 0);
        tcg_out_jxx(s, JCC_JMP, label_over, 1);
        tcg_out_label(s, label_true);

        tcg_out_movi(s, TCG_TYPE_I32, args[0], 1);
        tcg_out_label(s, label_over);
    } else {
        /* When the destination does not overlap one of the arguments,
           clear the destination first, jump if cond false, and emit an
           increment in the true case.  This results in smaller code.  */

        tcg_out_movi(s, TCG_TYPE_I32, args[0], 0);

        label_over = gen_new_label();
        new_args[4] = tcg_invert_cond(new_args[4]);
        new_args[5] = label_arg(label_over);
        tcg_out_brcond2(s, new_args, const_args+1, 1);

        tgen_arithi(s, ARITH_ADD, args[0], 1, 0);
        tcg_out_label(s, label_over);
    }
}
1488#endif
1489
1490static void tcg_out_cmov(TCGContext *s, TCGCond cond, int rexw,
1491                         TCGReg dest, TCGReg v1)
1492{
1493    if (have_cmov) {
1494        tcg_out_modrm(s, OPC_CMOVCC | tcg_cond_to_jcc[cond] | rexw, dest, v1);
1495    } else {
1496        TCGLabel *over = gen_new_label();
1497        tcg_out_jxx(s, tcg_cond_to_jcc[tcg_invert_cond(cond)], over, 1);
1498        tcg_out_mov(s, TCG_TYPE_I32, dest, v1);
1499        tcg_out_label(s, over);
1500    }
1501}
1502
/* Emit dest = (c1 cond c2) ? v1 : dest, for 32-bit operands.  */
static void tcg_out_movcond32(TCGContext *s, TCGCond cond, TCGReg dest,
                              TCGReg c1, TCGArg c2, int const_c2,
                              TCGReg v1)
{
    tcg_out_cmp(s, c1, c2, const_c2, 0);
    tcg_out_cmov(s, cond, 0, dest, v1);
}
1510
1511#if TCG_TARGET_REG_BITS == 64
/* Emit dest = (c1 cond c2) ? v1 : dest, for 64-bit operands.  */
static void tcg_out_movcond64(TCGContext *s, TCGCond cond, TCGReg dest,
                              TCGReg c1, TCGArg c2, int const_c2,
                              TCGReg v1)
{
    tcg_out_cmp(s, c1, c2, const_c2, P_REXW);
    tcg_out_cmov(s, cond, P_REXW, dest, v1);
}
1519#endif
1520
/* Emit dest = ctz(arg1), with ARG2 supplying the result for zero input.  */
static void tcg_out_ctz(TCGContext *s, int rexw, TCGReg dest, TCGReg arg1,
                        TCGArg arg2, bool const_a2)
{
    if (have_bmi1) {
        tcg_out_modrm(s, OPC_TZCNT + rexw, dest, arg1);
        if (const_a2) {
            /* TZCNT itself yields the operand width for a zero input,
               the only constant fallback we accept.  */
            tcg_debug_assert(arg2 == (rexw ? 64 : 32));
        } else {
            tcg_debug_assert(dest != arg2);
            /* TZCNT sets CF on zero input; LTU selects the fallback.  */
            tcg_out_cmov(s, TCG_COND_LTU, rexw, dest, arg2);
        }
    } else {
        tcg_debug_assert(dest != arg2);
        tcg_out_modrm(s, OPC_BSF + rexw, dest, arg1);
        /* BSF sets ZF (and leaves dest undefined) for a zero input.  */
        tcg_out_cmov(s, TCG_COND_EQ, rexw, dest, arg2);
    }
}
1538
/* Emit dest = clz(arg1), with ARG2 supplying the result for zero input.  */
static void tcg_out_clz(TCGContext *s, int rexw, TCGReg dest, TCGReg arg1,
                        TCGArg arg2, bool const_a2)
{
    if (have_lzcnt) {
        tcg_out_modrm(s, OPC_LZCNT + rexw, dest, arg1);
        if (const_a2) {
            /* LZCNT itself yields the operand width for a zero input,
               the only constant fallback we accept.  */
            tcg_debug_assert(arg2 == (rexw ? 64 : 32));
        } else {
            tcg_debug_assert(dest != arg2);
            /* LZCNT sets CF on zero input; LTU selects the fallback.  */
            tcg_out_cmov(s, TCG_COND_LTU, rexw, dest, arg2);
        }
    } else {
        tcg_debug_assert(!const_a2);
        tcg_debug_assert(dest != arg1);
        tcg_debug_assert(dest != arg2);

        /* Recall that the output of BSR is the index not the count.  */
        tcg_out_modrm(s, OPC_BSR + rexw, dest, arg1);
        /* Convert bit index to leading-zero count: index XOR (width-1).  */
        tgen_arithi(s, ARITH_XOR + rexw, dest, rexw ? 63 : 31, 0);

        /* Since we have destroyed the flags from BSR, we have to re-test.  */
        tcg_out_cmp(s, arg1, 0, 1, rexw);
        tcg_out_cmov(s, TCG_COND_EQ, rexw, dest, arg2);
    }
}
1564
/* Emit a direct call (CALL != 0) or jump to the absolute address DEST.  */
static void tcg_out_branch(TCGContext *s, int call, const tcg_insn_unit *dest)
{
    intptr_t disp = tcg_pcrel_diff(s, dest) - 5;

    if (disp == (int32_t)disp) {
        /* 5-byte direct call/jmp with rel32 displacement.  */
        tcg_out_opc(s, call ? OPC_CALL_Jz : OPC_JMP_long, 0, 0, 0);
        tcg_out32(s, disp);
    } else {
        /* rip-relative addressing into the constant pool.
           This is 6 + 8 = 14 bytes, as compared to using
           an immediate load 10 + 6 = 16 bytes, plus we may
           be able to re-use the pool constant for more calls.  */
        tcg_out_opc(s, OPC_GRP5, 0, 0, 0);
        tcg_out8(s, (call ? EXT5_CALLN_Ev : EXT5_JMPN_Ev) << 3 | 5);
        new_pool_label(s, (uintptr_t)dest, R_386_PC32, s->code_ptr, -4);
        tcg_out32(s, 0);
    }
}
1583
/* Emit a call to the absolute address DEST.  */
static inline void tcg_out_call(TCGContext *s, const tcg_insn_unit *dest)
{
    tcg_out_branch(s, 1, dest);
}
1588
/* Emit a jump to the absolute address DEST.  */
static void tcg_out_jmp(TCGContext *s, const tcg_insn_unit *dest)
{
    tcg_out_branch(s, 0, dest);
}
1593
1594static void tcg_out_nopn(TCGContext *s, int n)
1595{
1596    int i;
1597    /* Emit 1 or 2 operand size prefixes for the standard one byte nop,
1598     * "xchg %eax,%eax", forming "xchg %ax,%ax". All cores accept the
1599     * duplicate prefix, and all of the interesting recent cores can
1600     * decode and discard the duplicates in a single cycle.
1601     */
1602    tcg_debug_assert(n >= 1);
1603    for (i = 1; i < n; ++i) {
1604        tcg_out8(s, 0x66);
1605    }
1606    tcg_out8(s, 0x90);
1607}
1608
1609#if defined(CONFIG_SOFTMMU)
1610#include "../tcg-ldst.c.inc"
1611
1612/* helper signature: helper_ret_ld_mmu(CPUState *env, target_ulong addr,
1613 *                                     int mmu_idx, uintptr_t ra)
1614 */
/* Indexed by MemOp (size | endianness).  Only unsigned entries are
   populated: the slow path calls the unsigned helper and sign-extends
   the result itself.  Missing slots remain NULL.  */
static void * const qemu_ld_helpers[16] = {
    [MO_UB]   = helper_ret_ldub_mmu,
    [MO_LEUW] = helper_le_lduw_mmu,
    [MO_LEUL] = helper_le_ldul_mmu,
    [MO_LEQ]  = helper_le_ldq_mmu,
    [MO_BEUW] = helper_be_lduw_mmu,
    [MO_BEUL] = helper_be_ldul_mmu,
    [MO_BEQ]  = helper_be_ldq_mmu,
};
1624
1625/* helper signature: helper_ret_st_mmu(CPUState *env, target_ulong addr,
1626 *                                     uintxx_t val, int mmu_idx, uintptr_t ra)
1627 */
/* Indexed by MemOp (size | endianness); stores have no signedness,
   so this covers all cases.  Missing slots remain NULL.  */
static void * const qemu_st_helpers[16] = {
    [MO_UB]   = helper_ret_stb_mmu,
    [MO_LEUW] = helper_le_stw_mmu,
    [MO_LEUL] = helper_le_stl_mmu,
    [MO_LEQ]  = helper_le_stq_mmu,
    [MO_BEUW] = helper_be_stw_mmu,
    [MO_BEUL] = helper_be_stl_mmu,
    [MO_BEQ]  = helper_be_stq_mmu,
};
1637
1638/* Perform the TLB load and compare.
1639
1640   Inputs:
1641   ADDRLO and ADDRHI contain the low and high part of the address.
1642
1643   MEM_INDEX and S_BITS are the memory context and log2 size of the load.
1644
1645   WHICH is the offset into the CPUTLBEntry structure of the slot to read.
1646   This should be offsetof addr_read or addr_write.
1647
1648   Outputs:
1649   LABEL_PTRS is filled with 1 (32-bit addresses) or 2 (64-bit addresses)
1650   positions of the displacements of forward jumps to the TLB miss case.
1651
1652   Second argument register is loaded with the low part of the address.
1653   In the TLB hit case, it has been adjusted as indicated by the TLB
1654   and so is a host address.  In the TLB miss case, it continues to
1655   hold a guest address.
1656
1657   First argument register is clobbered.  */
1658
static inline void tcg_out_tlb_load(TCGContext *s, TCGReg addrlo, TCGReg addrhi,
                                    int mem_index, MemOp opc,
                                    tcg_insn_unit **label_ptr, int which)
{
    const TCGReg r0 = TCG_REG_L0;
    const TCGReg r1 = TCG_REG_L1;
    TCGType ttype = TCG_TYPE_I32;
    TCGType tlbtype = TCG_TYPE_I32;
    int trexw = 0, hrexw = 0, tlbrexw = 0;
    unsigned a_bits = get_alignment_bits(opc);
    unsigned s_bits = opc & MO_SIZE;
    unsigned a_mask = (1 << a_bits) - 1;
    unsigned s_mask = (1 << s_bits) - 1;
    target_ulong tlb_mask;

    /* Select 32- or 64-bit operations for the address and host-pointer
       arithmetic, depending on guest and host word sizes.  */
    if (TCG_TARGET_REG_BITS == 64) {
        if (TARGET_LONG_BITS == 64) {
            ttype = TCG_TYPE_I64;
            trexw = P_REXW;
        }
        if (TCG_TYPE_PTR == TCG_TYPE_I64) {
            hrexw = P_REXW;
            if (TARGET_PAGE_BITS + CPU_TLB_DYN_MAX_BITS > 32) {
                tlbtype = TCG_TYPE_I64;
                tlbrexw = P_REXW;
            }
        }
    }

    /* Extract the page index into r0, pre-scaled by the size of a
       CPUTLBEntry (CPU_TLB_ENTRY_BITS).  */
    tcg_out_mov(s, tlbtype, r0, addrlo);
    tcg_out_shifti(s, SHIFT_SHR + tlbrexw, r0,
                   TARGET_PAGE_BITS - CPU_TLB_ENTRY_BITS);

    /* Mask with CPUTLBDescFast.mask, giving the entry's byte offset.  */
    tcg_out_modrm_offset(s, OPC_AND_GvEv + trexw, r0, TCG_AREG0,
                         TLB_MASK_TABLE_OFS(mem_index) +
                         offsetof(CPUTLBDescFast, mask));

    /* Add CPUTLBDescFast.table; r0 now points at the CPUTLBEntry.  */
    tcg_out_modrm_offset(s, OPC_ADD_GvEv + hrexw, r0, TCG_AREG0,
                         TLB_MASK_TABLE_OFS(mem_index) +
                         offsetof(CPUTLBDescFast, table));

    /* If the required alignment is at least as large as the access, simply
       copy the address and mask.  For lesser alignments, check that we don't
       cross pages for the complete access.  */
    if (a_bits >= s_bits) {
        tcg_out_mov(s, ttype, r1, addrlo);
    } else {
        tcg_out_modrm_offset(s, OPC_LEA + trexw, r1, addrlo, s_mask - a_mask);
    }
    tlb_mask = (target_ulong)TARGET_PAGE_MASK | a_mask;
    tgen_arithi(s, ARITH_AND + trexw, r1, tlb_mask, 0);

    /* cmp 0(r0), r1 */
    tcg_out_modrm_offset(s, OPC_CMP_GvEv + trexw, r1, r0, which);

    /* Prepare for both the fast path add of the tlb addend, and the slow
       path function argument setup.  */
    tcg_out_mov(s, ttype, r1, addrlo);

    /* jne slow_path */
    tcg_out_opc(s, OPC_JCC_long + JCC_JNE, 0, 0, 0);
    label_ptr[0] = s->code_ptr;
    s->code_ptr += 4;

    if (TARGET_LONG_BITS > TCG_TARGET_REG_BITS) {
        /* The high half of the guest address is compared separately.  */
        /* cmp 4(r0), addrhi */
        tcg_out_modrm_offset(s, OPC_CMP_GvEv, addrhi, r0, which + 4);

        /* jne slow_path */
        tcg_out_opc(s, OPC_JCC_long + JCC_JNE, 0, 0, 0);
        label_ptr[1] = s->code_ptr;
        s->code_ptr += 4;
    }

    /* TLB Hit.  */

    /* add addend(r0), r1 */
    tcg_out_modrm_offset(s, OPC_ADD_GvEv + hrexw, r1, r0,
                         offsetof(CPUTLBEntry, addend));
}
1739
1740/*
1741 * Record the context of a call to the out of line helper code for the slow path
1742 * for a load or store, so that we can later generate the correct helper code
1743 */
static void add_qemu_ldst_label(TCGContext *s, bool is_ld, bool is_64,
                                TCGMemOpIdx oi,
                                TCGReg datalo, TCGReg datahi,
                                TCGReg addrlo, TCGReg addrhi,
                                tcg_insn_unit *raddr,
                                tcg_insn_unit **label_ptr)
{
    TCGLabelQemuLdst *label = new_ldst_label(s);

    label->is_ld = is_ld;
    label->oi = oi;
    label->type = is_64 ? TCG_TYPE_I64 : TCG_TYPE_I32;
    label->datalo_reg = datalo;
    label->datahi_reg = datahi;
    label->addrlo_reg = addrlo;
    label->addrhi_reg = addrhi;
    /* Return address, translated to the executable (rx) mapping.  */
    label->raddr = tcg_splitwx_to_rx(raddr);
    /* Patch site(s) of the jne emitted by tcg_out_tlb_load.  */
    label->label_ptr[0] = label_ptr[0];
    if (TARGET_LONG_BITS > TCG_TARGET_REG_BITS) {
        label->label_ptr[1] = label_ptr[1];
    }
}
1766
1767/*
1768 * Generate code for the slow path for a load at the end of block
1769 */
1770static bool tcg_out_qemu_ld_slow_path(TCGContext *s, TCGLabelQemuLdst *l)
1771{
1772    TCGMemOpIdx oi = l->oi;
1773    MemOp opc = get_memop(oi);
1774    TCGReg data_reg;
1775    tcg_insn_unit **label_ptr = &l->label_ptr[0];
1776    int rexw = (l->type == TCG_TYPE_I64 ? P_REXW : 0);
1777
1778    /* resolve label address */
1779    tcg_patch32(label_ptr[0], s->code_ptr - label_ptr[0] - 4);
1780    if (TARGET_LONG_BITS > TCG_TARGET_REG_BITS) {
1781        tcg_patch32(label_ptr[1], s->code_ptr - label_ptr[1] - 4);
1782    }
1783
1784    if (TCG_TARGET_REG_BITS == 32) {
1785        int ofs = 0;
1786
1787        tcg_out_st(s, TCG_TYPE_PTR, TCG_AREG0, TCG_REG_ESP, ofs);
1788        ofs += 4;
1789
1790        tcg_out_st(s, TCG_TYPE_I32, l->addrlo_reg, TCG_REG_ESP, ofs);
1791        ofs += 4;
1792
1793        if (TARGET_LONG_BITS == 64) {
1794            tcg_out_st(s, TCG_TYPE_I32, l->addrhi_reg, TCG_REG_ESP, ofs);
1795            ofs += 4;
1796        }
1797
1798        tcg_out_sti(s, TCG_TYPE_I32, oi, TCG_REG_ESP, ofs);
1799        ofs += 4;
1800
1801        tcg_out_sti(s, TCG_TYPE_PTR, (uintptr_t)l->raddr, TCG_REG_ESP, ofs);
1802    } else {
1803        tcg_out_mov(s, TCG_TYPE_PTR, tcg_target_call_iarg_regs[0], TCG_AREG0);
1804        /* The second argument is already loaded with addrlo.  */
1805        tcg_out_movi(s, TCG_TYPE_I32, tcg_target_call_iarg_regs[2], oi);
1806        tcg_out_movi(s, TCG_TYPE_PTR, tcg_target_call_iarg_regs[3],
1807                     (uintptr_t)l->raddr);
1808    }
1809
1810    tcg_out_call(s, qemu_ld_helpers[opc & (MO_BSWAP | MO_SIZE)]);
1811
1812    data_reg = l->datalo_reg;
1813    switch (opc & MO_SSIZE) {
1814    case MO_SB:
1815        tcg_out_ext8s(s, data_reg, TCG_REG_EAX, rexw);
1816        break;
1817    case MO_SW:
1818        tcg_out_ext16s(s, data_reg, TCG_REG_EAX, rexw);
1819        break;
1820#if TCG_TARGET_REG_BITS == 64
1821    case MO_SL:
1822        tcg_out_ext32s(s, data_reg, TCG_REG_EAX);
1823        break;
1824#endif
1825    case MO_UB:
1826    case MO_UW:
1827        /* Note that the helpers have zero-extended to tcg_target_long.  */
1828    case MO_UL:
1829        tcg_out_mov(s, TCG_TYPE_I32, data_reg, TCG_REG_EAX);
1830        break;
1831    case MO_Q:
1832        if (TCG_TARGET_REG_BITS == 64) {
1833            tcg_out_mov(s, TCG_TYPE_I64, data_reg, TCG_REG_RAX);
1834        } else if (data_reg == TCG_REG_EDX) {
1835            /* xchg %edx, %eax */
1836            tcg_out_opc(s, OPC_XCHG_ax_r32 + TCG_REG_EDX, 0, 0, 0);
1837            tcg_out_mov(s, TCG_TYPE_I32, l->datahi_reg, TCG_REG_EAX);
1838        } else {
1839            tcg_out_mov(s, TCG_TYPE_I32, data_reg, TCG_REG_EAX);
1840            tcg_out_mov(s, TCG_TYPE_I32, l->datahi_reg, TCG_REG_EDX);
1841        }
1842        break;
1843    default:
1844        tcg_abort();
1845    }
1846
1847    /* Jump to the code corresponding to next IR of qemu_st */
1848    tcg_out_jmp(s, l->raddr);
1849    return true;
1850}
1851
1852/*
1853 * Generate code for the slow path for a store at the end of block
1854 */
1855static bool tcg_out_qemu_st_slow_path(TCGContext *s, TCGLabelQemuLdst *l)
1856{
1857    TCGMemOpIdx oi = l->oi;
1858    MemOp opc = get_memop(oi);
1859    MemOp s_bits = opc & MO_SIZE;
1860    tcg_insn_unit **label_ptr = &l->label_ptr[0];
1861    TCGReg retaddr;
1862
1863    /* resolve label address */
1864    tcg_patch32(label_ptr[0], s->code_ptr - label_ptr[0] - 4);
1865    if (TARGET_LONG_BITS > TCG_TARGET_REG_BITS) {
1866        tcg_patch32(label_ptr[1], s->code_ptr - label_ptr[1] - 4);
1867    }
1868
1869    if (TCG_TARGET_REG_BITS == 32) {
1870        int ofs = 0;
1871
1872        tcg_out_st(s, TCG_TYPE_PTR, TCG_AREG0, TCG_REG_ESP, ofs);
1873        ofs += 4;
1874
1875        tcg_out_st(s, TCG_TYPE_I32, l->addrlo_reg, TCG_REG_ESP, ofs);
1876        ofs += 4;
1877
1878        if (TARGET_LONG_BITS == 64) {
1879            tcg_out_st(s, TCG_TYPE_I32, l->addrhi_reg, TCG_REG_ESP, ofs);
1880            ofs += 4;
1881        }
1882
1883        tcg_out_st(s, TCG_TYPE_I32, l->datalo_reg, TCG_REG_ESP, ofs);
1884        ofs += 4;
1885
1886        if (s_bits == MO_64) {
1887            tcg_out_st(s, TCG_TYPE_I32, l->datahi_reg, TCG_REG_ESP, ofs);
1888            ofs += 4;
1889        }
1890
1891        tcg_out_sti(s, TCG_TYPE_I32, oi, TCG_REG_ESP, ofs);
1892        ofs += 4;
1893
1894        retaddr = TCG_REG_EAX;
1895        tcg_out_movi(s, TCG_TYPE_PTR, retaddr, (uintptr_t)l->raddr);
1896        tcg_out_st(s, TCG_TYPE_PTR, retaddr, TCG_REG_ESP, ofs);
1897    } else {
1898        tcg_out_mov(s, TCG_TYPE_PTR, tcg_target_call_iarg_regs[0], TCG_AREG0);
1899        /* The second argument is already loaded with addrlo.  */
1900        tcg_out_mov(s, (s_bits == MO_64 ? TCG_TYPE_I64 : TCG_TYPE_I32),
1901                    tcg_target_call_iarg_regs[2], l->datalo_reg);
1902        tcg_out_movi(s, TCG_TYPE_I32, tcg_target_call_iarg_regs[3], oi);
1903
1904        if (ARRAY_SIZE(tcg_target_call_iarg_regs) > 4) {
1905            retaddr = tcg_target_call_iarg_regs[4];
1906            tcg_out_movi(s, TCG_TYPE_PTR, retaddr, (uintptr_t)l->raddr);
1907        } else {
1908            retaddr = TCG_REG_RAX;
1909            tcg_out_movi(s, TCG_TYPE_PTR, retaddr, (uintptr_t)l->raddr);
1910            tcg_out_st(s, TCG_TYPE_PTR, retaddr, TCG_REG_ESP,
1911                       TCG_TARGET_CALL_STACK_OFFSET);
1912        }
1913    }
1914
1915    /* "Tail call" to the helper, with the return address back inline.  */
1916    tcg_out_push(s, retaddr);
1917    tcg_out_jmp(s, qemu_st_helpers[opc & (MO_BSWAP | MO_SIZE)]);
1918    return true;
1919}
#elif TCG_TARGET_REG_BITS == 32
/* 32-bit user-only: the guest base is a plain constant displacement.  */
# define x86_guest_base_seg     0
# define x86_guest_base_index   -1
# define x86_guest_base_offset  guest_base
#else
/*
 * 64-bit user-only: the guest base may end up as a segment override,
 * an index register, or a 32-bit displacement; these are chosen at
 * startup (values written elsewhere in this file).
 */
static int x86_guest_base_seg;
static int x86_guest_base_index = -1;
static int32_t x86_guest_base_offset;
# if defined(__x86_64__) && defined(__linux__)
#  include <asm/prctl.h>
#  include <sys/prctl.h>
int arch_prctl(int code, unsigned long addr);
/* Try to place guest_base in %gs; return the P_GS prefix flag on
   success, 0 on failure.  */
static inline int setup_guest_base_seg(void)
{
    if (arch_prctl(ARCH_SET_GS, guest_base) == 0) {
        return P_GS;
    }
    return 0;
}
# elif defined (__FreeBSD__) || defined (__FreeBSD_kernel__)
#  include <machine/sysarch.h>
/* FreeBSD variant of the above, via sysarch(2).  */
static inline int setup_guest_base_seg(void)
{
    if (sysarch(AMD64_SET_GSBASE, &guest_base) == 0) {
        return P_GS;
    }
    return 0;
}
# else
/* No supported way to set a segment base on this host.  */
static inline int setup_guest_base_seg(void)
{
    return 0;
}
# endif
#endif /* SOFTMMU */
1955
/*
 * Emit the actual guest load from host address base + index + ofs into
 * datalo (and datahi for a 64-bit load on a 32-bit host).  'seg' is a
 * segment-override prefix flag, 'memop' supplies size/sign/byteswap.
 */
static void tcg_out_qemu_ld_direct(TCGContext *s, TCGReg datalo, TCGReg datahi,
                                   TCGReg base, int index, intptr_t ofs,
                                   int seg, bool is64, MemOp memop)
{
    bool use_movbe = false;
    int rexw = is64 * P_REXW;
    int movop = OPC_MOVL_GvEv;

    /* Do big-endian loads with movbe.  */
    if (memop & MO_BSWAP) {
        tcg_debug_assert(have_movbe);
        use_movbe = true;
        movop = OPC_MOVBE_GyMy;
    }

    switch (memop & MO_SSIZE) {
    case MO_UB:
        tcg_out_modrm_sib_offset(s, OPC_MOVZBL + seg, datalo,
                                 base, index, 0, ofs);
        break;
    case MO_SB:
        tcg_out_modrm_sib_offset(s, OPC_MOVSBL + rexw + seg, datalo,
                                 base, index, 0, ofs);
        break;
    case MO_UW:
        if (use_movbe) {
            /* There is no extending movbe; only low 16-bits are modified.  */
            if (datalo != base && datalo != index) {
                /* XOR breaks dependency chains.  */
                tgen_arithr(s, ARITH_XOR, datalo, datalo);
                tcg_out_modrm_sib_offset(s, OPC_MOVBE_GyMy + P_DATA16 + seg,
                                         datalo, base, index, 0, ofs);
            } else {
                /* datalo overlaps the address: load first, then
                   zero-extend in place.  */
                tcg_out_modrm_sib_offset(s, OPC_MOVBE_GyMy + P_DATA16 + seg,
                                         datalo, base, index, 0, ofs);
                tcg_out_ext16u(s, datalo, datalo);
            }
        } else {
            tcg_out_modrm_sib_offset(s, OPC_MOVZWL + seg, datalo,
                                     base, index, 0, ofs);
        }
        break;
    case MO_SW:
        if (use_movbe) {
            /* movbe then sign-extend the low 16 bits.  */
            tcg_out_modrm_sib_offset(s, OPC_MOVBE_GyMy + P_DATA16 + seg,
                                     datalo, base, index, 0, ofs);
            tcg_out_ext16s(s, datalo, datalo, rexw);
        } else {
            tcg_out_modrm_sib_offset(s, OPC_MOVSWL + rexw + seg,
                                     datalo, base, index, 0, ofs);
        }
        break;
    case MO_UL:
        tcg_out_modrm_sib_offset(s, movop + seg, datalo, base, index, 0, ofs);
        break;
#if TCG_TARGET_REG_BITS == 64
    case MO_SL:
        if (use_movbe) {
            /* movbe (32-bit) then sign-extend to 64 bits.  */
            tcg_out_modrm_sib_offset(s, OPC_MOVBE_GyMy + seg, datalo,
                                     base, index, 0, ofs);
            tcg_out_ext32s(s, datalo, datalo);
        } else {
            tcg_out_modrm_sib_offset(s, OPC_MOVSLQ + seg, datalo,
                                     base, index, 0, ofs);
        }
        break;
#endif
    case MO_Q:
        if (TCG_TARGET_REG_BITS == 64) {
            tcg_out_modrm_sib_offset(s, movop + P_REXW + seg, datalo,
                                     base, index, 0, ofs);
        } else {
            /* 32-bit host: two 32-bit loads.  With movbe the byteswap
               also swaps which half lands where.  */
            if (use_movbe) {
                TCGReg t = datalo;
                datalo = datahi;
                datahi = t;
            }
            /* Order the loads so that the base register is not
               overwritten before it is used for the second load.  */
            if (base != datalo) {
                tcg_out_modrm_sib_offset(s, movop + seg, datalo,
                                         base, index, 0, ofs);
                tcg_out_modrm_sib_offset(s, movop + seg, datahi,
                                         base, index, 0, ofs + 4);
            } else {
                tcg_out_modrm_sib_offset(s, movop + seg, datahi,
                                         base, index, 0, ofs + 4);
                tcg_out_modrm_sib_offset(s, movop + seg, datalo,
                                         base, index, 0, ofs);
            }
        }
        break;
    default:
        g_assert_not_reached();
    }
}
2050
2051/* XXX: qemu_ld and qemu_st could be modified to clobber only EDX and
2052   EAX. It will be useful once fixed registers globals are less
2053   common. */
2054static void tcg_out_qemu_ld(TCGContext *s, const TCGArg *args, bool is64)
2055{
2056    TCGReg datalo, datahi, addrlo;
2057    TCGReg addrhi __attribute__((unused));
2058    TCGMemOpIdx oi;
2059    MemOp opc;
2060#if defined(CONFIG_SOFTMMU)
2061    int mem_index;
2062    tcg_insn_unit *label_ptr[2];
2063#endif
2064
2065    datalo = *args++;
2066    datahi = (TCG_TARGET_REG_BITS == 32 && is64 ? *args++ : 0);
2067    addrlo = *args++;
2068    addrhi = (TARGET_LONG_BITS > TCG_TARGET_REG_BITS ? *args++ : 0);
2069    oi = *args++;
2070    opc = get_memop(oi);
2071
2072#if defined(CONFIG_SOFTMMU)
2073    mem_index = get_mmuidx(oi);
2074
2075    tcg_out_tlb_load(s, addrlo, addrhi, mem_index, opc,
2076                     label_ptr, offsetof(CPUTLBEntry, addr_read));
2077
2078    /* TLB Hit.  */
2079    tcg_out_qemu_ld_direct(s, datalo, datahi, TCG_REG_L1, -1, 0, 0, is64, opc);
2080
2081    /* Record the current context of a load into ldst label */
2082    add_qemu_ldst_label(s, true, is64, oi, datalo, datahi, addrlo, addrhi,
2083                        s->code_ptr, label_ptr);
2084#else
2085    tcg_out_qemu_ld_direct(s, datalo, datahi, addrlo, x86_guest_base_index,
2086                           x86_guest_base_offset, x86_guest_base_seg,
2087                           is64, opc);
2088#endif
2089}
2090
/*
 * Emit the actual guest store of datalo (and datahi for a 64-bit store
 * on a 32-bit host) to host address base + index + ofs.  'seg' is a
 * segment-override prefix flag, 'memop' supplies size/byteswap.
 */
static void tcg_out_qemu_st_direct(TCGContext *s, TCGReg datalo, TCGReg datahi,
                                   TCGReg base, int index, intptr_t ofs,
                                   int seg, MemOp memop)
{
    bool use_movbe = false;
    int movop = OPC_MOVL_EvGv;

    /*
     * Do big-endian stores with movbe or softmmu.
     * User-only without movbe will have its swapping done generically.
     */
    if (memop & MO_BSWAP) {
        tcg_debug_assert(have_movbe);
        use_movbe = true;
        movop = OPC_MOVBE_MyGy;
    }

    switch (memop & MO_SIZE) {
    case MO_8:
        /* This is handled with constraints on INDEX_op_qemu_st8_i32. */
        tcg_debug_assert(TCG_TARGET_REG_BITS == 64 || datalo < 4);
        tcg_out_modrm_sib_offset(s, OPC_MOVB_EvGv + P_REXB_R + seg,
                                 datalo, base, index, 0, ofs);
        break;
    case MO_16:
        tcg_out_modrm_sib_offset(s, movop + P_DATA16 + seg, datalo,
                                 base, index, 0, ofs);
        break;
    case MO_32:
        tcg_out_modrm_sib_offset(s, movop + seg, datalo, base, index, 0, ofs);
        break;
    case MO_64:
        if (TCG_TARGET_REG_BITS == 64) {
            tcg_out_modrm_sib_offset(s, movop + P_REXW + seg, datalo,
                                     base, index, 0, ofs);
        } else {
            /* 32-bit host: two 32-bit stores.  With movbe the byteswap
               also swaps which half is stored where.  */
            if (use_movbe) {
                TCGReg t = datalo;
                datalo = datahi;
                datahi = t;
            }
            tcg_out_modrm_sib_offset(s, movop + seg, datalo,
                                     base, index, 0, ofs);
            tcg_out_modrm_sib_offset(s, movop + seg, datahi,
                                     base, index, 0, ofs + 4);
        }
        break;
    default:
        g_assert_not_reached();
    }
}
2142
2143static void tcg_out_qemu_st(TCGContext *s, const TCGArg *args, bool is64)
2144{
2145    TCGReg datalo, datahi, addrlo;
2146    TCGReg addrhi __attribute__((unused));
2147    TCGMemOpIdx oi;
2148    MemOp opc;
2149#if defined(CONFIG_SOFTMMU)
2150    int mem_index;
2151    tcg_insn_unit *label_ptr[2];
2152#endif
2153
2154    datalo = *args++;
2155    datahi = (TCG_TARGET_REG_BITS == 32 && is64 ? *args++ : 0);
2156    addrlo = *args++;
2157    addrhi = (TARGET_LONG_BITS > TCG_TARGET_REG_BITS ? *args++ : 0);
2158    oi = *args++;
2159    opc = get_memop(oi);
2160
2161#if defined(CONFIG_SOFTMMU)
2162    mem_index = get_mmuidx(oi);
2163
2164    tcg_out_tlb_load(s, addrlo, addrhi, mem_index, opc,
2165                     label_ptr, offsetof(CPUTLBEntry, addr_write));
2166
2167    /* TLB Hit.  */
2168    tcg_out_qemu_st_direct(s, datalo, datahi, TCG_REG_L1, -1, 0, 0, opc);
2169
2170    /* Record the current context of a store into ldst label */
2171    add_qemu_ldst_label(s, false, is64, oi, datalo, datahi, addrlo, addrhi,
2172                        s->code_ptr, label_ptr);
2173#else
2174    tcg_out_qemu_st_direct(s, datalo, datahi, addrlo, x86_guest_base_index,
2175                           x86_guest_base_offset, x86_guest_base_seg, opc);
2176#endif
2177}
2178
/*
 * Emit host code for one integer TCG opcode.  args[]/const_args[] hold
 * the operands; const_args[i] nonzero means args[i] is a constant value
 * rather than a register number.  OP_32_64(x) matches both the _i32 and
 * (on 64-bit hosts) _i64 variants, setting rexw for the latter.
 */
static inline void tcg_out_op(TCGContext *s, TCGOpcode opc,
                              const TCGArg args[TCG_MAX_OP_ARGS],
                              const int const_args[TCG_MAX_OP_ARGS])
{
    TCGArg a0, a1, a2;
    int c, const_a2, vexop, rexw = 0;

#if TCG_TARGET_REG_BITS == 64
# define OP_32_64(x) \
        case glue(glue(INDEX_op_, x), _i64): \
            rexw = P_REXW; /* FALLTHRU */    \
        case glue(glue(INDEX_op_, x), _i32)
#else
# define OP_32_64(x) \
        case glue(glue(INDEX_op_, x), _i32)
#endif

    /* Hoist the loads of the most common arguments.  */
    a0 = args[0];
    a1 = args[1];
    a2 = args[2];
    const_a2 = const_args[2];

    switch (opc) {
    case INDEX_op_exit_tb:
        /* Reuse the zeroing that exists for goto_ptr.  */
        if (a0 == 0) {
            tcg_out_jmp(s, tcg_code_gen_epilogue);
        } else {
            tcg_out_movi(s, TCG_TYPE_PTR, TCG_REG_EAX, a0);
            tcg_out_jmp(s, tb_ret_addr);
        }
        break;
    case INDEX_op_goto_tb:
        if (s->tb_jmp_insn_offset) {
            /* direct jump method */
            int gap;
            /* jump displacement must be aligned for atomic patching;
             * see if we need to add extra nops before jump
             */
            gap = QEMU_ALIGN_PTR_UP(s->code_ptr + 1, 4) - s->code_ptr;
            if (gap != 1) {
                tcg_out_nopn(s, gap - 1);
            }
            tcg_out8(s, OPC_JMP_long); /* jmp im */
            s->tb_jmp_insn_offset[a0] = tcg_current_code_size(s);
            tcg_out32(s, 0);
        } else {
            /* indirect jump method */
            tcg_out_modrm_offset(s, OPC_GRP5, EXT5_JMPN_Ev, -1,
                                 (intptr_t)(s->tb_jmp_target_addr + a0));
        }
        set_jmp_reset_offset(s, a0);
        break;
    case INDEX_op_goto_ptr:
        /* jmp to the given host address (could be epilogue) */
        tcg_out_modrm(s, OPC_GRP5, EXT5_JMPN_Ev, a0);
        break;
    case INDEX_op_br:
        tcg_out_jxx(s, JCC_JMP, arg_label(a0), 0);
        break;
    OP_32_64(ld8u):
        /* Note that we can ignore REXW for the zero-extend to 64-bit.  */
        tcg_out_modrm_offset(s, OPC_MOVZBL, a0, a1, a2);
        break;
    OP_32_64(ld8s):
        tcg_out_modrm_offset(s, OPC_MOVSBL + rexw, a0, a1, a2);
        break;
    OP_32_64(ld16u):
        /* Note that we can ignore REXW for the zero-extend to 64-bit.  */
        tcg_out_modrm_offset(s, OPC_MOVZWL, a0, a1, a2);
        break;
    OP_32_64(ld16s):
        tcg_out_modrm_offset(s, OPC_MOVSWL + rexw, a0, a1, a2);
        break;
#if TCG_TARGET_REG_BITS == 64
    case INDEX_op_ld32u_i64:
#endif
    case INDEX_op_ld_i32:
        tcg_out_ld(s, TCG_TYPE_I32, a0, a1, a2);
        break;

    OP_32_64(st8):
        if (const_args[0]) {
            /* Store an immediate byte.  */
            tcg_out_modrm_offset(s, OPC_MOVB_EvIz, 0, a1, a2);
            tcg_out8(s, a0);
        } else {
            tcg_out_modrm_offset(s, OPC_MOVB_EvGv | P_REXB_R, a0, a1, a2);
        }
        break;
    OP_32_64(st16):
        if (const_args[0]) {
            tcg_out_modrm_offset(s, OPC_MOVL_EvIz | P_DATA16, 0, a1, a2);
            tcg_out16(s, a0);
        } else {
            tcg_out_modrm_offset(s, OPC_MOVL_EvGv | P_DATA16, a0, a1, a2);
        }
        break;
#if TCG_TARGET_REG_BITS == 64
    case INDEX_op_st32_i64:
#endif
    case INDEX_op_st_i32:
        if (const_args[0]) {
            tcg_out_modrm_offset(s, OPC_MOVL_EvIz, 0, a1, a2);
            tcg_out32(s, a0);
        } else {
            tcg_out_st(s, TCG_TYPE_I32, a0, a1, a2);
        }
        break;

    OP_32_64(add):
        /* For 3-operand addition, use LEA.  */
        if (a0 != a1) {
            TCGArg c3 = 0;
            if (const_a2) {
                c3 = a2, a2 = -1;
            } else if (a0 == a2) {
                /* Watch out for dest = src + dest, since we've removed
                   the matching constraint on the add.  */
                tgen_arithr(s, ARITH_ADD + rexw, a0, a1);
                break;
            }

            tcg_out_modrm_sib_offset(s, OPC_LEA + rexw, a0, a1, a2, 0, c3);
            break;
        }
        c = ARITH_ADD;
        goto gen_arith;
    OP_32_64(sub):
        c = ARITH_SUB;
        goto gen_arith;
    OP_32_64(and):
        c = ARITH_AND;
        goto gen_arith;
    OP_32_64(or):
        c = ARITH_OR;
        goto gen_arith;
    OP_32_64(xor):
        c = ARITH_XOR;
        goto gen_arith;
    gen_arith:
        /* Common two-operand arithmetic: immediate or register form.  */
        if (const_a2) {
            tgen_arithi(s, c + rexw, a0, a2, 0);
        } else {
            tgen_arithr(s, c + rexw, a0, a2);
        }
        break;

    OP_32_64(andc):
        if (const_a2) {
            /* a0 = a1 & ~const: fold the complement into an AND mask.  */
            tcg_out_mov(s, rexw ? TCG_TYPE_I64 : TCG_TYPE_I32, a0, a1);
            tgen_arithi(s, ARITH_AND + rexw, a0, ~a2, 0);
        } else {
            tcg_out_vex_modrm(s, OPC_ANDN + rexw, a0, a2, a1);
        }
        break;

    OP_32_64(mul):
        if (const_a2) {
            /* Prefer the short imm8 encoding when the value fits.  */
            int32_t val;
            val = a2;
            if (val == (int8_t)val) {
                tcg_out_modrm(s, OPC_IMUL_GvEvIb + rexw, a0, a0);
                tcg_out8(s, val);
            } else {
                tcg_out_modrm(s, OPC_IMUL_GvEvIz + rexw, a0, a0);
                tcg_out32(s, val);
            }
        } else {
            tcg_out_modrm(s, OPC_IMUL_GvEv + rexw, a0, a2);
        }
        break;

    OP_32_64(div2):
        tcg_out_modrm(s, OPC_GRP3_Ev + rexw, EXT3_IDIV, args[4]);
        break;
    OP_32_64(divu2):
        tcg_out_modrm(s, OPC_GRP3_Ev + rexw, EXT3_DIV, args[4]);
        break;

    OP_32_64(shl):
        /* For small constant 3-operand shift, use LEA.  */
        /* NOTE(review): (a2 - 1) < 3 appears to rely on an unsigned
           compare to match a2 in 1..3 — confirm TCGArg is unsigned.  */
        if (const_a2 && a0 != a1 && (a2 - 1) < 3) {
            if (a2 - 1 == 0) {
                /* shl $1,a1,a0 -> lea (a1,a1),a0 */
                tcg_out_modrm_sib_offset(s, OPC_LEA + rexw, a0, a1, a1, 0, 0);
            } else {
                /* shl $n,a1,a0 -> lea 0(,a1,n),a0 */
                tcg_out_modrm_sib_offset(s, OPC_LEA + rexw, a0, -1, a1, a2, 0);
            }
            break;
        }
        c = SHIFT_SHL;
        vexop = OPC_SHLX;
        goto gen_shift_maybe_vex;
    OP_32_64(shr):
        c = SHIFT_SHR;
        vexop = OPC_SHRX;
        goto gen_shift_maybe_vex;
    OP_32_64(sar):
        c = SHIFT_SAR;
        vexop = OPC_SARX;
        goto gen_shift_maybe_vex;
    OP_32_64(rotl):
        c = SHIFT_ROL;
        goto gen_shift;
    OP_32_64(rotr):
        c = SHIFT_ROR;
        goto gen_shift;
    gen_shift_maybe_vex:
        /* With BMI2, variable shifts can use the 3-operand VEX forms.  */
        if (have_bmi2) {
            if (!const_a2) {
                tcg_out_vex_modrm(s, vexop + rexw, a0, a2, a1);
                break;
            }
            tcg_out_mov(s, rexw ? TCG_TYPE_I64 : TCG_TYPE_I32, a0, a1);
        }
        /* FALLTHRU */
    gen_shift:
        if (const_a2) {
            tcg_out_shifti(s, c + rexw, a0, a2);
        } else {
            tcg_out_modrm(s, OPC_SHIFT_cl + rexw, c, a0);
        }
        break;

    OP_32_64(ctz):
        tcg_out_ctz(s, rexw, args[0], args[1], args[2], const_args[2]);
        break;
    OP_32_64(clz):
        tcg_out_clz(s, rexw, args[0], args[1], args[2], const_args[2]);
        break;
    OP_32_64(ctpop):
        tcg_out_modrm(s, OPC_POPCNT + rexw, a0, a1);
        break;

    case INDEX_op_brcond_i32:
        tcg_out_brcond32(s, a2, a0, a1, const_args[1], arg_label(args[3]), 0);
        break;
    case INDEX_op_setcond_i32:
        tcg_out_setcond32(s, args[3], a0, a1, a2, const_a2);
        break;
    case INDEX_op_movcond_i32:
        tcg_out_movcond32(s, args[5], a0, a1, a2, const_a2, args[3]);
        break;

    OP_32_64(bswap16):
        tcg_out_rolw_8(s, a0);
        break;
    OP_32_64(bswap32):
        tcg_out_bswap32(s, a0);
        break;

    OP_32_64(neg):
        tcg_out_modrm(s, OPC_GRP3_Ev + rexw, EXT3_NEG, a0);
        break;
    OP_32_64(not):
        tcg_out_modrm(s, OPC_GRP3_Ev + rexw, EXT3_NOT, a0);
        break;

    OP_32_64(ext8s):
        tcg_out_ext8s(s, a0, a1, rexw);
        break;
    OP_32_64(ext16s):
        tcg_out_ext16s(s, a0, a1, rexw);
        break;
    OP_32_64(ext8u):
        tcg_out_ext8u(s, a0, a1);
        break;
    OP_32_64(ext16u):
        tcg_out_ext16u(s, a0, a1);
        break;

    case INDEX_op_qemu_ld_i32:
        tcg_out_qemu_ld(s, args, 0);
        break;
    case INDEX_op_qemu_ld_i64:
        tcg_out_qemu_ld(s, args, 1);
        break;
    case INDEX_op_qemu_st_i32:
    case INDEX_op_qemu_st8_i32:
        tcg_out_qemu_st(s, args, 0);
        break;
    case INDEX_op_qemu_st_i64:
        tcg_out_qemu_st(s, args, 1);
        break;

    OP_32_64(mulu2):
        tcg_out_modrm(s, OPC_GRP3_Ev + rexw, EXT3_MUL, args[3]);
        break;
    OP_32_64(muls2):
        tcg_out_modrm(s, OPC_GRP3_Ev + rexw, EXT3_IMUL, args[3]);
        break;
    OP_32_64(add2):
        /* Double-word add: low half with ADD, high half with ADC.  */
        if (const_args[4]) {
            tgen_arithi(s, ARITH_ADD + rexw, a0, args[4], 1);
        } else {
            tgen_arithr(s, ARITH_ADD + rexw, a0, args[4]);
        }
        if (const_args[5]) {
            tgen_arithi(s, ARITH_ADC + rexw, a1, args[5], 1);
        } else {
            tgen_arithr(s, ARITH_ADC + rexw, a1, args[5]);
        }
        break;
    OP_32_64(sub2):
        /* Double-word subtract: low half with SUB, high half with SBB.  */
        if (const_args[4]) {
            tgen_arithi(s, ARITH_SUB + rexw, a0, args[4], 1);
        } else {
            tgen_arithr(s, ARITH_SUB + rexw, a0, args[4]);
        }
        if (const_args[5]) {
            tgen_arithi(s, ARITH_SBB + rexw, a1, args[5], 1);
        } else {
            tgen_arithr(s, ARITH_SBB + rexw, a1, args[5]);
        }
        break;

#if TCG_TARGET_REG_BITS == 32
    case INDEX_op_brcond2_i32:
        tcg_out_brcond2(s, args, const_args, 0);
        break;
    case INDEX_op_setcond2_i32:
        tcg_out_setcond2(s, args, const_args);
        break;
#else /* TCG_TARGET_REG_BITS == 64 */
    case INDEX_op_ld32s_i64:
        tcg_out_modrm_offset(s, OPC_MOVSLQ, a0, a1, a2);
        break;
    case INDEX_op_ld_i64:
        tcg_out_ld(s, TCG_TYPE_I64, a0, a1, a2);
        break;
    case INDEX_op_st_i64:
        if (const_args[0]) {
            tcg_out_modrm_offset(s, OPC_MOVL_EvIz | P_REXW, 0, a1, a2);
            tcg_out32(s, a0);
        } else {
            tcg_out_st(s, TCG_TYPE_I64, a0, a1, a2);
        }
        break;

    case INDEX_op_brcond_i64:
        tcg_out_brcond64(s, a2, a0, a1, const_args[1], arg_label(args[3]), 0);
        break;
    case INDEX_op_setcond_i64:
        tcg_out_setcond64(s, args[3], a0, a1, a2, const_a2);
        break;
    case INDEX_op_movcond_i64:
        tcg_out_movcond64(s, args[5], a0, a1, a2, const_a2, args[3]);
        break;

    case INDEX_op_bswap64_i64:
        tcg_out_bswap64(s, a0);
        break;
    case INDEX_op_extu_i32_i64:
    case INDEX_op_ext32u_i64:
    case INDEX_op_extrl_i64_i32:
        tcg_out_ext32u(s, a0, a1);
        break;
    case INDEX_op_ext_i32_i64:
    case INDEX_op_ext32s_i64:
        tcg_out_ext32s(s, a0, a1);
        break;
    case INDEX_op_extrh_i64_i32:
        /* High half of a 64-bit value: logical shift right by 32.  */
        tcg_out_shifti(s, SHIFT_SHR + P_REXW, a0, 32);
        break;
#endif

    OP_32_64(deposit):
        /* Only the byte/word sub-register deposits are supported;
           constraints elsewhere guarantee these shapes.  */
        if (args[3] == 0 && args[4] == 8) {
            /* load bits 0..7 */
            tcg_out_modrm(s, OPC_MOVB_EvGv | P_REXB_R | P_REXB_RM, a2, a0);
        } else if (args[3] == 8 && args[4] == 8) {
            /* load bits 8..15 */
            tcg_out_modrm(s, OPC_MOVB_EvGv, a2, a0 + 4);
        } else if (args[3] == 0 && args[4] == 16) {
            /* load bits 0..15 */
            tcg_out_modrm(s, OPC_MOVL_EvGv | P_DATA16, a2, a0);
        } else {
            tcg_abort();
        }
        break;

    case INDEX_op_extract_i64:
        if (a2 + args[3] == 32) {
            /* This is a 32-bit zero-extending right shift.  */
            tcg_out_mov(s, TCG_TYPE_I32, a0, a1);
            tcg_out_shifti(s, SHIFT_SHR, a0, a2);
            break;
        }
        /* FALLTHRU */
    case INDEX_op_extract_i32:
        /* On the off-chance that we can use the high-byte registers.
           Otherwise we emit the same ext16 + shift pattern that we
           would have gotten from the normal tcg-op.c expansion.  */
        tcg_debug_assert(a2 == 8 && args[3] == 8);
        if (a1 < 4 && a0 < 8) {
            tcg_out_modrm(s, OPC_MOVZBL, a0, a1 + 4);
        } else {
            tcg_out_ext16u(s, a0, a1);
            tcg_out_shifti(s, SHIFT_SHR, a0, 8);
        }
        break;

    case INDEX_op_sextract_i32:
        /* We don't implement sextract_i64, as we cannot sign-extend to
           64-bits without using the REX prefix that explicitly excludes
           access to the high-byte registers.  */
        tcg_debug_assert(a2 == 8 && args[3] == 8);
        if (a1 < 4 && a0 < 8) {
            tcg_out_modrm(s, OPC_MOVSBL, a0, a1 + 4);
        } else {
            tcg_out_ext16s(s, a0, a1, 0);
            tcg_out_shifti(s, SHIFT_SAR, a0, 8);
        }
        break;

    OP_32_64(extract2):
        /* Note that SHRD outputs to the r/m operand.  */
        tcg_out_modrm(s, OPC_SHRD_Ib + rexw, a2, a0);
        tcg_out8(s, args[3]);
        break;

    case INDEX_op_mb:
        tcg_out_mb(s, a0);
        break;
    case INDEX_op_mov_i32:  /* Always emitted via tcg_out_mov.  */
    case INDEX_op_mov_i64:
    case INDEX_op_call:     /* Always emitted via tcg_out_call.  */
    default:
        tcg_abort();
    }

#undef OP_32_64
}
2614
/*
 * Emit one TCG vector opcode as host SSE/AVX code.
 *
 * @vecl encodes the vector length (type = vecl + TCG_TYPE_V64) and
 * @vece the element size (MO_8 .. MO_64).  OPC_UD2 entries in the
 * tables below mark element sizes that have no direct x86 encoding;
 * reaching one is a bug, caught by the assert at gen_simd.
 */
static void tcg_out_vec_op(TCGContext *s, TCGOpcode opc,
                           unsigned vecl, unsigned vece,
                           const TCGArg args[TCG_MAX_OP_ARGS],
                           const int const_args[TCG_MAX_OP_ARGS])
{
    /* Per-element-size opcode tables, indexed by vece.  */
    static int const add_insn[4] = {
        OPC_PADDB, OPC_PADDW, OPC_PADDD, OPC_PADDQ
    };
    static int const ssadd_insn[4] = {
        OPC_PADDSB, OPC_PADDSW, OPC_UD2, OPC_UD2
    };
    static int const usadd_insn[4] = {
        OPC_PADDUB, OPC_PADDUW, OPC_UD2, OPC_UD2
    };
    static int const sub_insn[4] = {
        OPC_PSUBB, OPC_PSUBW, OPC_PSUBD, OPC_PSUBQ
    };
    static int const sssub_insn[4] = {
        OPC_PSUBSB, OPC_PSUBSW, OPC_UD2, OPC_UD2
    };
    static int const ussub_insn[4] = {
        OPC_PSUBUB, OPC_PSUBUW, OPC_UD2, OPC_UD2
    };
    static int const mul_insn[4] = {
        OPC_UD2, OPC_PMULLW, OPC_PMULLD, OPC_UD2
    };
    static int const shift_imm_insn[4] = {
        OPC_UD2, OPC_PSHIFTW_Ib, OPC_PSHIFTD_Ib, OPC_PSHIFTQ_Ib
    };
    static int const cmpeq_insn[4] = {
        OPC_PCMPEQB, OPC_PCMPEQW, OPC_PCMPEQD, OPC_PCMPEQQ
    };
    static int const cmpgt_insn[4] = {
        OPC_PCMPGTB, OPC_PCMPGTW, OPC_PCMPGTD, OPC_PCMPGTQ
    };
    static int const punpckl_insn[4] = {
        OPC_PUNPCKLBW, OPC_PUNPCKLWD, OPC_PUNPCKLDQ, OPC_PUNPCKLQDQ
    };
    static int const punpckh_insn[4] = {
        OPC_PUNPCKHBW, OPC_PUNPCKHWD, OPC_PUNPCKHDQ, OPC_PUNPCKHQDQ
    };
    static int const packss_insn[4] = {
        OPC_PACKSSWB, OPC_PACKSSDW, OPC_UD2, OPC_UD2
    };
    static int const packus_insn[4] = {
        OPC_PACKUSWB, OPC_PACKUSDW, OPC_UD2, OPC_UD2
    };
    static int const smin_insn[4] = {
        OPC_PMINSB, OPC_PMINSW, OPC_PMINSD, OPC_UD2
    };
    static int const smax_insn[4] = {
        OPC_PMAXSB, OPC_PMAXSW, OPC_PMAXSD, OPC_UD2
    };
    static int const umin_insn[4] = {
        OPC_PMINUB, OPC_PMINUW, OPC_PMINUD, OPC_UD2
    };
    static int const umax_insn[4] = {
        OPC_PMAXUB, OPC_PMAXUW, OPC_PMAXUD, OPC_UD2
    };
    static int const shlv_insn[4] = {
        /* TODO: AVX512 adds support for MO_16.  */
        OPC_UD2, OPC_UD2, OPC_VPSLLVD, OPC_VPSLLVQ
    };
    static int const shrv_insn[4] = {
        /* TODO: AVX512 adds support for MO_16.  */
        OPC_UD2, OPC_UD2, OPC_VPSRLVD, OPC_VPSRLVQ
    };
    static int const sarv_insn[4] = {
        /* TODO: AVX512 adds support for MO_16, MO_64.  */
        OPC_UD2, OPC_UD2, OPC_VPSRAVD, OPC_UD2
    };
    static int const shls_insn[4] = {
        OPC_UD2, OPC_PSLLW, OPC_PSLLD, OPC_PSLLQ
    };
    static int const shrs_insn[4] = {
        OPC_UD2, OPC_PSRLW, OPC_PSRLD, OPC_PSRLQ
    };
    static int const sars_insn[4] = {
        OPC_UD2, OPC_PSRAW, OPC_PSRAD, OPC_UD2
    };
    static int const abs_insn[4] = {
        /* TODO: AVX512 adds support for MO_64.  */
        OPC_PABSB, OPC_PABSW, OPC_PABSD, OPC_UD2
    };

    TCGType type = vecl + TCG_TYPE_V64;
    int insn, sub;
    TCGArg a0, a1, a2;

    a0 = args[0];
    a1 = args[1];
    a2 = args[2];

    switch (opc) {
    case INDEX_op_add_vec:
        insn = add_insn[vece];
        goto gen_simd;
    case INDEX_op_ssadd_vec:
        insn = ssadd_insn[vece];
        goto gen_simd;
    case INDEX_op_usadd_vec:
        insn = usadd_insn[vece];
        goto gen_simd;
    case INDEX_op_sub_vec:
        insn = sub_insn[vece];
        goto gen_simd;
    case INDEX_op_sssub_vec:
        insn = sssub_insn[vece];
        goto gen_simd;
    case INDEX_op_ussub_vec:
        insn = ussub_insn[vece];
        goto gen_simd;
    case INDEX_op_mul_vec:
        insn = mul_insn[vece];
        goto gen_simd;
    case INDEX_op_and_vec:
        insn = OPC_PAND;
        goto gen_simd;
    case INDEX_op_or_vec:
        insn = OPC_POR;
        goto gen_simd;
    case INDEX_op_xor_vec:
        insn = OPC_PXOR;
        goto gen_simd;
    case INDEX_op_smin_vec:
        insn = smin_insn[vece];
        goto gen_simd;
    case INDEX_op_umin_vec:
        insn = umin_insn[vece];
        goto gen_simd;
    case INDEX_op_smax_vec:
        insn = smax_insn[vece];
        goto gen_simd;
    case INDEX_op_umax_vec:
        insn = umax_insn[vece];
        goto gen_simd;
    case INDEX_op_shlv_vec:
        insn = shlv_insn[vece];
        goto gen_simd;
    case INDEX_op_shrv_vec:
        insn = shrv_insn[vece];
        goto gen_simd;
    case INDEX_op_sarv_vec:
        insn = sarv_insn[vece];
        goto gen_simd;
    case INDEX_op_shls_vec:
        insn = shls_insn[vece];
        goto gen_simd;
    case INDEX_op_shrs_vec:
        insn = shrs_insn[vece];
        goto gen_simd;
    case INDEX_op_sars_vec:
        insn = sars_insn[vece];
        goto gen_simd;
    case INDEX_op_x86_punpckl_vec:
        insn = punpckl_insn[vece];
        goto gen_simd;
    case INDEX_op_x86_punpckh_vec:
        insn = punpckh_insn[vece];
        goto gen_simd;
    case INDEX_op_x86_packss_vec:
        insn = packss_insn[vece];
        goto gen_simd;
    case INDEX_op_x86_packus_vec:
        insn = packus_insn[vece];
        goto gen_simd;
#if TCG_TARGET_REG_BITS == 32
    case INDEX_op_dup2_vec:
        /* First merge the two 32-bit inputs to a single 64-bit element. */
        tcg_out_vex_modrm(s, OPC_PUNPCKLDQ, a0, a1, a2);
        /* Then replicate the 64-bit elements across the rest of the vector. */
        if (type != TCG_TYPE_V64) {
            tcg_out_dup_vec(s, type, MO_64, a0, a0);
        }
        break;
#endif
    case INDEX_op_abs_vec:
        /* Unary operation: the input moves to the r/m slot and the
           middle (vvvv) operand is unused, passed as 0.  */
        insn = abs_insn[vece];
        a2 = a1;
        a1 = 0;
        goto gen_simd;
    gen_simd:
        tcg_debug_assert(insn != OPC_UD2);
        if (type == TCG_TYPE_V256) {
            insn |= P_VEXL;
        }
        tcg_out_vex_modrm(s, insn, a0, a1, a2);
        break;

    case INDEX_op_cmp_vec:
        /* Only EQ and GT exist in hardware; other conditions were
           lowered by expand_vec_cmp_noinv before reaching here.  */
        sub = args[3];
        if (sub == TCG_COND_EQ) {
            insn = cmpeq_insn[vece];
        } else if (sub == TCG_COND_GT) {
            insn = cmpgt_insn[vece];
        } else {
            g_assert_not_reached();
        }
        goto gen_simd;

    case INDEX_op_andc_vec:
        /* x86 PANDN computes ~a & b, so emit with operands swapped
           to obtain the andc (a & ~b) semantics.  */
        insn = OPC_PANDN;
        if (type == TCG_TYPE_V256) {
            insn |= P_VEXL;
        }
        tcg_out_vex_modrm(s, insn, a0, a2, a1);
        break;

    case INDEX_op_shli_vec:
        sub = 6;
        goto gen_shift;
    case INDEX_op_shri_vec:
        sub = 2;
        goto gen_shift;
    case INDEX_op_sari_vec:
        tcg_debug_assert(vece != MO_64);
        sub = 4;
    gen_shift:
        /* sub selects the shift kind via the opcode-extension field.  */
        tcg_debug_assert(vece != MO_8);
        insn = shift_imm_insn[vece];
        if (type == TCG_TYPE_V256) {
            insn |= P_VEXL;
        }
        tcg_out_vex_modrm(s, insn, sub, a0, a1);
        tcg_out8(s, a2);
        break;

    case INDEX_op_ld_vec:
        tcg_out_ld(s, type, a0, a1, a2);
        break;
    case INDEX_op_st_vec:
        tcg_out_st(s, type, a0, a1, a2);
        break;
    case INDEX_op_dupm_vec:
        tcg_out_dupm_vec(s, type, vece, a0, a1, a2);
        break;

    case INDEX_op_x86_shufps_vec:
        insn = OPC_SHUFPS;
        sub = args[3];
        goto gen_simd_imm8;
    case INDEX_op_x86_blend_vec:
        if (vece == MO_16) {
            insn = OPC_PBLENDW;
        } else if (vece == MO_32) {
            insn = (have_avx2 ? OPC_VPBLENDD : OPC_BLENDPS);
        } else {
            g_assert_not_reached();
        }
        sub = args[3];
        goto gen_simd_imm8;
    case INDEX_op_x86_vperm2i128_vec:
        insn = OPC_VPERM2I128;
        sub = args[3];
        goto gen_simd_imm8;
    gen_simd_imm8:
        /* As gen_simd, but with a trailing imm8 (held in sub).  */
        if (type == TCG_TYPE_V256) {
            insn |= P_VEXL;
        }
        tcg_out_vex_modrm(s, insn, a0, a1, a2);
        tcg_out8(s, sub);
        break;

    case INDEX_op_x86_vpblendvb_vec:
        insn = OPC_VPBLENDVB;
        if (type == TCG_TYPE_V256) {
            insn |= P_VEXL;
        }
        tcg_out_vex_modrm(s, insn, a0, a1, a2);
        /* The mask register is encoded in the high nibble of the imm8.  */
        tcg_out8(s, args[3] << 4);
        break;

    case INDEX_op_x86_psrldq_vec:
        /* PSRLDQ is group 14, opcode extension /3.  */
        tcg_out_vex_modrm(s, OPC_GRP14, 3, a0, a1);
        tcg_out8(s, a2);
        break;

    case INDEX_op_mov_vec:  /* Always emitted via tcg_out_mov.  */
    case INDEX_op_dup_vec:  /* Always emitted via tcg_out_dup_vec.  */
    default:
        g_assert_not_reached();
    }
}
2898
/*
 * Return the operand-constraint set for @op: the C_O*_I* macros encode
 * the number of outputs and inputs, plus a constraint code for each
 * operand (register class, immediate forms, or "0"/"1" aliases that
 * tie an input to an output register).
 */
static TCGConstraintSetIndex tcg_target_op_def(TCGOpcode op)
{
    switch (op) {
    case INDEX_op_goto_ptr:
        return C_O0_I1(r);

    case INDEX_op_ld8u_i32:
    case INDEX_op_ld8u_i64:
    case INDEX_op_ld8s_i32:
    case INDEX_op_ld8s_i64:
    case INDEX_op_ld16u_i32:
    case INDEX_op_ld16u_i64:
    case INDEX_op_ld16s_i32:
    case INDEX_op_ld16s_i64:
    case INDEX_op_ld_i32:
    case INDEX_op_ld32u_i64:
    case INDEX_op_ld32s_i64:
    case INDEX_op_ld_i64:
        return C_O1_I1(r, r);

    case INDEX_op_st8_i32:
    case INDEX_op_st8_i64:
        return C_O0_I2(qi, r);

    case INDEX_op_st16_i32:
    case INDEX_op_st16_i64:
    case INDEX_op_st_i32:
    case INDEX_op_st32_i64:
        return C_O0_I2(ri, r);

    case INDEX_op_st_i64:
        return C_O0_I2(re, r);

    case INDEX_op_add_i32:
    case INDEX_op_add_i64:
        return C_O1_I2(r, r, re);

    /* Two-operand x86 instructions: the destination aliases input 0.  */
    case INDEX_op_sub_i32:
    case INDEX_op_sub_i64:
    case INDEX_op_mul_i32:
    case INDEX_op_mul_i64:
    case INDEX_op_or_i32:
    case INDEX_op_or_i64:
    case INDEX_op_xor_i32:
    case INDEX_op_xor_i64:
        return C_O1_I2(r, 0, re);

    case INDEX_op_and_i32:
    case INDEX_op_and_i64:
        return C_O1_I2(r, 0, reZ);

    case INDEX_op_andc_i32:
    case INDEX_op_andc_i64:
        return C_O1_I2(r, r, rI);

    /* BMI2 shifts are three-operand; classic shifts need the count
       in %cl and clobber the destination in place.  */
    case INDEX_op_shl_i32:
    case INDEX_op_shl_i64:
    case INDEX_op_shr_i32:
    case INDEX_op_shr_i64:
    case INDEX_op_sar_i32:
    case INDEX_op_sar_i64:
        return have_bmi2 ? C_O1_I2(r, r, ri) : C_O1_I2(r, 0, ci);

    case INDEX_op_rotl_i32:
    case INDEX_op_rotl_i64:
    case INDEX_op_rotr_i32:
    case INDEX_op_rotr_i64:
        return C_O1_I2(r, 0, ci);

    case INDEX_op_brcond_i32:
    case INDEX_op_brcond_i64:
        return C_O0_I2(r, re);

    case INDEX_op_bswap16_i32:
    case INDEX_op_bswap16_i64:
    case INDEX_op_bswap32_i32:
    case INDEX_op_bswap32_i64:
    case INDEX_op_bswap64_i64:
    case INDEX_op_neg_i32:
    case INDEX_op_neg_i64:
    case INDEX_op_not_i32:
    case INDEX_op_not_i64:
    case INDEX_op_extrh_i64_i32:
        return C_O1_I1(r, 0);

    case INDEX_op_ext8s_i32:
    case INDEX_op_ext8s_i64:
    case INDEX_op_ext8u_i32:
    case INDEX_op_ext8u_i64:
        return C_O1_I1(r, q);

    case INDEX_op_ext16s_i32:
    case INDEX_op_ext16s_i64:
    case INDEX_op_ext16u_i32:
    case INDEX_op_ext16u_i64:
    case INDEX_op_ext32s_i64:
    case INDEX_op_ext32u_i64:
    case INDEX_op_ext_i32_i64:
    case INDEX_op_extu_i32_i64:
    case INDEX_op_extrl_i64_i32:
    case INDEX_op_extract_i32:
    case INDEX_op_extract_i64:
    case INDEX_op_sextract_i32:
    case INDEX_op_ctpop_i32:
    case INDEX_op_ctpop_i64:
        return C_O1_I1(r, r);

    case INDEX_op_extract2_i32:
    case INDEX_op_extract2_i64:
        return C_O1_I2(r, 0, r);

    case INDEX_op_deposit_i32:
    case INDEX_op_deposit_i64:
        return C_O1_I2(Q, 0, Q);

    case INDEX_op_setcond_i32:
    case INDEX_op_setcond_i64:
        return C_O1_I2(q, r, re);

    case INDEX_op_movcond_i32:
    case INDEX_op_movcond_i64:
        return C_O1_I4(r, r, re, r, 0);

    /* div/mul use the fixed %eax/%edx register pair.  */
    case INDEX_op_div2_i32:
    case INDEX_op_div2_i64:
    case INDEX_op_divu2_i32:
    case INDEX_op_divu2_i64:
        return C_O2_I3(a, d, 0, 1, r);

    case INDEX_op_mulu2_i32:
    case INDEX_op_mulu2_i64:
    case INDEX_op_muls2_i32:
    case INDEX_op_muls2_i64:
        return C_O2_I2(a, d, a, r);

    case INDEX_op_add2_i32:
    case INDEX_op_add2_i64:
    case INDEX_op_sub2_i32:
    case INDEX_op_sub2_i64:
        return C_O2_I4(r, r, 0, 1, re, re);

    case INDEX_op_ctz_i32:
    case INDEX_op_ctz_i64:
        return have_bmi1 ? C_N1_I2(r, r, rW) : C_N1_I2(r, r, r);

    case INDEX_op_clz_i32:
    case INDEX_op_clz_i64:
        return have_lzcnt ? C_N1_I2(r, r, rW) : C_N1_I2(r, r, r);

    /* A 64-bit guest address on a 32-bit host occupies two registers.  */
    case INDEX_op_qemu_ld_i32:
        return (TARGET_LONG_BITS <= TCG_TARGET_REG_BITS
                ? C_O1_I1(r, L) : C_O1_I2(r, L, L));

    case INDEX_op_qemu_st_i32:
        return (TARGET_LONG_BITS <= TCG_TARGET_REG_BITS
                ? C_O0_I2(L, L) : C_O0_I3(L, L, L));
    case INDEX_op_qemu_st8_i32:
        return (TARGET_LONG_BITS <= TCG_TARGET_REG_BITS
                ? C_O0_I2(s, L) : C_O0_I3(s, L, L));

    case INDEX_op_qemu_ld_i64:
        return (TCG_TARGET_REG_BITS == 64 ? C_O1_I1(r, L)
                : TARGET_LONG_BITS <= TCG_TARGET_REG_BITS ? C_O2_I1(r, r, L)
                : C_O2_I2(r, r, L, L));

    case INDEX_op_qemu_st_i64:
        return (TCG_TARGET_REG_BITS == 64 ? C_O0_I2(L, L)
                : TARGET_LONG_BITS <= TCG_TARGET_REG_BITS ? C_O0_I3(L, L, L)
                : C_O0_I4(L, L, L, L));

    case INDEX_op_brcond2_i32:
        return C_O0_I4(r, r, ri, ri);

    case INDEX_op_setcond2_i32:
        return C_O1_I4(r, r, r, ri, ri);

    case INDEX_op_ld_vec:
    case INDEX_op_dupm_vec:
        return C_O1_I1(x, r);

    case INDEX_op_st_vec:
        return C_O0_I2(x, r);

    case INDEX_op_add_vec:
    case INDEX_op_sub_vec:
    case INDEX_op_mul_vec:
    case INDEX_op_and_vec:
    case INDEX_op_or_vec:
    case INDEX_op_xor_vec:
    case INDEX_op_andc_vec:
    case INDEX_op_ssadd_vec:
    case INDEX_op_usadd_vec:
    case INDEX_op_sssub_vec:
    case INDEX_op_ussub_vec:
    case INDEX_op_smin_vec:
    case INDEX_op_umin_vec:
    case INDEX_op_smax_vec:
    case INDEX_op_umax_vec:
    case INDEX_op_shlv_vec:
    case INDEX_op_shrv_vec:
    case INDEX_op_sarv_vec:
    case INDEX_op_shls_vec:
    case INDEX_op_shrs_vec:
    case INDEX_op_sars_vec:
    case INDEX_op_rotls_vec:
    case INDEX_op_cmp_vec:
    case INDEX_op_x86_shufps_vec:
    case INDEX_op_x86_blend_vec:
    case INDEX_op_x86_packss_vec:
    case INDEX_op_x86_packus_vec:
    case INDEX_op_x86_vperm2i128_vec:
    case INDEX_op_x86_punpckl_vec:
    case INDEX_op_x86_punpckh_vec:
#if TCG_TARGET_REG_BITS == 32
    case INDEX_op_dup2_vec:
#endif
        return C_O1_I2(x, x, x);

    case INDEX_op_abs_vec:
    case INDEX_op_dup_vec:
    case INDEX_op_shli_vec:
    case INDEX_op_shri_vec:
    case INDEX_op_sari_vec:
    case INDEX_op_x86_psrldq_vec:
        return C_O1_I1(x, x);

    case INDEX_op_x86_vpblendvb_vec:
        return C_O1_I3(x, x, x, x);

    default:
        g_assert_not_reached();
    }
}
3132
/*
 * Report how a vector op can be implemented for (@type, @vece):
 * 1 = emitted directly by tcg_out_vec_op, 0 = not supported,
 * -1 = supported via expansion in tcg_expand_vec_op.
 */
int tcg_can_emit_vec_op(TCGOpcode opc, TCGType type, unsigned vece)
{
    switch (opc) {
    case INDEX_op_add_vec:
    case INDEX_op_sub_vec:
    case INDEX_op_and_vec:
    case INDEX_op_or_vec:
    case INDEX_op_xor_vec:
    case INDEX_op_andc_vec:
        return 1;
    case INDEX_op_rotli_vec:
    case INDEX_op_cmp_vec:
    case INDEX_op_cmpsel_vec:
        return -1;

    case INDEX_op_shli_vec:
    case INDEX_op_shri_vec:
        /* We must expand the operation for MO_8.  */
        return vece == MO_8 ? -1 : 1;

    case INDEX_op_sari_vec:
        /* We must expand the operation for MO_8.  */
        if (vece == MO_8) {
            return -1;
        }
        /* We can emulate this for MO_64, but it does not pay off
           unless we're producing at least 4 values.  */
        if (vece == MO_64) {
            return type >= TCG_TYPE_V256 ? -1 : 0;
        }
        return 1;

    case INDEX_op_shls_vec:
    case INDEX_op_shrs_vec:
        return vece >= MO_16;
    case INDEX_op_sars_vec:
        return vece >= MO_16 && vece <= MO_32;
    case INDEX_op_rotls_vec:
        return vece >= MO_16 ? -1 : 0;

    /* Variable shifts require the AVX2 VPS*LV* instructions.  */
    case INDEX_op_shlv_vec:
    case INDEX_op_shrv_vec:
        return have_avx2 && vece >= MO_32;
    case INDEX_op_sarv_vec:
        return have_avx2 && vece == MO_32;
    case INDEX_op_rotlv_vec:
    case INDEX_op_rotrv_vec:
        return have_avx2 && vece >= MO_32 ? -1 : 0;

    case INDEX_op_mul_vec:
        if (vece == MO_8) {
            /* We can expand the operation for MO_8.  */
            return -1;
        }
        if (vece == MO_64) {
            return 0;
        }
        return 1;

    case INDEX_op_ssadd_vec:
    case INDEX_op_usadd_vec:
    case INDEX_op_sssub_vec:
    case INDEX_op_ussub_vec:
        return vece <= MO_16;
    case INDEX_op_smin_vec:
    case INDEX_op_smax_vec:
    case INDEX_op_umin_vec:
    case INDEX_op_umax_vec:
    case INDEX_op_abs_vec:
        return vece <= MO_32;

    default:
        return 0;
    }
}
3208
/*
 * Expand a MO_8 immediate shift (shli/shri, or the rotate path used by
 * expand_vec_rotli), for which x86 has no byte-element instruction.
 * Computes v0 = v1 <op> imm by widening to 16-bit lanes.
 */
static void expand_vec_shi(TCGType type, unsigned vece, TCGOpcode opc,
                           TCGv_vec v0, TCGv_vec v1, TCGArg imm)
{
    TCGv_vec t1, t2;

    tcg_debug_assert(vece == MO_8);

    t1 = tcg_temp_new_vec(type);
    t2 = tcg_temp_new_vec(type);

    /*
     * Unpack to W, shift, and repack.  Tricky bits:
     * (1) Use punpck*bw x,x to produce DDCCBBAA,
     *     i.e. duplicate in other half of the 16-bit lane.
     * (2) For right-shift, add 8 so that the high half of the lane
     *     becomes zero.  For left-shift, and left-rotate, we must
     *     shift up and down again.
     * (3) Step 2 leaves high half zero such that PACKUSWB
     *     (pack with unsigned saturation) does not modify
     *     the quantity.
     */
    vec_gen_3(INDEX_op_x86_punpckl_vec, type, MO_8,
              tcgv_vec_arg(t1), tcgv_vec_arg(v1), tcgv_vec_arg(v1));
    vec_gen_3(INDEX_op_x86_punpckh_vec, type, MO_8,
              tcgv_vec_arg(t2), tcgv_vec_arg(v1), tcgv_vec_arg(v1));

    if (opc != INDEX_op_rotli_vec) {
        imm += 8;
    }
    if (opc == INDEX_op_shri_vec) {
        tcg_gen_shri_vec(MO_16, t1, t1, imm);
        tcg_gen_shri_vec(MO_16, t2, t2, imm);
    } else {
        tcg_gen_shli_vec(MO_16, t1, t1, imm);
        tcg_gen_shli_vec(MO_16, t2, t2, imm);
        tcg_gen_shri_vec(MO_16, t1, t1, 8);
        tcg_gen_shri_vec(MO_16, t2, t2, 8);
    }

    vec_gen_3(INDEX_op_x86_packus_vec, type, MO_8,
              tcgv_vec_arg(v0), tcgv_vec_arg(t1), tcgv_vec_arg(t2));
    tcg_temp_free_vec(t1);
    tcg_temp_free_vec(t2);
}
3253
/*
 * Expand an immediate arithmetic right shift for the element sizes
 * that x86 cannot shift directly: MO_8 (no byte shifts at all) and
 * MO_64 (no PSRAQ before AVX512).
 */
static void expand_vec_sari(TCGType type, unsigned vece,
                            TCGv_vec v0, TCGv_vec v1, TCGArg imm)
{
    TCGv_vec t1, t2;

    switch (vece) {
    case MO_8:
        /* Unpack to W, shift, and repack, as in expand_vec_shi.  */
        t1 = tcg_temp_new_vec(type);
        t2 = tcg_temp_new_vec(type);
        vec_gen_3(INDEX_op_x86_punpckl_vec, type, MO_8,
                  tcgv_vec_arg(t1), tcgv_vec_arg(v1), tcgv_vec_arg(v1));
        vec_gen_3(INDEX_op_x86_punpckh_vec, type, MO_8,
                  tcgv_vec_arg(t2), tcgv_vec_arg(v1), tcgv_vec_arg(v1));
        /* +8 shifts out the duplicated low copy of each byte.  */
        tcg_gen_sari_vec(MO_16, t1, t1, imm + 8);
        tcg_gen_sari_vec(MO_16, t2, t2, imm + 8);
        /* Signed saturation is a no-op here: values are already in range. */
        vec_gen_3(INDEX_op_x86_packss_vec, type, MO_8,
                  tcgv_vec_arg(v0), tcgv_vec_arg(t1), tcgv_vec_arg(t2));
        tcg_temp_free_vec(t1);
        tcg_temp_free_vec(t2);
        break;

    case MO_64:
        if (imm <= 32) {
            /*
             * We can emulate a small sign extend by performing an arithmetic
             * 32-bit shift and overwriting the high half of a 64-bit logical
             * shift.  Note that the ISA says shift of 32 is valid, but TCG
             * does not, so we have to bound the smaller shift -- we get the
             * same result in the high half either way.
             */
            t1 = tcg_temp_new_vec(type);
            tcg_gen_sari_vec(MO_32, t1, v1, MIN(imm, 31));
            tcg_gen_shri_vec(MO_64, v0, v1, imm);
            /* imm8 0xaa selects the odd (high) 32-bit lanes from t1.  */
            vec_gen_4(INDEX_op_x86_blend_vec, type, MO_32,
                      tcgv_vec_arg(v0), tcgv_vec_arg(v0),
                      tcgv_vec_arg(t1), 0xaa);
            tcg_temp_free_vec(t1);
        } else {
            /* Otherwise we will need to use a compare vs 0 to produce
             * the sign-extend, shift and merge.
             */
            t1 = tcg_const_zeros_vec(type);
            tcg_gen_cmp_vec(TCG_COND_GT, MO_64, t1, t1, v1);
            tcg_gen_shri_vec(MO_64, v0, v1, imm);
            tcg_gen_shli_vec(MO_64, t1, t1, 64 - imm);
            tcg_gen_or_vec(MO_64, v0, v0, t1);
            tcg_temp_free_vec(t1);
        }
        break;

    default:
        g_assert_not_reached();
    }
}
3309
3310static void expand_vec_rotli(TCGType type, unsigned vece,
3311                             TCGv_vec v0, TCGv_vec v1, TCGArg imm)
3312{
3313    TCGv_vec t;
3314
3315    if (vece == MO_8) {
3316        expand_vec_shi(type, vece, INDEX_op_rotli_vec, v0, v1, imm);
3317        return;
3318    }
3319
3320    t = tcg_temp_new_vec(type);
3321    tcg_gen_shli_vec(vece, t, v1, imm);
3322    tcg_gen_shri_vec(vece, v0, v1, (8 << vece) - imm);
3323    tcg_gen_or_vec(vece, v0, v0, t);
3324    tcg_temp_free_vec(t);
3325}
3326
3327static void expand_vec_rotls(TCGType type, unsigned vece,
3328                             TCGv_vec v0, TCGv_vec v1, TCGv_i32 lsh)
3329{
3330    TCGv_i32 rsh;
3331    TCGv_vec t;
3332
3333    tcg_debug_assert(vece != MO_8);
3334
3335    t = tcg_temp_new_vec(type);
3336    rsh = tcg_temp_new_i32();
3337
3338    tcg_gen_neg_i32(rsh, lsh);
3339    tcg_gen_andi_i32(rsh, rsh, (8 << vece) - 1);
3340    tcg_gen_shls_vec(vece, t, v1, lsh);
3341    tcg_gen_shrs_vec(vece, v0, v1, rsh);
3342    tcg_gen_or_vec(vece, v0, v0, t);
3343    tcg_temp_free_vec(t);
3344    tcg_temp_free_i32(rsh);
3345}
3346
3347static void expand_vec_rotv(TCGType type, unsigned vece, TCGv_vec v0,
3348                            TCGv_vec v1, TCGv_vec sh, bool right)
3349{
3350    TCGv_vec t = tcg_temp_new_vec(type);
3351
3352    tcg_gen_dupi_vec(vece, t, 8 << vece);
3353    tcg_gen_sub_vec(vece, t, t, sh);
3354    if (right) {
3355        tcg_gen_shlv_vec(vece, t, v1, t);
3356        tcg_gen_shrv_vec(vece, v0, v1, sh);
3357    } else {
3358        tcg_gen_shrv_vec(vece, t, v1, t);
3359        tcg_gen_shlv_vec(vece, v0, v1, sh);
3360    }
3361    tcg_gen_or_vec(vece, v0, v0, t);
3362    tcg_temp_free_vec(t);
3363}
3364
3365static void expand_vec_mul(TCGType type, unsigned vece,
3366                           TCGv_vec v0, TCGv_vec v1, TCGv_vec v2)
3367{
3368    TCGv_vec t1, t2, t3, t4, zero;
3369
3370    tcg_debug_assert(vece == MO_8);
3371
3372    /*
3373     * Unpack v1 bytes to words, 0 | x.
3374     * Unpack v2 bytes to words, y | 0.
3375     * This leaves the 8-bit result, x * y, with 8 bits of right padding.
3376     * Shift logical right by 8 bits to clear the high 8 bytes before
3377     * using an unsigned saturated pack.
3378     *
3379     * The difference between the V64, V128 and V256 cases is merely how
3380     * we distribute the expansion between temporaries.
3381     */
3382    switch (type) {
3383    case TCG_TYPE_V64:
3384        t1 = tcg_temp_new_vec(TCG_TYPE_V128);
3385        t2 = tcg_temp_new_vec(TCG_TYPE_V128);
3386        zero = tcg_constant_vec(TCG_TYPE_V128, MO_8, 0);
3387        vec_gen_3(INDEX_op_x86_punpckl_vec, TCG_TYPE_V128, MO_8,
3388                  tcgv_vec_arg(t1), tcgv_vec_arg(v1), tcgv_vec_arg(zero));
3389        vec_gen_3(INDEX_op_x86_punpckl_vec, TCG_TYPE_V128, MO_8,
3390                  tcgv_vec_arg(t2), tcgv_vec_arg(zero), tcgv_vec_arg(v2));
3391        tcg_gen_mul_vec(MO_16, t1, t1, t2);
3392        tcg_gen_shri_vec(MO_16, t1, t1, 8);
3393        vec_gen_3(INDEX_op_x86_packus_vec, TCG_TYPE_V128, MO_8,
3394                  tcgv_vec_arg(v0), tcgv_vec_arg(t1), tcgv_vec_arg(t1));
3395        tcg_temp_free_vec(t1);
3396        tcg_temp_free_vec(t2);
3397        break;
3398
3399    case TCG_TYPE_V128:
3400    case TCG_TYPE_V256:
3401        t1 = tcg_temp_new_vec(type);
3402        t2 = tcg_temp_new_vec(type);
3403        t3 = tcg_temp_new_vec(type);
3404        t4 = tcg_temp_new_vec(type);
3405        zero = tcg_constant_vec(TCG_TYPE_V128, MO_8, 0);
3406        vec_gen_3(INDEX_op_x86_punpckl_vec, type, MO_8,
3407                  tcgv_vec_arg(t1), tcgv_vec_arg(v1), tcgv_vec_arg(zero));
3408        vec_gen_3(INDEX_op_x86_punpckl_vec, type, MO_8,
3409                  tcgv_vec_arg(t2), tcgv_vec_arg(zero), tcgv_vec_arg(v2));
3410        vec_gen_3(INDEX_op_x86_punpckh_vec, type, MO_8,
3411                  tcgv_vec_arg(t3), tcgv_vec_arg(v1), tcgv_vec_arg(zero));
3412        vec_gen_3(INDEX_op_x86_punpckh_vec, type, MO_8,
3413                  tcgv_vec_arg(t4), tcgv_vec_arg(zero), tcgv_vec_arg(v2));
3414        tcg_gen_mul_vec(MO_16, t1, t1, t2);
3415        tcg_gen_mul_vec(MO_16, t3, t3, t4);
3416        tcg_gen_shri_vec(MO_16, t1, t1, 8);
3417        tcg_gen_shri_vec(MO_16, t3, t3, 8);
3418        vec_gen_3(INDEX_op_x86_packus_vec, type, MO_8,
3419                  tcgv_vec_arg(v0), tcgv_vec_arg(t1), tcgv_vec_arg(t3));
3420        tcg_temp_free_vec(t1);
3421        tcg_temp_free_vec(t2);
3422        tcg_temp_free_vec(t3);
3423        tcg_temp_free_vec(t4);
3424        break;
3425
3426    default:
3427        g_assert_not_reached();
3428    }
3429}
3430
/*
 * Lower an arbitrary vector comparison to the only two conditions x86
 * provides, EQ and GT, using swaps, unsigned-min/max tricks, or a bias
 * into signed range.  Returns true when the emitted result is the
 * inverse of @cond, in which case the caller must compensate.
 */
static bool expand_vec_cmp_noinv(TCGType type, unsigned vece, TCGv_vec v0,
                                 TCGv_vec v1, TCGv_vec v2, TCGCond cond)
{
    /* Transformations needed to reduce @cond to EQ or GT.  */
    enum {
        NEED_INV  = 1,
        NEED_SWAP = 2,
        NEED_BIAS = 4,
        NEED_UMIN = 8,
        NEED_UMAX = 16,
    };
    TCGv_vec t1, t2, t3;
    uint8_t fixup;

    switch (cond) {
    case TCG_COND_EQ:
    case TCG_COND_GT:
        fixup = 0;
        break;
    case TCG_COND_NE:
    case TCG_COND_LE:
        fixup = NEED_INV;
        break;
    case TCG_COND_LT:
        fixup = NEED_SWAP;
        break;
    case TCG_COND_GE:
        fixup = NEED_SWAP | NEED_INV;
        break;
    /* For unsigned conditions: umin/umax reduces to EQ when the
       element has PMINU/PMAXU (vece <= MO_32); otherwise bias both
       operands into signed range and use the signed compare.  */
    case TCG_COND_LEU:
        if (vece <= MO_32) {
            fixup = NEED_UMIN;
        } else {
            fixup = NEED_BIAS | NEED_INV;
        }
        break;
    case TCG_COND_GTU:
        if (vece <= MO_32) {
            fixup = NEED_UMIN | NEED_INV;
        } else {
            fixup = NEED_BIAS;
        }
        break;
    case TCG_COND_GEU:
        if (vece <= MO_32) {
            fixup = NEED_UMAX;
        } else {
            fixup = NEED_BIAS | NEED_SWAP | NEED_INV;
        }
        break;
    case TCG_COND_LTU:
        if (vece <= MO_32) {
            fixup = NEED_UMAX | NEED_INV;
        } else {
            fixup = NEED_BIAS | NEED_SWAP;
        }
        break;
    default:
        g_assert_not_reached();
    }

    if (fixup & NEED_INV) {
        cond = tcg_invert_cond(cond);
    }
    if (fixup & NEED_SWAP) {
        t1 = v1, v1 = v2, v2 = t1;
        cond = tcg_swap_cond(cond);
    }

    t1 = t2 = NULL;
    if (fixup & (NEED_UMIN | NEED_UMAX)) {
        /* x <=u y  <=>  umin(x,y) == x;  x >=u y  <=>  umax(x,y) == x.  */
        t1 = tcg_temp_new_vec(type);
        if (fixup & NEED_UMIN) {
            tcg_gen_umin_vec(vece, t1, v1, v2);
        } else {
            tcg_gen_umax_vec(vece, t1, v1, v2);
        }
        v2 = t1;
        cond = TCG_COND_EQ;
    } else if (fixup & NEED_BIAS) {
        /* Subtract the sign bit from both operands, turning the
           unsigned comparison into an equivalent signed one.  */
        t1 = tcg_temp_new_vec(type);
        t2 = tcg_temp_new_vec(type);
        t3 = tcg_constant_vec(type, vece, 1ull << ((8 << vece) - 1));
        tcg_gen_sub_vec(vece, t1, v1, t3);
        tcg_gen_sub_vec(vece, t2, v2, t3);
        v1 = t1;
        v2 = t2;
        cond = tcg_signed_cond(cond);
    }

    tcg_debug_assert(cond == TCG_COND_EQ || cond == TCG_COND_GT);
    /* Expand directly; do not recurse.  */
    vec_gen_4(INDEX_op_cmp_vec, type, vece,
              tcgv_vec_arg(v0), tcgv_vec_arg(v1), tcgv_vec_arg(v2), cond);

    if (t1) {
        tcg_temp_free_vec(t1);
        if (t2) {
            tcg_temp_free_vec(t2);
        }
    }
    return fixup & NEED_INV;
}
3533
3534static void expand_vec_cmp(TCGType type, unsigned vece, TCGv_vec v0,
3535                           TCGv_vec v1, TCGv_vec v2, TCGCond cond)
3536{
3537    if (expand_vec_cmp_noinv(type, vece, v0, v1, v2, cond)) {
3538        tcg_gen_not_vec(vece, v0, v0);
3539    }
3540}
3541
3542static void expand_vec_cmpsel(TCGType type, unsigned vece, TCGv_vec v0,
3543                              TCGv_vec c1, TCGv_vec c2,
3544                              TCGv_vec v3, TCGv_vec v4, TCGCond cond)
3545{
3546    TCGv_vec t = tcg_temp_new_vec(type);
3547
3548    if (expand_vec_cmp_noinv(type, vece, t, c1, c2, cond)) {
3549        /* Invert the sense of the compare by swapping arguments.  */
3550        TCGv_vec x;
3551        x = v3, v3 = v4, v4 = x;
3552    }
3553    vec_gen_4(INDEX_op_x86_vpblendvb_vec, type, vece,
3554              tcgv_vec_arg(v0), tcgv_vec_arg(v4),
3555              tcgv_vec_arg(v3), tcgv_vec_arg(t));
3556    tcg_temp_free_vec(t);
3557}
3558
/*
 * Expand the vector ops for which tcg_can_emit_vec_op returned -1.
 * The variadic arguments are the remaining TCGArg operands; the order
 * of the va_arg reads below must match the operand order of each op.
 */
void tcg_expand_vec_op(TCGOpcode opc, TCGType type, unsigned vece,
                       TCGArg a0, ...)
{
    va_list va;
    TCGArg a2;
    TCGv_vec v0, v1, v2, v3, v4;

    va_start(va, a0);
    v0 = temp_tcgv_vec(arg_temp(a0));
    v1 = temp_tcgv_vec(arg_temp(va_arg(va, TCGArg)));
    a2 = va_arg(va, TCGArg);

    switch (opc) {
    case INDEX_op_shli_vec:
    case INDEX_op_shri_vec:
        /* a2 is the immediate shift count.  */
        expand_vec_shi(type, vece, opc, v0, v1, a2);
        break;

    case INDEX_op_sari_vec:
        expand_vec_sari(type, vece, v0, v1, a2);
        break;

    case INDEX_op_rotli_vec:
        expand_vec_rotli(type, vece, v0, v1, a2);
        break;

    case INDEX_op_rotls_vec:
        /* a2 is a scalar i32 shift count.  */
        expand_vec_rotls(type, vece, v0, v1, temp_tcgv_i32(arg_temp(a2)));
        break;

    case INDEX_op_rotlv_vec:
        v2 = temp_tcgv_vec(arg_temp(a2));
        expand_vec_rotv(type, vece, v0, v1, v2, false);
        break;
    case INDEX_op_rotrv_vec:
        v2 = temp_tcgv_vec(arg_temp(a2));
        expand_vec_rotv(type, vece, v0, v1, v2, true);
        break;

    case INDEX_op_mul_vec:
        v2 = temp_tcgv_vec(arg_temp(a2));
        expand_vec_mul(type, vece, v0, v1, v2);
        break;

    case INDEX_op_cmp_vec:
        v2 = temp_tcgv_vec(arg_temp(a2));
        expand_vec_cmp(type, vece, v0, v1, v2, va_arg(va, TCGArg));
        break;

    case INDEX_op_cmpsel_vec:
        v2 = temp_tcgv_vec(arg_temp(a2));
        v3 = temp_tcgv_vec(arg_temp(va_arg(va, TCGArg)));
        v4 = temp_tcgv_vec(arg_temp(va_arg(va, TCGArg)));
        expand_vec_cmpsel(type, vece, v0, v1, v2, v3, v4, va_arg(va, TCGArg));
        break;

    default:
        break;
    }

    va_end(va);
}
3621
/*
 * Registers the generated prologue must preserve for its caller, pushed
 * in this order.  The .debug_frame unwind tables below must match it.
 */
static const int tcg_target_callee_save_regs[] = {
#if TCG_TARGET_REG_BITS == 64
    TCG_REG_RBP,
    TCG_REG_RBX,
#if defined(_WIN64)
    /* The Win64 ABI additionally treats RDI and RSI as callee-saved.  */
    TCG_REG_RDI,
    TCG_REG_RSI,
#endif
    TCG_REG_R12,
    TCG_REG_R13,
    TCG_REG_R14, /* Currently used for the global env. */
    TCG_REG_R15,
#else
    TCG_REG_EBP, /* Currently used for the global env. */
    TCG_REG_EBX,
    TCG_REG_ESI,
    TCG_REG_EDI,
#endif
};
3641
/* Compute frame size via macros, to share between tcg_target_qemu_prologue
   and tcg_register_jit.  */

/* Bytes pushed on entry: the return address plus each register saved by
   the prologue's push loop.  */
#define PUSH_SIZE \
    ((1 + ARRAY_SIZE(tcg_target_callee_save_regs)) \
     * (TCG_TARGET_REG_BITS / 8))

/* Total frame: pushes, outgoing call-argument area and the TCG temp
   buffer, rounded up to the target stack alignment.  */
#define FRAME_SIZE \
    ((PUSH_SIZE \
      + TCG_STATIC_CALL_ARGS_SIZE \
      + CPU_TEMP_BUF_NLONGS * sizeof(long) \
      + TCG_TARGET_STACK_ALIGN - 1) \
     & ~(TCG_TARGET_STACK_ALIGN - 1))
3655
/* Generate global QEMU prologue and epilogue code */
static void tcg_target_qemu_prologue(TCGContext *s)
{
    int i, stack_addend;

    /* TB prologue */

    /* Reserve some stack space, also for TCG temps.  */
    /* stack_addend = bytes to subtract from ESP beyond the pushes.  */
    stack_addend = FRAME_SIZE - PUSH_SIZE;
    tcg_set_frame(s, TCG_REG_CALL_STACK, TCG_STATIC_CALL_ARGS_SIZE,
                  CPU_TEMP_BUF_NLONGS * sizeof(long));

    /* Save all callee saved registers.  */
    for (i = 0; i < ARRAY_SIZE(tcg_target_callee_save_regs); i++) {
        tcg_out_push(s, tcg_target_callee_save_regs[i]);
    }

#if TCG_TARGET_REG_BITS == 32
    /* Load env (first stack argument) into TCG_AREG0; it sits above the
       return address and the registers pushed above.  */
    tcg_out_ld(s, TCG_TYPE_PTR, TCG_AREG0, TCG_REG_ESP,
               (ARRAY_SIZE(tcg_target_callee_save_regs) + 1) * 4);
    tcg_out_addi(s, TCG_REG_ESP, -stack_addend);
    /* jmp *tb.  */
    /* tb is the second stack argument; its offset from the new ESP must
       account for the stack_addend just subtracted.  */
    tcg_out_modrm_offset(s, OPC_GRP5, EXT5_JMPN_Ev, TCG_REG_ESP,
                         (ARRAY_SIZE(tcg_target_callee_save_regs) + 2) * 4
                         + stack_addend);
#else
# if !defined(CONFIG_SOFTMMU) && TCG_TARGET_REG_BITS == 64
    if (guest_base) {
        int seg = setup_guest_base_seg();
        if (seg != 0) {
            x86_guest_base_seg = seg;
        } else if (guest_base == (int32_t)guest_base) {
            /* The guest base fits in a signed 32-bit displacement.  */
            x86_guest_base_offset = guest_base;
        } else {
            /* Choose R12 because, as a base, it requires a SIB byte. */
            x86_guest_base_index = TCG_REG_R12;
            tcg_out_movi(s, TCG_TYPE_PTR, x86_guest_base_index, guest_base);
            tcg_regset_set_reg(s->reserved_regs, x86_guest_base_index);
        }
    }
# endif
    /* env arrives in the first argument register, tb in the second.  */
    tcg_out_mov(s, TCG_TYPE_PTR, TCG_AREG0, tcg_target_call_iarg_regs[0]);
    tcg_out_addi(s, TCG_REG_ESP, -stack_addend);
    /* jmp *tb.  */
    tcg_out_modrm(s, OPC_GRP5, EXT5_JMPN_Ev, tcg_target_call_iarg_regs[1]);
#endif

    /*
     * Return path for goto_ptr. Set return value to 0, a-la exit_tb,
     * and fall through to the rest of the epilogue.
     */
    tcg_code_gen_epilogue = tcg_splitwx_to_rx(s->code_ptr);
    tcg_out_movi(s, TCG_TYPE_REG, TCG_REG_EAX, 0);

    /* TB epilogue */
    tb_ret_addr = tcg_splitwx_to_rx(s->code_ptr);

    tcg_out_addi(s, TCG_REG_CALL_STACK, stack_addend);

    /* Clear the upper YMM state before returning, avoiding AVX<->SSE
       transition penalties in the caller.  */
    if (have_avx2) {
        tcg_out_vex_opc(s, OPC_VZEROUPPER, 0, 0, 0, 0);
    }
    /* Restore callee-saved registers in reverse push order.  */
    for (i = ARRAY_SIZE(tcg_target_callee_save_regs) - 1; i >= 0; i--) {
        tcg_out_pop(s, tcg_target_callee_save_regs[i]);
    }
    tcg_out_opc(s, OPC_RET, 0, 0, 0);
}
3723
/*
 * Fill a gap in the code buffer with no-ops.  0x90 is the one-byte x86
 * NOP instruction; count is in bytes (assumes tcg_insn_unit is a byte
 * on this target — see tcg-target.h).
 */
static void tcg_out_nop_fill(tcg_insn_unit *p, int count)
{
    memset(p, 0x90, count);
}
3728
3729static void tcg_target_init(TCGContext *s)
3730{
3731#ifdef CONFIG_CPUID_H
3732    unsigned a, b, c, d, b7 = 0;
3733    int max = __get_cpuid_max(0, 0);
3734
3735    if (max >= 7) {
3736        /* BMI1 is available on AMD Piledriver and Intel Haswell CPUs.  */
3737        __cpuid_count(7, 0, a, b7, c, d);
3738        have_bmi1 = (b7 & bit_BMI) != 0;
3739        have_bmi2 = (b7 & bit_BMI2) != 0;
3740    }
3741
3742    if (max >= 1) {
3743        __cpuid(1, a, b, c, d);
3744#ifndef have_cmov
3745        /* For 32-bit, 99% certainty that we're running on hardware that
3746           supports cmov, but we still need to check.  In case cmov is not
3747           available, we'll use a small forward branch.  */
3748        have_cmov = (d & bit_CMOV) != 0;
3749#endif
3750
3751        /* MOVBE is only available on Intel Atom and Haswell CPUs, so we
3752           need to probe for it.  */
3753        have_movbe = (c & bit_MOVBE) != 0;
3754        have_popcnt = (c & bit_POPCNT) != 0;
3755
3756        /* There are a number of things we must check before we can be
3757           sure of not hitting invalid opcode.  */
3758        if (c & bit_OSXSAVE) {
3759            unsigned xcrl, xcrh;
3760            /* The xgetbv instruction is not available to older versions of
3761             * the assembler, so we encode the instruction manually.
3762             */
3763            asm(".byte 0x0f, 0x01, 0xd0" : "=a" (xcrl), "=d" (xcrh) : "c" (0));
3764            if ((xcrl & 6) == 6) {
3765                have_avx1 = (c & bit_AVX) != 0;
3766                have_avx2 = (b7 & bit_AVX2) != 0;
3767            }
3768        }
3769    }
3770
3771    max = __get_cpuid_max(0x8000000, 0);
3772    if (max >= 1) {
3773        __cpuid(0x80000001, a, b, c, d);
3774        /* LZCNT was introduced with AMD Barcelona and Intel Haswell CPUs.  */
3775        have_lzcnt = (c & bit_LZCNT) != 0;
3776    }
3777#endif /* CONFIG_CPUID_H */
3778
3779    tcg_target_available_regs[TCG_TYPE_I32] = ALL_GENERAL_REGS;
3780    if (TCG_TARGET_REG_BITS == 64) {
3781        tcg_target_available_regs[TCG_TYPE_I64] = ALL_GENERAL_REGS;
3782    }
3783    if (have_avx1) {
3784        tcg_target_available_regs[TCG_TYPE_V64] = ALL_VECTOR_REGS;
3785        tcg_target_available_regs[TCG_TYPE_V128] = ALL_VECTOR_REGS;
3786    }
3787    if (have_avx2) {
3788        tcg_target_available_regs[TCG_TYPE_V256] = ALL_VECTOR_REGS;
3789    }
3790
3791    tcg_target_call_clobber_regs = ALL_VECTOR_REGS;
3792    tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_EAX);
3793    tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_EDX);
3794    tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_ECX);
3795    if (TCG_TARGET_REG_BITS == 64) {
3796#if !defined(_WIN64)
3797        tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_RDI);
3798        tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_RSI);
3799#endif
3800        tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_R8);
3801        tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_R9);
3802        tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_R10);
3803        tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_R11);
3804    }
3805
3806    s->reserved_regs = 0;
3807    tcg_regset_set_reg(s->reserved_regs, TCG_REG_CALL_STACK);
3808}
3809
/*
 * Unwind descriptor for the prologue above, handed to the ELF JIT
 * interface by tcg_register_jit.
 */
typedef struct {
    DebugFrameHeader h;
    uint8_t fde_def_cfa[4];     /* DW_CFA_def_cfa + two-byte uleb128 size */
    uint8_t fde_reg_ofs[14];    /* DW_CFA_offset pairs; 0 pads as DW_CFA_nop */
} DebugFrame;

/* We're expecting a 2 byte uleb128 encoded value.  */
QEMU_BUILD_BUG_ON(FRAME_SIZE >= (1 << 14));
3818
#if !defined(__ELF__)
    /* Host machine without ELF. */
#elif TCG_TARGET_REG_BITS == 64
#define ELF_HOST_MACHINE EM_X86_64
/* DWARF CFI for the 64-bit prologue: CFA is %rsp + FRAME_SIZE; the return
   address and callee-saved registers live at fixed offsets below it.  */
static const DebugFrame debug_frame = {
    .h.cie.len = sizeof(DebugFrameCIE)-4, /* length after .len member */
    .h.cie.id = -1,
    .h.cie.version = 1,
    .h.cie.code_align = 1,
    .h.cie.data_align = 0x78,             /* sleb128 -8 */
    .h.cie.return_column = 16,

    /* Total FDE size does not include the "len" member.  */
    .h.fde.len = sizeof(DebugFrame) - offsetof(DebugFrame, h.fde.cie_offset),

    .fde_def_cfa = {
        12, 7,                          /* DW_CFA_def_cfa %rsp, ... */
        (FRAME_SIZE & 0x7f) | 0x80,     /* ... uleb128 FRAME_SIZE */
        (FRAME_SIZE >> 7)
    },
    .fde_reg_ofs = {
        0x90, 1,                        /* DW_CFA_offset, %rip, -8 */
        /* The following ordering must match tcg_target_callee_save_regs.  */
        0x86, 2,                        /* DW_CFA_offset, %rbp, -16 */
        0x83, 3,                        /* DW_CFA_offset, %rbx, -24 */
        0x8c, 4,                        /* DW_CFA_offset, %r12, -32 */
        0x8d, 5,                        /* DW_CFA_offset, %r13, -40 */
        0x8e, 6,                        /* DW_CFA_offset, %r14, -48 */
        0x8f, 7,                        /* DW_CFA_offset, %r15, -56 */
    }
};
#else
#define ELF_HOST_MACHINE EM_386
/* DWARF CFI for the 32-bit prologue: CFA is %esp + FRAME_SIZE; uses only
   10 of the 14 fde_reg_ofs bytes (the rest are DW_CFA_nop).  */
static const DebugFrame debug_frame = {
    .h.cie.len = sizeof(DebugFrameCIE)-4, /* length after .len member */
    .h.cie.id = -1,
    .h.cie.version = 1,
    .h.cie.code_align = 1,
    .h.cie.data_align = 0x7c,             /* sleb128 -4 */
    .h.cie.return_column = 8,

    /* Total FDE size does not include the "len" member.  */
    .h.fde.len = sizeof(DebugFrame) - offsetof(DebugFrame, h.fde.cie_offset),

    .fde_def_cfa = {
        12, 4,                          /* DW_CFA_def_cfa %esp, ... */
        (FRAME_SIZE & 0x7f) | 0x80,     /* ... uleb128 FRAME_SIZE */
        (FRAME_SIZE >> 7)
    },
    .fde_reg_ofs = {
        0x88, 1,                        /* DW_CFA_offset, %eip, -4 */
        /* The following ordering must match tcg_target_callee_save_regs.  */
        0x85, 2,                        /* DW_CFA_offset, %ebp, -8 */
        0x83, 3,                        /* DW_CFA_offset, %ebx, -12 */
        0x86, 4,                        /* DW_CFA_offset, %esi, -16 */
        0x87, 5,                        /* DW_CFA_offset, %edi, -20 */
    }
};
#endif
3878
#if defined(ELF_HOST_MACHINE)
/* Hand the generated code buffer, together with the unwind info above,
   to the common JIT registration code (for debugger support).  */
void tcg_register_jit(const void *buf, size_t buf_size)
{
    tcg_register_jit_int(buf, buf_size, &debug_frame, sizeof(debug_frame));
}
#endif
3885