1 /*
2  *    Stack-less Just-In-Time compiler
3  *
4  *    Copyright Zoltan Herczeg (hzmester@freemail.hu). All rights reserved.
5  *
6  * Redistribution and use in source and binary forms, with or without modification, are
7  * permitted provided that the following conditions are met:
8  *
9  *   1. Redistributions of source code must retain the above copyright notice, this list of
10  *      conditions and the following disclaimer.
11  *
12  *   2. Redistributions in binary form must reproduce the above copyright notice, this list
13  *      of conditions and the following disclaimer in the documentation and/or other materials
14  *      provided with the distribution.
15  *
16  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDER(S) AND CONTRIBUTORS ``AS IS'' AND ANY
17  * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
18  * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT
19  * SHALL THE COPYRIGHT HOLDER(S) OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
20  * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
21  * TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
22  * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
23  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
24  * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
25  */
26 
27 SLJIT_API_FUNC_ATTRIBUTE const char* sljit_get_platform_name(void)
28 {
29 #if (defined SLJIT_X86_32_FASTCALL && SLJIT_X86_32_FASTCALL)
30 	return "x86" SLJIT_CPUINFO " ABI:fastcall";
31 #else
32 	return "x86" SLJIT_CPUINFO;
33 #endif
34 }
35 
36 /*
37    32b register indexes:
38      0 - EAX
39      1 - ECX
40      2 - EDX
41      3 - EBX
42      4 - ESP
43      5 - EBP
44      6 - ESI
45      7 - EDI
46 */
47 
48 /*
49    64b register indexes:
50      0 - RAX
51      1 - RCX
52      2 - RDX
53      3 - RBX
54      4 - RSP
55      5 - RBP
56      6 - RSI
57      7 - RDI
58      8 - R8   - From now on REX prefix is required
59      9 - R9
60     10 - R10
61     11 - R11
62     12 - R12
63     13 - R13
64     14 - R14
65     15 - R15
66 */
67 
68 #if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
69 
70 /* Last register + 1. */
71 #define TMP_REG1	(SLJIT_NUMBER_OF_REGISTERS + 2)
72 
73 static const sljit_u8 reg_map[SLJIT_NUMBER_OF_REGISTERS + 3] = {
74 	0, 0, 2, 1, 0, 0, 0, 0, 0, 0, 7, 6, 3, 4, 5
75 };
76 
77 #define CHECK_EXTRA_REGS(p, w, do) \
78 	if (p >= SLJIT_R3 && p <= SLJIT_S3) { \
79 		if (p <= compiler->scratches) \
80 			w = compiler->saveds_offset - ((p) - SLJIT_R2) * (sljit_sw)sizeof(sljit_sw); \
81 		else \
82 			w = compiler->locals_offset + ((p) - SLJIT_S2) * (sljit_sw)sizeof(sljit_sw); \
83 		p = SLJIT_MEM1(SLJIT_SP); \
84 		do; \
85 	}
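/* Illustrative note: a caller rewrites its operand in place, e.g.
     CHECK_EXTRA_REGS(dst, dstw, dst_is_ereg = 1);
   so that an access to one of the stack-resident (virtual) registers becomes
   a SLJIT_MEM1(SLJIT_SP) access with the proper offset (see sljit_emit_op1). */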
86 
87 #else /* SLJIT_CONFIG_X86_32 */
88 
89 /* Last register + 1. */
90 #define TMP_REG1	(SLJIT_NUMBER_OF_REGISTERS + 2)
91 #define TMP_REG2	(SLJIT_NUMBER_OF_REGISTERS + 3)
92 
93 /* Note: r12 & 0x7 == 0b100, which is decoded as a SIB byte being present.
94    Note: avoid using r12 and r13 for memory addressing;
95    therefore r12 is better used as a higher saved register. */
96 #ifndef _WIN64
97 /* Args: rdi(=7), rsi(=6), rdx(=2), rcx(=1), r8, r9. Scratches: rax(=0), r10, r11 */
98 static const sljit_u8 reg_map[SLJIT_NUMBER_OF_REGISTERS + 4] = {
99 	0, 0, 6, 7, 1, 8, 11, 10, 12, 5, 13, 14, 15, 3, 4, 2, 9
100 };
101 /* low-map. reg_map & 0x7. */
102 static const sljit_u8 reg_lmap[SLJIT_NUMBER_OF_REGISTERS + 4] = {
103 	0, 0, 6, 7, 1, 0, 3,  2,  4,  5,  5,  6,  7, 3, 4, 2, 1
104 };
105 #else
106 /* Args: rcx(=1), rdx(=2), r8, r9. Scratches: rax(=0), r10, r11 */
107 static const sljit_u8 reg_map[SLJIT_NUMBER_OF_REGISTERS + 4] = {
108 	0, 0, 2, 8, 1, 11, 12, 5, 13, 14, 15, 7, 6, 3, 4, 9, 10
109 };
110 /* low-map. reg_map & 0x7. */
111 static const sljit_u8 reg_lmap[SLJIT_NUMBER_OF_REGISTERS + 4] = {
112 	0, 0, 2, 0, 1,  3,  4, 5,  5,  6,  7, 7, 6, 3, 4, 1,  2
113 };
114 #endif
115 
116 /* Args: xmm0-xmm3 */
117 static const sljit_u8 freg_map[SLJIT_NUMBER_OF_FLOAT_REGISTERS + 1] = {
118 	4, 0, 1, 2, 3, 5, 6
119 };
120 /* low-map. freg_map & 0x7. */
121 static const sljit_u8 freg_lmap[SLJIT_NUMBER_OF_FLOAT_REGISTERS + 1] = {
122 	4, 0, 1, 2, 3, 5, 6
123 };
124 
125 #define REX_W		0x48
126 #define REX_R		0x44
127 #define REX_X		0x42
128 #define REX_B		0x41
129 #define REX		0x40
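/* Reminder on these values: REX (0x40) is the prefix base; REX_W selects a
   64-bit operand size, while REX_R, REX_X and REX_B extend the ModRM reg
   field, the SIB index and the ModRM rm/base field so that r8-r15 can be
   encoded. For example, "mov rax, r8" is REX_W|REX_B (0x49), 0x8b, 0xc0. */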
130 
131 #ifndef _WIN64
132 #define HALFWORD_MAX 0x7fffffffl
133 #define HALFWORD_MIN -0x80000000l
134 #else
135 #define HALFWORD_MAX 0x7fffffffll
136 #define HALFWORD_MIN -0x80000000ll
137 #endif
138 
139 #define IS_HALFWORD(x)		((x) <= HALFWORD_MAX && (x) >= HALFWORD_MIN)
140 #define NOT_HALFWORD(x)		((x) > HALFWORD_MAX || (x) < HALFWORD_MIN)
141 
142 #define CHECK_EXTRA_REGS(p, w, do)
143 
144 #endif /* SLJIT_CONFIG_X86_32 */
145 
146 #define TMP_FREG	(0)
147 
148 /* Size flags for emit_x86_instruction: */
149 #define EX86_BIN_INS		0x0010
150 #define EX86_SHIFT_INS		0x0020
151 #define EX86_REX		0x0040
152 #define EX86_NO_REXW		0x0080
153 #define EX86_BYTE_ARG		0x0100
154 #define EX86_HALF_ARG		0x0200
155 #define EX86_PREF_66		0x0400
156 #define EX86_PREF_F2		0x0800
157 #define EX86_PREF_F3		0x1000
158 #define EX86_SSE2_OP1		0x2000
159 #define EX86_SSE2_OP2		0x4000
160 #define EX86_SSE2		(EX86_SSE2_OP1 | EX86_SSE2_OP2)
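/* These flags are OR-ed into the size argument of emit_x86_instruction();
   for example BINARY_IMM below passes 1 | EX86_BIN_INS together with a
   SLJIT_IMM operand so that the 0x81/0x83 immediate group can be selected. */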
161 
162 /* --------------------------------------------------------------------- */
163 /*  Instruction forms                                                    */
164 /* --------------------------------------------------------------------- */
165 
166 #define ADD		(/* BINARY */ 0 << 3)
167 #define ADD_EAX_i32	0x05
168 #define ADD_r_rm	0x03
169 #define ADD_rm_r	0x01
170 #define ADDSD_x_xm	0x58
171 #define ADC		(/* BINARY */ 2 << 3)
172 #define ADC_EAX_i32	0x15
173 #define ADC_r_rm	0x13
174 #define ADC_rm_r	0x11
175 #define AND		(/* BINARY */ 4 << 3)
176 #define AND_EAX_i32	0x25
177 #define AND_r_rm	0x23
178 #define AND_rm_r	0x21
179 #define ANDPD_x_xm	0x54
180 #define BSR_r_rm	(/* GROUP_0F */ 0xbd)
181 #define CALL_i32	0xe8
182 #define CALL_rm		(/* GROUP_FF */ 2 << 3)
183 #define CDQ		0x99
184 #define CMOVE_r_rm	(/* GROUP_0F */ 0x44)
185 #define CMP		(/* BINARY */ 7 << 3)
186 #define CMP_EAX_i32	0x3d
187 #define CMP_r_rm	0x3b
188 #define CMP_rm_r	0x39
189 #define CVTPD2PS_x_xm	0x5a
190 #define CVTSI2SD_x_rm	0x2a
191 #define CVTTSD2SI_r_xm	0x2c
192 #define DIV		(/* GROUP_F7 */ 6 << 3)
193 #define DIVSD_x_xm	0x5e
194 #define FSTPS		0xd9
195 #define FSTPD		0xdd
196 #define INT3		0xcc
197 #define IDIV		(/* GROUP_F7 */ 7 << 3)
198 #define IMUL		(/* GROUP_F7 */ 5 << 3)
199 #define IMUL_r_rm	(/* GROUP_0F */ 0xaf)
200 #define IMUL_r_rm_i8	0x6b
201 #define IMUL_r_rm_i32	0x69
202 #define JE_i8		0x74
203 #define JNE_i8		0x75
204 #define JMP_i8		0xeb
205 #define JMP_i32		0xe9
206 #define JMP_rm		(/* GROUP_FF */ 4 << 3)
207 #define LEA_r_m		0x8d
208 #define MOV_r_rm	0x8b
209 #define MOV_r_i32	0xb8
210 #define MOV_rm_r	0x89
211 #define MOV_rm_i32	0xc7
212 #define MOV_rm8_i8	0xc6
213 #define MOV_rm8_r8	0x88
214 #define MOVSD_x_xm	0x10
215 #define MOVSD_xm_x	0x11
216 #define MOVSXD_r_rm	0x63
217 #define MOVSX_r_rm8	(/* GROUP_0F */ 0xbe)
218 #define MOVSX_r_rm16	(/* GROUP_0F */ 0xbf)
219 #define MOVZX_r_rm8	(/* GROUP_0F */ 0xb6)
220 #define MOVZX_r_rm16	(/* GROUP_0F */ 0xb7)
221 #define MUL		(/* GROUP_F7 */ 4 << 3)
222 #define MULSD_x_xm	0x59
223 #define NEG_rm		(/* GROUP_F7 */ 3 << 3)
224 #define NOP		0x90
225 #define NOT_rm		(/* GROUP_F7 */ 2 << 3)
226 #define OR		(/* BINARY */ 1 << 3)
227 #define OR_r_rm		0x0b
228 #define OR_EAX_i32	0x0d
229 #define OR_rm_r		0x09
230 #define OR_rm8_r8	0x08
231 #define POP_r		0x58
232 #define POP_rm		0x8f
233 #define POPF		0x9d
234 #define PREFETCH	0x18
235 #define PUSH_i32	0x68
236 #define PUSH_r		0x50
237 #define PUSH_rm		(/* GROUP_FF */ 6 << 3)
238 #define PUSHF		0x9c
239 #define RET_near	0xc3
240 #define RET_i16		0xc2
241 #define SBB		(/* BINARY */ 3 << 3)
242 #define SBB_EAX_i32	0x1d
243 #define SBB_r_rm	0x1b
244 #define SBB_rm_r	0x19
245 #define SAR		(/* SHIFT */ 7 << 3)
246 #define SHL		(/* SHIFT */ 4 << 3)
247 #define SHR		(/* SHIFT */ 5 << 3)
248 #define SUB		(/* BINARY */ 5 << 3)
249 #define SUB_EAX_i32	0x2d
250 #define SUB_r_rm	0x2b
251 #define SUB_rm_r	0x29
252 #define SUBSD_x_xm	0x5c
253 #define TEST_EAX_i32	0xa9
254 #define TEST_rm_r	0x85
255 #define UCOMISD_x_xm	0x2e
256 #define UNPCKLPD_x_xm	0x14
257 #define XCHG_EAX_r	0x90
258 #define XCHG_r_rm	0x87
259 #define XOR		(/* BINARY */ 6 << 3)
260 #define XOR_EAX_i32	0x35
261 #define XOR_r_rm	0x33
262 #define XOR_rm_r	0x31
263 #define XORPD_x_xm	0x57
264 
265 #define GROUP_0F	0x0f
266 #define GROUP_F7	0xf7
267 #define GROUP_FF	0xff
268 #define GROUP_BINARY_81	0x81
269 #define GROUP_BINARY_83	0x83
270 #define GROUP_SHIFT_1	0xd1
271 #define GROUP_SHIFT_N	0xc1
272 #define GROUP_SHIFT_CL	0xd3
273 
274 #define MOD_REG		0xc0
275 #define MOD_DISP8	0x40
276 
277 #define INC_SIZE(s)			(*inst++ = (s), compiler->size += (s))
278 
279 #define PUSH_REG(r)			(*inst++ = (PUSH_r + (r)))
280 #define POP_REG(r)			(*inst++ = (POP_r + (r)))
281 #define RET()				(*inst++ = (RET_near))
282 #define RET_I16(n)			(*inst++ = (RET_i16), *inst++ = n, *inst++ = 0)
283 /* r32, r/m32 */
284 #define MOV_RM(mod, reg, rm)		(*inst++ = (MOV_r_rm), *inst++ = (mod) << 6 | (reg) << 3 | (rm))
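/* ModRM layout reminder: bits 7-6 hold the mod field, bits 5-3 the reg field
   and bits 2-0 the rm field. MOD_REG (0xc0) selects register-direct operands
   and MOD_DISP8 (0x40) a base register with an 8-bit displacement; e.g.
   MOV_RM(MOD_REG, reg_map[dst], reg_map[src]) emits a register-to-register
   mov of src into dst. */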
285 
286 /* Multithreading does not affect these static variables, since they only store
287    built-in CPU features. Therefore it is harmless if different threads overwrite
288    them while detecting the CPU features at the same time. */
289 #if (defined SLJIT_DETECT_SSE2 && SLJIT_DETECT_SSE2)
290 static sljit_s32 cpu_has_sse2 = -1;
291 #endif
292 static sljit_s32 cpu_has_cmov = -1;
293 
294 #ifdef _WIN32_WCE
295 #include <cmnintrin.h>
296 #elif defined(_MSC_VER) && _MSC_VER >= 1400
297 #include <intrin.h>
298 #endif
299 
300 /******************************************************/
301 /*    Unaligned-store functions                       */
302 /******************************************************/
303 
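/* Note: these helpers copy through SLJIT_MEMCPY instead of dereferencing a
   possibly misaligned pointer; compilers typically fold the fixed-size copy
   into a single unaligned store, so no alignment assumptions are violated. */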
304 static SLJIT_INLINE void sljit_unaligned_store_s16(void *addr, sljit_s16 value)
305 {
306 	SLJIT_MEMCPY(addr, &value, sizeof(value));
307 }
308 
309 static SLJIT_INLINE void sljit_unaligned_store_s32(void *addr, sljit_s32 value)
310 {
311 	SLJIT_MEMCPY(addr, &value, sizeof(value));
312 }
313 
314 static SLJIT_INLINE void sljit_unaligned_store_sw(void *addr, sljit_sw value)
315 {
316 	SLJIT_MEMCPY(addr, &value, sizeof(value));
317 }
318 
319 /******************************************************/
320 /*    Utility functions                               */
321 /******************************************************/
322 
323 static void get_cpu_features(void)
324 {
325 	sljit_u32 features;
326 
327 #if defined(_MSC_VER) && _MSC_VER >= 1400
328 
329 	int CPUInfo[4];
330 	__cpuid(CPUInfo, 1);
331 	features = (sljit_u32)CPUInfo[3];
332 
333 #elif defined(__GNUC__) || defined(__INTEL_COMPILER) || defined(__SUNPRO_C)
334 
335 	/* AT&T syntax. */
336 	__asm__ (
337 		"movl $0x1, %%eax\n"
338 #if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
339 		/* On x86-32, there is no red zone, so this
340 		   should work (no need for a local variable). */
341 		"push %%ebx\n"
342 #endif
343 		"cpuid\n"
344 #if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
345 		"pop %%ebx\n"
346 #endif
347 		"movl %%edx, %0\n"
348 		: "=g" (features)
349 		:
350 #if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
351 		: "%eax", "%ecx", "%edx"
352 #else
353 		: "%rax", "%rbx", "%rcx", "%rdx"
354 #endif
355 	);
356 
357 #else /* _MSC_VER && _MSC_VER >= 1400 */
358 
359 	/* Intel syntax. */
360 	__asm {
361 		mov eax, 1
362 		cpuid
363 		mov features, edx
364 	}
365 
366 #endif /* _MSC_VER && _MSC_VER >= 1400 */
367 
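	/* CPUID leaf 1: EDX bit 26 reports SSE2 support and bit 15 reports CMOV. */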
368 #if (defined SLJIT_DETECT_SSE2 && SLJIT_DETECT_SSE2)
369 	cpu_has_sse2 = (features >> 26) & 0x1;
370 #endif
371 	cpu_has_cmov = (features >> 15) & 0x1;
372 }
373 
374 static sljit_u8 get_jump_code(sljit_s32 type)
375 {
376 	switch (type) {
377 	case SLJIT_EQUAL:
378 	case SLJIT_EQUAL_F64:
379 		return 0x84 /* je */;
380 
381 	case SLJIT_NOT_EQUAL:
382 	case SLJIT_NOT_EQUAL_F64:
383 		return 0x85 /* jne */;
384 
385 	case SLJIT_LESS:
386 	case SLJIT_LESS_F64:
387 		return 0x82 /* jc */;
388 
389 	case SLJIT_GREATER_EQUAL:
390 	case SLJIT_GREATER_EQUAL_F64:
391 		return 0x83 /* jae */;
392 
393 	case SLJIT_GREATER:
394 	case SLJIT_GREATER_F64:
395 		return 0x87 /* jnbe */;
396 
397 	case SLJIT_LESS_EQUAL:
398 	case SLJIT_LESS_EQUAL_F64:
399 		return 0x86 /* jbe */;
400 
401 	case SLJIT_SIG_LESS:
402 		return 0x8c /* jl */;
403 
404 	case SLJIT_SIG_GREATER_EQUAL:
405 		return 0x8d /* jnl */;
406 
407 	case SLJIT_SIG_GREATER:
408 		return 0x8f /* jnle */;
409 
410 	case SLJIT_SIG_LESS_EQUAL:
411 		return 0x8e /* jle */;
412 
413 	case SLJIT_OVERFLOW:
414 	case SLJIT_MUL_OVERFLOW:
415 		return 0x80 /* jo */;
416 
417 	case SLJIT_NOT_OVERFLOW:
418 	case SLJIT_MUL_NOT_OVERFLOW:
419 		return 0x81 /* jno */;
420 
421 	case SLJIT_UNORDERED_F64:
422 		return 0x8a /* jp */;
423 
424 	case SLJIT_ORDERED_F64:
425 		return 0x8b /* jpo */;
426 	}
427 	return 0;
428 }
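/* Note: the values returned above are the long-form (0x0f 0x8x) conditional
   jump opcodes; the short (rel8) 0x7x forms are obtained by subtracting 0x10,
   as generate_near_jump_code() and adjust_shadow_stack() do. */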
429 
430 #if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
431 static sljit_u8* generate_far_jump_code(struct sljit_jump *jump, sljit_u8 *code_ptr, sljit_sw executable_offset);
432 #else
433 static sljit_u8* generate_far_jump_code(struct sljit_jump *jump, sljit_u8 *code_ptr);
434 static sljit_u8* generate_put_label_code(struct sljit_put_label *put_label, sljit_u8 *code_ptr, sljit_uw max_label);
435 #endif
436 
437 static sljit_u8* generate_near_jump_code(struct sljit_jump *jump, sljit_u8 *code_ptr, sljit_u8 *code, sljit_sw executable_offset)
438 {
439 	sljit_s32 type = jump->flags >> TYPE_SHIFT;
440 	sljit_s32 short_jump;
441 	sljit_uw label_addr;
442 
443 	if (jump->flags & JUMP_LABEL)
444 		label_addr = (sljit_uw)(code + jump->u.label->size);
445 	else
446 		label_addr = jump->u.target - executable_offset;
447 
448 	short_jump = (sljit_sw)(label_addr - (jump->addr + 2)) >= -128 && (sljit_sw)(label_addr - (jump->addr + 2)) <= 127;
449 
450 #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
451 	if ((sljit_sw)(label_addr - (jump->addr + 1)) > HALFWORD_MAX || (sljit_sw)(label_addr - (jump->addr + 1)) < HALFWORD_MIN)
452 		return generate_far_jump_code(jump, code_ptr);
453 #endif
454 
455 	if (type == SLJIT_JUMP) {
456 		if (short_jump)
457 			*code_ptr++ = JMP_i8;
458 		else
459 			*code_ptr++ = JMP_i32;
460 		jump->addr++;
461 	}
462 	else if (type >= SLJIT_FAST_CALL) {
463 		short_jump = 0;
464 		*code_ptr++ = CALL_i32;
465 		jump->addr++;
466 	}
467 	else if (short_jump) {
468 		*code_ptr++ = get_jump_code(type) - 0x10;
469 		jump->addr++;
470 	}
471 	else {
472 		*code_ptr++ = GROUP_0F;
473 		*code_ptr++ = get_jump_code(type);
474 		jump->addr += 2;
475 	}
476 
477 	if (short_jump) {
478 		jump->flags |= PATCH_MB;
479 		code_ptr += sizeof(sljit_s8);
480 	} else {
481 		jump->flags |= PATCH_MW;
482 		code_ptr += sizeof(sljit_s32);
483 	}
484 
485 	return code_ptr;
486 }
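/* Size note: a short jump is 2 bytes (opcode + rel8), an unconditional near
   jump is 5 bytes (0xe9 + rel32) and a conditional near jump is 6 bytes
   (0x0f, 0x8x + rel32). The rel8/rel32 field itself is filled in later, once
   the final label addresses are known (see the PATCH_MB/PATCH_MW handling in
   sljit_generate_code). */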
487 
488 SLJIT_API_FUNC_ATTRIBUTE void* sljit_generate_code(struct sljit_compiler *compiler)
489 {
490 	struct sljit_memory_fragment *buf;
491 	sljit_u8 *code;
492 	sljit_u8 *code_ptr;
493 	sljit_u8 *buf_ptr;
494 	sljit_u8 *buf_end;
495 	sljit_u8 len;
496 	sljit_sw executable_offset;
497 	sljit_sw jump_addr;
498 
499 	struct sljit_label *label;
500 	struct sljit_jump *jump;
501 	struct sljit_const *const_;
502 	struct sljit_put_label *put_label;
503 
504 	CHECK_ERROR_PTR();
505 	CHECK_PTR(check_sljit_generate_code(compiler));
506 	reverse_buf(compiler);
507 
508 	/* Second code generation pass. */
509 	code = (sljit_u8*)SLJIT_MALLOC_EXEC(compiler->size, compiler->exec_allocator_data);
510 	PTR_FAIL_WITH_EXEC_IF(code);
511 	buf = compiler->buf;
512 
513 	code_ptr = code;
514 	label = compiler->labels;
515 	jump = compiler->jumps;
516 	const_ = compiler->consts;
517 	put_label = compiler->put_labels;
518 	executable_offset = SLJIT_EXEC_OFFSET(code);
519 
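	/* Each record in the buffers is either a length-prefixed chunk of already
	   generated machine code (len > 0) or a zero length byte followed by a
	   tag byte: 0 = label, 1 = jump, 2 = const, 3 = put_label. */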
520 	do {
521 		buf_ptr = buf->memory;
522 		buf_end = buf_ptr + buf->used_size;
523 		do {
524 			len = *buf_ptr++;
525 			if (len > 0) {
526 				/* The code is already generated. */
527 				SLJIT_MEMCPY(code_ptr, buf_ptr, len);
528 				code_ptr += len;
529 				buf_ptr += len;
530 			}
531 			else {
532 				switch (*buf_ptr) {
533 				case 0:
534 					label->addr = (sljit_uw)SLJIT_ADD_EXEC_OFFSET(code_ptr, executable_offset);
535 					label->size = code_ptr - code;
536 					label = label->next;
537 					break;
538 				case 1:
539 					jump->addr = (sljit_uw)code_ptr;
540 					if (!(jump->flags & SLJIT_REWRITABLE_JUMP))
541 						code_ptr = generate_near_jump_code(jump, code_ptr, code, executable_offset);
542 					else {
543 #if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
544 						code_ptr = generate_far_jump_code(jump, code_ptr, executable_offset);
545 #else
546 						code_ptr = generate_far_jump_code(jump, code_ptr);
547 #endif
548 					}
549 					jump = jump->next;
550 					break;
551 				case 2:
552 					const_->addr = ((sljit_uw)code_ptr) - sizeof(sljit_sw);
553 					const_ = const_->next;
554 					break;
555 				default:
556 					SLJIT_ASSERT(*buf_ptr == 3);
557 					SLJIT_ASSERT(put_label->label);
558 					put_label->addr = (sljit_uw)code_ptr;
559 #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
560 					code_ptr = generate_put_label_code(put_label, code_ptr, (sljit_uw)SLJIT_ADD_EXEC_OFFSET(code, executable_offset) + put_label->label->size);
561 #endif
562 					put_label = put_label->next;
563 					break;
564 				}
565 				buf_ptr++;
566 			}
567 		} while (buf_ptr < buf_end);
568 		SLJIT_ASSERT(buf_ptr == buf_end);
569 		buf = buf->next;
570 	} while (buf);
571 
572 	SLJIT_ASSERT(!label);
573 	SLJIT_ASSERT(!jump);
574 	SLJIT_ASSERT(!const_);
575 	SLJIT_ASSERT(!put_label);
576 	SLJIT_ASSERT(code_ptr <= code + compiler->size);
577 
578 	jump = compiler->jumps;
579 	while (jump) {
580 		jump_addr = jump->addr + executable_offset;
581 
582 		if (jump->flags & PATCH_MB) {
583 			SLJIT_ASSERT((sljit_sw)(jump->u.label->addr - (jump_addr + sizeof(sljit_s8))) >= -128 && (sljit_sw)(jump->u.label->addr - (jump_addr + sizeof(sljit_s8))) <= 127);
584 			*(sljit_u8*)jump->addr = (sljit_u8)(jump->u.label->addr - (jump_addr + sizeof(sljit_s8)));
585 		} else if (jump->flags & PATCH_MW) {
586 			if (jump->flags & JUMP_LABEL) {
587 #if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
588 				sljit_unaligned_store_sw((void*)jump->addr, (sljit_sw)(jump->u.label->addr - (jump_addr + sizeof(sljit_sw))));
589 #else
590 				SLJIT_ASSERT((sljit_sw)(jump->u.label->addr - (jump_addr + sizeof(sljit_s32))) >= HALFWORD_MIN && (sljit_sw)(jump->u.label->addr - (jump_addr + sizeof(sljit_s32))) <= HALFWORD_MAX);
591 				sljit_unaligned_store_s32((void*)jump->addr, (sljit_s32)(jump->u.label->addr - (jump_addr + sizeof(sljit_s32))));
592 #endif
593 			}
594 			else {
595 #if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
596 				sljit_unaligned_store_sw((void*)jump->addr, (sljit_sw)(jump->u.target - (jump_addr + sizeof(sljit_sw))));
597 #else
598 				SLJIT_ASSERT((sljit_sw)(jump->u.target - (jump_addr + sizeof(sljit_s32))) >= HALFWORD_MIN && (sljit_sw)(jump->u.target - (jump_addr + sizeof(sljit_s32))) <= HALFWORD_MAX);
599 				sljit_unaligned_store_s32((void*)jump->addr, (sljit_s32)(jump->u.target - (jump_addr + sizeof(sljit_s32))));
600 #endif
601 			}
602 		}
603 #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
604 		else if (jump->flags & PATCH_MD)
605 			sljit_unaligned_store_sw((void*)jump->addr, jump->u.label->addr);
606 #endif
607 
608 		jump = jump->next;
609 	}
610 
611 	put_label = compiler->put_labels;
612 	while (put_label) {
613 #if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
614 		sljit_unaligned_store_sw((void*)(put_label->addr - sizeof(sljit_sw)), (sljit_sw)put_label->label->addr);
615 #else
616 		if (put_label->flags & PATCH_MD) {
617 			SLJIT_ASSERT(put_label->label->addr > HALFWORD_MAX);
618 			sljit_unaligned_store_sw((void*)(put_label->addr - sizeof(sljit_sw)), (sljit_sw)put_label->label->addr);
619 		}
620 		else {
621 			SLJIT_ASSERT(put_label->label->addr <= HALFWORD_MAX);
622 			sljit_unaligned_store_s32((void*)(put_label->addr - sizeof(sljit_s32)), (sljit_s32)put_label->label->addr);
623 		}
624 #endif
625 
626 		put_label = put_label->next;
627 	}
628 
629 	compiler->error = SLJIT_ERR_COMPILED;
630 	compiler->executable_offset = executable_offset;
631 	compiler->executable_size = code_ptr - code;
632 
633 	code = (sljit_u8*)SLJIT_ADD_EXEC_OFFSET(code, executable_offset);
634 
635 	SLJIT_UPDATE_WX_FLAGS(code, (sljit_u8*)SLJIT_ADD_EXEC_OFFSET(code_ptr, executable_offset), 1);
636 	return (void*)code;
637 }
638 
639 SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_has_cpu_feature(sljit_s32 feature_type)
640 {
641 	switch (feature_type) {
642 	case SLJIT_HAS_FPU:
643 #ifdef SLJIT_IS_FPU_AVAILABLE
644 		return SLJIT_IS_FPU_AVAILABLE;
645 #elif (defined SLJIT_DETECT_SSE2 && SLJIT_DETECT_SSE2)
646 		if (cpu_has_sse2 == -1)
647 			get_cpu_features();
648 		return cpu_has_sse2;
649 #else /* SLJIT_DETECT_SSE2 */
650 		return 1;
651 #endif /* SLJIT_DETECT_SSE2 */
652 
653 #if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
654 	case SLJIT_HAS_VIRTUAL_REGISTERS:
655 		return 1;
656 #endif
657 
658 	case SLJIT_HAS_CLZ:
659 	case SLJIT_HAS_CMOV:
660 		if (cpu_has_cmov == -1)
661 			get_cpu_features();
662 		return cpu_has_cmov;
663 
664 	case SLJIT_HAS_PREFETCH:
665 		return 1;
666 
667 	case SLJIT_HAS_SSE2:
668 #if (defined SLJIT_DETECT_SSE2 && SLJIT_DETECT_SSE2)
669 		if (cpu_has_sse2 == -1)
670 			get_cpu_features();
671 		return cpu_has_sse2;
672 #else
673 		return 1;
674 #endif
675 
676 	default:
677 		return 0;
678 	}
679 }
680 
681 /* --------------------------------------------------------------------- */
682 /*  Operators                                                            */
683 /* --------------------------------------------------------------------- */
684 
685 #define BINARY_OPCODE(opcode) (((opcode ## _EAX_i32) << 24) | ((opcode ## _r_rm) << 16) | ((opcode ## _rm_r) << 8) | (opcode))
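/* BINARY_OPCODE packs four encodings of the same operation into one word,
   unpacked again by emit_cum_binary()/emit_non_cum_binary(): byte 3 is the
   EAX,imm32 short form, byte 2 the reg,r/m form, byte 1 the r/m,reg form and
   byte 0 the /digit used with the 0x81/0x83 immediate group. For instance,
   BINARY_OPCODE(ADD) evaluates to 0x05030100. */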
686 
687 static sljit_s32 emit_cum_binary(struct sljit_compiler *compiler,
688 	sljit_u32 op_types,
689 	sljit_s32 dst, sljit_sw dstw,
690 	sljit_s32 src1, sljit_sw src1w,
691 	sljit_s32 src2, sljit_sw src2w);
692 
693 static sljit_s32 emit_non_cum_binary(struct sljit_compiler *compiler,
694 	sljit_u32 op_types,
695 	sljit_s32 dst, sljit_sw dstw,
696 	sljit_s32 src1, sljit_sw src1w,
697 	sljit_s32 src2, sljit_sw src2w);
698 
699 static sljit_s32 emit_mov(struct sljit_compiler *compiler,
700 	sljit_s32 dst, sljit_sw dstw,
701 	sljit_s32 src, sljit_sw srcw);
702 
703 #define EMIT_MOV(compiler, dst, dstw, src, srcw) \
704 	FAIL_IF(emit_mov(compiler, dst, dstw, src, srcw));
705 
706 static SLJIT_INLINE sljit_s32 emit_sse2_store(struct sljit_compiler *compiler,
707 	sljit_s32 single, sljit_s32 dst, sljit_sw dstw, sljit_s32 src);
708 
709 static SLJIT_INLINE sljit_s32 emit_sse2_load(struct sljit_compiler *compiler,
710 	sljit_s32 single, sljit_s32 dst, sljit_s32 src, sljit_sw srcw);
711 
712 static sljit_s32 emit_cmp_binary(struct sljit_compiler *compiler,
713 	sljit_s32 src1, sljit_sw src1w,
714 	sljit_s32 src2, sljit_sw src2w);
715 
716 static SLJIT_INLINE sljit_s32 emit_endbranch(struct sljit_compiler *compiler)
717 {
718 #if (defined SLJIT_CONFIG_X86_CET && SLJIT_CONFIG_X86_CET)
719 	/* Emit endbr32/endbr64 when CET is enabled.  */
720 	sljit_u8 *inst;
721 	inst = (sljit_u8*)ensure_buf(compiler, 1 + 4);
722 	FAIL_IF(!inst);
723 	INC_SIZE(4);
724 	*inst++ = 0xf3;
725 	*inst++ = 0x0f;
726 	*inst++ = 0x1e;
727 #if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
728 	*inst = 0xfb;
729 #else
730 	*inst = 0xfa;
731 #endif
732 #else /* !SLJIT_CONFIG_X86_CET */
733 	SLJIT_UNUSED_ARG(compiler);
734 #endif /* SLJIT_CONFIG_X86_CET */
735 	return SLJIT_SUCCESS;
736 }
737 
738 #if (defined SLJIT_CONFIG_X86_CET && SLJIT_CONFIG_X86_CET) && defined (__SHSTK__)
739 
740 static SLJIT_INLINE sljit_s32 emit_rdssp(struct sljit_compiler *compiler, sljit_s32 reg)
741 {
742 	sljit_u8 *inst;
743 	sljit_s32 size;
744 
745 #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
746 	size = 5;
747 #else
748 	size = 4;
749 #endif
750 
751 	inst = (sljit_u8*)ensure_buf(compiler, 1 + size);
752 	FAIL_IF(!inst);
753 	INC_SIZE(size);
754 	*inst++ = 0xf3;
755 #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
756 	*inst++ = REX_W | (reg_map[reg] <= 7 ? 0 : REX_B);
757 #endif
758 	*inst++ = 0x0f;
759 	*inst++ = 0x1e;
760 	*inst = (0x3 << 6) | (0x1 << 3) | (reg_map[reg] & 0x7);
761 	return SLJIT_SUCCESS;
762 }
763 
764 static SLJIT_INLINE sljit_s32 emit_incssp(struct sljit_compiler *compiler, sljit_s32 reg)
765 {
766 	sljit_u8 *inst;
767 	sljit_s32 size;
768 
769 #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
770 	size = 5;
771 #else
772 	size = 4;
773 #endif
774 
775 	inst = (sljit_u8*)ensure_buf(compiler, 1 + size);
776 	FAIL_IF(!inst);
777 	INC_SIZE(size);
778 	*inst++ = 0xf3;
779 #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
780 	*inst++ = REX_W | (reg_map[reg] <= 7 ? 0 : REX_B);
781 #endif
782 	*inst++ = 0x0f;
783 	*inst++ = 0xae;
784 	*inst = (0x3 << 6) | (0x5 << 3) | (reg_map[reg] & 0x7);
785 	return SLJIT_SUCCESS;
786 }
787 
788 #endif /* SLJIT_CONFIG_X86_CET && __SHSTK__ */
789 
790 static SLJIT_INLINE sljit_s32 cpu_has_shadow_stack(void)
791 {
792 #if (defined SLJIT_CONFIG_X86_CET && SLJIT_CONFIG_X86_CET) && defined (__SHSTK__)
793 	return _get_ssp() != 0;
794 #else /* !SLJIT_CONFIG_X86_CET || !__SHSTK__ */
795 	return 0;
796 #endif /* SLJIT_CONFIG_X86_CET && __SHSTK__ */
797 }
798 
799 static SLJIT_INLINE sljit_s32 adjust_shadow_stack(struct sljit_compiler *compiler,
800 	sljit_s32 src, sljit_sw srcw, sljit_s32 base, sljit_sw disp)
801 {
802 #if (defined SLJIT_CONFIG_X86_CET && SLJIT_CONFIG_X86_CET) && defined (__SHSTK__)
803 	sljit_u8 *inst, *jz_after_cmp_inst;
804 	sljit_uw size_jz_after_cmp_inst;
805 
806 	sljit_uw size_before_rdssp_inst = compiler->size;
807 
808 	/* Generate "RDSSP TMP_REG1". */
809 	FAIL_IF(emit_rdssp(compiler, TMP_REG1));
810 
811 	/* Load the return address stored on the shadow stack into TMP_REG1. */
812 #if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
813 	SLJIT_ASSERT(reg_map[TMP_REG1] == 5);
814 
815 	/* Hand code unsupported "mov 0x0(%ebp),%ebp". */
816 	inst = (sljit_u8*)ensure_buf(compiler, 1 + 3);
817 	FAIL_IF(!inst);
818 	INC_SIZE(3);
819 	*inst++ = 0x8b;
820 	*inst++ = 0x6d;
821 	*inst = 0;
822 #else /* !SLJIT_CONFIG_X86_32 */
823 	EMIT_MOV(compiler, TMP_REG1, 0, SLJIT_MEM1(TMP_REG1), 0);
824 #endif /* SLJIT_CONFIG_X86_32 */
825 
826 	if (src == SLJIT_UNUSED) {
827 		/* Return address is on stack.  */
828 		src = SLJIT_MEM1(base);
829 		srcw = disp;
830 	}
831 
832 	/* Compare return address against TMP_REG1. */
833 	FAIL_IF(emit_cmp_binary (compiler, TMP_REG1, 0, src, srcw));
834 
835 	/* Generate JZ to skip the shadow stack adjustment when the shadow
836 	   stack matches the normal stack. */
837 	inst = (sljit_u8*)ensure_buf(compiler, 1 + 2);
838 	FAIL_IF(!inst);
839 	INC_SIZE(2);
840 	*inst++ = get_jump_code(SLJIT_EQUAL) - 0x10;
841 	size_jz_after_cmp_inst = compiler->size;
842 	jz_after_cmp_inst = inst;
843 
844 #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
845 	/* REX_W is not necessary. */
846 	compiler->mode32 = 1;
847 #endif
848 	/* Load 1 into TMP_REG1. */
849 	EMIT_MOV(compiler, TMP_REG1, 0, SLJIT_IMM, 1);
850 
851 	/* Generate "INCSSP TMP_REG1". */
852 	FAIL_IF(emit_incssp(compiler, TMP_REG1));
853 
854 	/* Jump back to "RDSSP TMP_REG1" to check shadow stack again. */
855 	inst = (sljit_u8*)ensure_buf(compiler, 1 + 2);
856 	FAIL_IF(!inst);
857 	INC_SIZE(2);
858 	*inst++ = JMP_i8;
859 	*inst = size_before_rdssp_inst - compiler->size;
860 
861 	*jz_after_cmp_inst = compiler->size - size_jz_after_cmp_inst;
862 #else /* !SLJIT_CONFIG_X86_CET || !__SHSTK__ */
863 	SLJIT_UNUSED_ARG(compiler);
864 	SLJIT_UNUSED_ARG(src);
865 	SLJIT_UNUSED_ARG(srcw);
866 	SLJIT_UNUSED_ARG(base);
867 	SLJIT_UNUSED_ARG(disp);
868 #endif /* SLJIT_CONFIG_X86_CET && __SHSTK__ */
869 	return SLJIT_SUCCESS;
870 }
871 
872 #if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
873 #include "sljitNativeX86_32.c"
874 #else
875 #include "sljitNativeX86_64.c"
876 #endif
877 
878 static sljit_s32 emit_mov(struct sljit_compiler *compiler,
879 	sljit_s32 dst, sljit_sw dstw,
880 	sljit_s32 src, sljit_sw srcw)
881 {
882 	sljit_u8* inst;
883 
884 	SLJIT_ASSERT(dst != SLJIT_UNUSED);
885 
886 	if (FAST_IS_REG(src)) {
887 		inst = emit_x86_instruction(compiler, 1, src, 0, dst, dstw);
888 		FAIL_IF(!inst);
889 		*inst = MOV_rm_r;
890 		return SLJIT_SUCCESS;
891 	}
892 	if (src & SLJIT_IMM) {
893 		if (FAST_IS_REG(dst)) {
894 #if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
895 			return emit_do_imm(compiler, MOV_r_i32 + reg_map[dst], srcw);
896 #else
897 			if (!compiler->mode32) {
898 				if (NOT_HALFWORD(srcw))
899 					return emit_load_imm64(compiler, dst, srcw);
900 			}
901 			else
902 				return emit_do_imm32(compiler, (reg_map[dst] >= 8) ? REX_B : 0, MOV_r_i32 + reg_lmap[dst], srcw);
903 #endif
904 		}
905 #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
906 		if (!compiler->mode32 && NOT_HALFWORD(srcw)) {
907 			/* Immediate-to-memory move. Only the SLJIT_MOV operation copies
908 			   an immediate directly into memory, so TMP_REG1 can be used. */
909 			FAIL_IF(emit_load_imm64(compiler, TMP_REG1, srcw));
910 			inst = emit_x86_instruction(compiler, 1, TMP_REG1, 0, dst, dstw);
911 			FAIL_IF(!inst);
912 			*inst = MOV_rm_r;
913 			return SLJIT_SUCCESS;
914 		}
915 #endif
916 		inst = emit_x86_instruction(compiler, 1, SLJIT_IMM, srcw, dst, dstw);
917 		FAIL_IF(!inst);
918 		*inst = MOV_rm_i32;
919 		return SLJIT_SUCCESS;
920 	}
921 	if (FAST_IS_REG(dst)) {
922 		inst = emit_x86_instruction(compiler, 1, dst, 0, src, srcw);
923 		FAIL_IF(!inst);
924 		*inst = MOV_r_rm;
925 		return SLJIT_SUCCESS;
926 	}
927 
928 	/* Memory-to-memory move. Only the SLJIT_MOV operation copies
929 	   data from memory to memory, so TMP_REG1 can be used. */
930 	inst = emit_x86_instruction(compiler, 1, TMP_REG1, 0, src, srcw);
931 	FAIL_IF(!inst);
932 	*inst = MOV_r_rm;
933 	inst = emit_x86_instruction(compiler, 1, TMP_REG1, 0, dst, dstw);
934 	FAIL_IF(!inst);
935 	*inst = MOV_rm_r;
936 	return SLJIT_SUCCESS;
937 }
938 
939 SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_op0(struct sljit_compiler *compiler, sljit_s32 op)
940 {
941 	sljit_u8 *inst;
942 #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
943 	sljit_s32 size;
944 #endif
945 
946 	CHECK_ERROR();
947 	CHECK(check_sljit_emit_op0(compiler, op));
948 
949 	switch (GET_OPCODE(op)) {
950 	case SLJIT_BREAKPOINT:
951 		inst = (sljit_u8*)ensure_buf(compiler, 1 + 1);
952 		FAIL_IF(!inst);
953 		INC_SIZE(1);
954 		*inst = INT3;
955 		break;
956 	case SLJIT_NOP:
957 		inst = (sljit_u8*)ensure_buf(compiler, 1 + 1);
958 		FAIL_IF(!inst);
959 		INC_SIZE(1);
960 		*inst = NOP;
961 		break;
962 	case SLJIT_LMUL_UW:
963 	case SLJIT_LMUL_SW:
964 	case SLJIT_DIVMOD_UW:
965 	case SLJIT_DIVMOD_SW:
966 	case SLJIT_DIV_UW:
967 	case SLJIT_DIV_SW:
968 #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
969 #ifdef _WIN64
970 		SLJIT_ASSERT(
971 			reg_map[SLJIT_R0] == 0
972 			&& reg_map[SLJIT_R1] == 2
973 			&& reg_map[TMP_REG1] > 7);
974 #else
975 		SLJIT_ASSERT(
976 			reg_map[SLJIT_R0] == 0
977 			&& reg_map[SLJIT_R1] < 7
978 			&& reg_map[TMP_REG1] == 2);
979 #endif
980 		compiler->mode32 = op & SLJIT_I32_OP;
981 #endif
982 		SLJIT_COMPILE_ASSERT((SLJIT_DIVMOD_UW & 0x2) == 0 && SLJIT_DIV_UW - 0x2 == SLJIT_DIVMOD_UW, bad_div_opcode_assignments);
983 
984 		op = GET_OPCODE(op);
985 		if ((op | 0x2) == SLJIT_DIV_UW) {
986 #if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32) || defined(_WIN64)
987 			EMIT_MOV(compiler, TMP_REG1, 0, SLJIT_R1, 0);
988 			inst = emit_x86_instruction(compiler, 1, SLJIT_R1, 0, SLJIT_R1, 0);
989 #else
990 			inst = emit_x86_instruction(compiler, 1, TMP_REG1, 0, TMP_REG1, 0);
991 #endif
992 			FAIL_IF(!inst);
993 			*inst = XOR_r_rm;
994 		}
995 
996 		if ((op | 0x2) == SLJIT_DIV_SW) {
997 #if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32) || defined(_WIN64)
998 			EMIT_MOV(compiler, TMP_REG1, 0, SLJIT_R1, 0);
999 #endif
1000 
1001 #if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
1002 			inst = (sljit_u8*)ensure_buf(compiler, 1 + 1);
1003 			FAIL_IF(!inst);
1004 			INC_SIZE(1);
1005 			*inst = CDQ;
1006 #else
1007 			if (compiler->mode32) {
1008 				inst = (sljit_u8*)ensure_buf(compiler, 1 + 1);
1009 				FAIL_IF(!inst);
1010 				INC_SIZE(1);
1011 				*inst = CDQ;
1012 			} else {
1013 				inst = (sljit_u8*)ensure_buf(compiler, 1 + 2);
1014 				FAIL_IF(!inst);
1015 				INC_SIZE(2);
1016 				*inst++ = REX_W;
1017 				*inst = CDQ;
1018 			}
1019 #endif
1020 		}
1021 
1022 #if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
1023 		inst = (sljit_u8*)ensure_buf(compiler, 1 + 2);
1024 		FAIL_IF(!inst);
1025 		INC_SIZE(2);
1026 		*inst++ = GROUP_F7;
1027 		*inst = MOD_REG | ((op >= SLJIT_DIVMOD_UW) ? reg_map[TMP_REG1] : reg_map[SLJIT_R1]);
1028 #else
1029 #ifdef _WIN64
1030 		size = (!compiler->mode32 || op >= SLJIT_DIVMOD_UW) ? 3 : 2;
1031 #else
1032 		size = (!compiler->mode32) ? 3 : 2;
1033 #endif
1034 		inst = (sljit_u8*)ensure_buf(compiler, 1 + size);
1035 		FAIL_IF(!inst);
1036 		INC_SIZE(size);
1037 #ifdef _WIN64
1038 		if (!compiler->mode32)
1039 			*inst++ = REX_W | ((op >= SLJIT_DIVMOD_UW) ? REX_B : 0);
1040 		else if (op >= SLJIT_DIVMOD_UW)
1041 			*inst++ = REX_B;
1042 		*inst++ = GROUP_F7;
1043 		*inst = MOD_REG | ((op >= SLJIT_DIVMOD_UW) ? reg_lmap[TMP_REG1] : reg_lmap[SLJIT_R1]);
1044 #else
1045 		if (!compiler->mode32)
1046 			*inst++ = REX_W;
1047 		*inst++ = GROUP_F7;
1048 		*inst = MOD_REG | reg_map[SLJIT_R1];
1049 #endif
1050 #endif
1051 		switch (op) {
1052 		case SLJIT_LMUL_UW:
1053 			*inst |= MUL;
1054 			break;
1055 		case SLJIT_LMUL_SW:
1056 			*inst |= IMUL;
1057 			break;
1058 		case SLJIT_DIVMOD_UW:
1059 		case SLJIT_DIV_UW:
1060 			*inst |= DIV;
1061 			break;
1062 		case SLJIT_DIVMOD_SW:
1063 		case SLJIT_DIV_SW:
1064 			*inst |= IDIV;
1065 			break;
1066 		}
1067 #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64) && !defined(_WIN64)
1068 		if (op <= SLJIT_DIVMOD_SW)
1069 			EMIT_MOV(compiler, SLJIT_R1, 0, TMP_REG1, 0);
1070 #else
1071 		if (op >= SLJIT_DIV_UW)
1072 			EMIT_MOV(compiler, SLJIT_R1, 0, TMP_REG1, 0);
1073 #endif
1074 		break;
1075 	case SLJIT_ENDBR:
1076 		return emit_endbranch(compiler);
1077 	case SLJIT_SKIP_FRAMES_BEFORE_RETURN:
1078 		return skip_frames_before_return(compiler);
1079 	}
1080 
1081 	return SLJIT_SUCCESS;
1082 }
1083 
1084 #define ENCODE_PREFIX(prefix) \
1085 	do { \
1086 		inst = (sljit_u8*)ensure_buf(compiler, 1 + 1); \
1087 		FAIL_IF(!inst); \
1088 		INC_SIZE(1); \
1089 		*inst = (prefix); \
1090 	} while (0)
1091 
1092 static sljit_s32 emit_mov_byte(struct sljit_compiler *compiler, sljit_s32 sign,
1093 	sljit_s32 dst, sljit_sw dstw,
1094 	sljit_s32 src, sljit_sw srcw)
1095 {
1096 	sljit_u8* inst;
1097 	sljit_s32 dst_r;
1098 #if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
1099 	sljit_s32 work_r;
1100 #endif
1101 
1102 #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
1103 	compiler->mode32 = 0;
1104 #endif
1105 
1106 	if (src & SLJIT_IMM) {
1107 		if (FAST_IS_REG(dst)) {
1108 #if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
1109 			return emit_do_imm(compiler, MOV_r_i32 + reg_map[dst], srcw);
1110 #else
1111 			inst = emit_x86_instruction(compiler, 1, SLJIT_IMM, srcw, dst, 0);
1112 			FAIL_IF(!inst);
1113 			*inst = MOV_rm_i32;
1114 			return SLJIT_SUCCESS;
1115 #endif
1116 		}
1117 		inst = emit_x86_instruction(compiler, 1 | EX86_BYTE_ARG | EX86_NO_REXW, SLJIT_IMM, srcw, dst, dstw);
1118 		FAIL_IF(!inst);
1119 		*inst = MOV_rm8_i8;
1120 		return SLJIT_SUCCESS;
1121 	}
1122 
1123 	dst_r = FAST_IS_REG(dst) ? dst : TMP_REG1;
1124 
1125 	if ((dst & SLJIT_MEM) && FAST_IS_REG(src)) {
1126 #if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
1127 		if (reg_map[src] >= 4) {
1128 			SLJIT_ASSERT(dst_r == TMP_REG1);
1129 			EMIT_MOV(compiler, TMP_REG1, 0, src, 0);
1130 		} else
1131 			dst_r = src;
1132 #else
1133 		dst_r = src;
1134 #endif
1135 	}
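	/* On x86-32 only registers whose reg_map value is 0-3 (EAX, ECX, EDX, EBX)
	   have byte-addressable low parts, so the reg_map[...] >= 4 cases above and
	   below must go through a temporary or a shift/mask sequence instead. */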
1136 #if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
1137 	else if (FAST_IS_REG(src) && reg_map[src] >= 4) {
1138 		/* src, dst are registers. */
1139 		SLJIT_ASSERT(SLOW_IS_REG(dst));
1140 		if (reg_map[dst] < 4) {
1141 			if (dst != src)
1142 				EMIT_MOV(compiler, dst, 0, src, 0);
1143 			inst = emit_x86_instruction(compiler, 2, dst, 0, dst, 0);
1144 			FAIL_IF(!inst);
1145 			*inst++ = GROUP_0F;
1146 			*inst = sign ? MOVSX_r_rm8 : MOVZX_r_rm8;
1147 		}
1148 		else {
1149 			if (dst != src)
1150 				EMIT_MOV(compiler, dst, 0, src, 0);
1151 			if (sign) {
1152 				/* shl reg, 24 */
1153 				inst = emit_x86_instruction(compiler, 1 | EX86_SHIFT_INS, SLJIT_IMM, 24, dst, 0);
1154 				FAIL_IF(!inst);
1155 				*inst |= SHL;
1156 				/* sar reg, 24 */
1157 				inst = emit_x86_instruction(compiler, 1 | EX86_SHIFT_INS, SLJIT_IMM, 24, dst, 0);
1158 				FAIL_IF(!inst);
1159 				*inst |= SAR;
1160 			}
1161 			else {
1162 				inst = emit_x86_instruction(compiler, 1 | EX86_BIN_INS, SLJIT_IMM, 0xff, dst, 0);
1163 				FAIL_IF(!inst);
1164 				*(inst + 1) |= AND;
1165 			}
1166 		}
1167 		return SLJIT_SUCCESS;
1168 	}
1169 #endif
1170 	else {
1171 		/* src can be a memory address, or a register with reg_map[src] < 4 on the x86-32 architecture. */
1172 		inst = emit_x86_instruction(compiler, 2, dst_r, 0, src, srcw);
1173 		FAIL_IF(!inst);
1174 		*inst++ = GROUP_0F;
1175 		*inst = sign ? MOVSX_r_rm8 : MOVZX_r_rm8;
1176 	}
1177 
1178 	if (dst & SLJIT_MEM) {
1179 #if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
1180 		if (dst_r == TMP_REG1) {
1181 			/* Find an unused register whose reg_map value is < 4. */
1182 			if ((dst & REG_MASK) == SLJIT_R0) {
1183 				if ((dst & OFFS_REG_MASK) == TO_OFFS_REG(SLJIT_R1))
1184 					work_r = SLJIT_R2;
1185 				else
1186 					work_r = SLJIT_R1;
1187 			}
1188 			else {
1189 				if ((dst & OFFS_REG_MASK) != TO_OFFS_REG(SLJIT_R0))
1190 					work_r = SLJIT_R0;
1191 				else if ((dst & REG_MASK) == SLJIT_R1)
1192 					work_r = SLJIT_R2;
1193 				else
1194 					work_r = SLJIT_R1;
1195 			}
1196 
1197 			if (work_r == SLJIT_R0) {
1198 				ENCODE_PREFIX(XCHG_EAX_r + reg_map[TMP_REG1]);
1199 			}
1200 			else {
1201 				inst = emit_x86_instruction(compiler, 1, work_r, 0, dst_r, 0);
1202 				FAIL_IF(!inst);
1203 				*inst = XCHG_r_rm;
1204 			}
1205 
1206 			inst = emit_x86_instruction(compiler, 1, work_r, 0, dst, dstw);
1207 			FAIL_IF(!inst);
1208 			*inst = MOV_rm8_r8;
1209 
1210 			if (work_r == SLJIT_R0) {
1211 				ENCODE_PREFIX(XCHG_EAX_r + reg_map[TMP_REG1]);
1212 			}
1213 			else {
1214 				inst = emit_x86_instruction(compiler, 1, work_r, 0, dst_r, 0);
1215 				FAIL_IF(!inst);
1216 				*inst = XCHG_r_rm;
1217 			}
1218 		}
1219 		else {
1220 			inst = emit_x86_instruction(compiler, 1, dst_r, 0, dst, dstw);
1221 			FAIL_IF(!inst);
1222 			*inst = MOV_rm8_r8;
1223 		}
1224 #else
1225 		inst = emit_x86_instruction(compiler, 1 | EX86_REX | EX86_NO_REXW, dst_r, 0, dst, dstw);
1226 		FAIL_IF(!inst);
1227 		*inst = MOV_rm8_r8;
1228 #endif
1229 	}
1230 
1231 	return SLJIT_SUCCESS;
1232 }
1233 
1234 static sljit_s32 emit_prefetch(struct sljit_compiler *compiler, sljit_s32 op,
1235 	sljit_s32 src, sljit_sw srcw)
1236 {
1237 	sljit_u8* inst;
1238 
1239 #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
1240 	compiler->mode32 = 1;
1241 #endif
1242 
1243 	inst = emit_x86_instruction(compiler, 2, 0, 0, src, srcw);
1244 	FAIL_IF(!inst);
1245 	*inst++ = GROUP_0F;
1246 	*inst++ = PREFETCH;
1247 
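	/* The reg field of the 0x0f 0x18 ModRM byte selects the locality hint:
	   /1 = prefetcht0, /2 = prefetcht1, /3 = prefetcht2; when no L1/L2/L3
	   level is requested the field is left as 0, i.e. prefetchnta. */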
1248 	if (op == SLJIT_PREFETCH_L1)
1249 		*inst |= (1 << 3);
1250 	else if (op == SLJIT_PREFETCH_L2)
1251 		*inst |= (2 << 3);
1252 	else if (op == SLJIT_PREFETCH_L3)
1253 		*inst |= (3 << 3);
1254 
1255 	return SLJIT_SUCCESS;
1256 }
1257 
1258 static sljit_s32 emit_mov_half(struct sljit_compiler *compiler, sljit_s32 sign,
1259 	sljit_s32 dst, sljit_sw dstw,
1260 	sljit_s32 src, sljit_sw srcw)
1261 {
1262 	sljit_u8* inst;
1263 	sljit_s32 dst_r;
1264 
1265 #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
1266 	compiler->mode32 = 0;
1267 #endif
1268 
1269 	if (src & SLJIT_IMM) {
1270 		if (FAST_IS_REG(dst)) {
1271 #if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
1272 			return emit_do_imm(compiler, MOV_r_i32 + reg_map[dst], srcw);
1273 #else
1274 			inst = emit_x86_instruction(compiler, 1, SLJIT_IMM, srcw, dst, 0);
1275 			FAIL_IF(!inst);
1276 			*inst = MOV_rm_i32;
1277 			return SLJIT_SUCCESS;
1278 #endif
1279 		}
1280 		inst = emit_x86_instruction(compiler, 1 | EX86_HALF_ARG | EX86_NO_REXW | EX86_PREF_66, SLJIT_IMM, srcw, dst, dstw);
1281 		FAIL_IF(!inst);
1282 		*inst = MOV_rm_i32;
1283 		return SLJIT_SUCCESS;
1284 	}
1285 
1286 	dst_r = FAST_IS_REG(dst) ? dst : TMP_REG1;
1287 
1288 	if ((dst & SLJIT_MEM) && FAST_IS_REG(src))
1289 		dst_r = src;
1290 	else {
1291 		inst = emit_x86_instruction(compiler, 2, dst_r, 0, src, srcw);
1292 		FAIL_IF(!inst);
1293 		*inst++ = GROUP_0F;
1294 		*inst = sign ? MOVSX_r_rm16 : MOVZX_r_rm16;
1295 	}
1296 
1297 	if (dst & SLJIT_MEM) {
1298 		inst = emit_x86_instruction(compiler, 1 | EX86_NO_REXW | EX86_PREF_66, dst_r, 0, dst, dstw);
1299 		FAIL_IF(!inst);
1300 		*inst = MOV_rm_r;
1301 	}
1302 
1303 	return SLJIT_SUCCESS;
1304 }
1305 
1306 static sljit_s32 emit_unary(struct sljit_compiler *compiler, sljit_u8 opcode,
1307 	sljit_s32 dst, sljit_sw dstw,
1308 	sljit_s32 src, sljit_sw srcw)
1309 {
1310 	sljit_u8* inst;
1311 
1312 	if (dst == src && dstw == srcw) {
1313 		/* Same input and output */
1314 		inst = emit_x86_instruction(compiler, 1, 0, 0, dst, dstw);
1315 		FAIL_IF(!inst);
1316 		*inst++ = GROUP_F7;
1317 		*inst |= opcode;
1318 		return SLJIT_SUCCESS;
1319 	}
1320 
1321 	if (SLJIT_UNLIKELY(dst == SLJIT_UNUSED))
1322 		dst = TMP_REG1;
1323 
1324 	if (FAST_IS_REG(dst)) {
1325 		EMIT_MOV(compiler, dst, 0, src, srcw);
1326 		inst = emit_x86_instruction(compiler, 1, 0, 0, dst, 0);
1327 		FAIL_IF(!inst);
1328 		*inst++ = GROUP_F7;
1329 		*inst |= opcode;
1330 		return SLJIT_SUCCESS;
1331 	}
1332 
1333 	EMIT_MOV(compiler, TMP_REG1, 0, src, srcw);
1334 	inst = emit_x86_instruction(compiler, 1, 0, 0, TMP_REG1, 0);
1335 	FAIL_IF(!inst);
1336 	*inst++ = GROUP_F7;
1337 	*inst |= opcode;
1338 	EMIT_MOV(compiler, dst, dstw, TMP_REG1, 0);
1339 	return SLJIT_SUCCESS;
1340 }
1341 
1342 static sljit_s32 emit_not_with_flags(struct sljit_compiler *compiler,
1343 	sljit_s32 dst, sljit_sw dstw,
1344 	sljit_s32 src, sljit_sw srcw)
1345 {
1346 	sljit_u8* inst;
1347 
1348 	if (dst == SLJIT_UNUSED)
1349 		dst = TMP_REG1;
1350 
1351 	if (FAST_IS_REG(dst)) {
1352 		EMIT_MOV(compiler, dst, 0, src, srcw);
1353 		inst = emit_x86_instruction(compiler, 1, 0, 0, dst, 0);
1354 		FAIL_IF(!inst);
1355 		*inst++ = GROUP_F7;
1356 		*inst |= NOT_rm;
1357 		inst = emit_x86_instruction(compiler, 1, dst, 0, dst, 0);
1358 		FAIL_IF(!inst);
1359 		*inst = OR_r_rm;
1360 		return SLJIT_SUCCESS;
1361 	}
1362 
1363 	EMIT_MOV(compiler, TMP_REG1, 0, src, srcw);
1364 	inst = emit_x86_instruction(compiler, 1, 0, 0, TMP_REG1, 0);
1365 	FAIL_IF(!inst);
1366 	*inst++ = GROUP_F7;
1367 	*inst |= NOT_rm;
1368 	inst = emit_x86_instruction(compiler, 1, TMP_REG1, 0, TMP_REG1, 0);
1369 	FAIL_IF(!inst);
1370 	*inst = OR_r_rm;
1371 	EMIT_MOV(compiler, dst, dstw, TMP_REG1, 0);
1372 	return SLJIT_SUCCESS;
1373 }
1374 
1375 #if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
1376 static const sljit_sw emit_clz_arg = 32 + 31;
1377 #endif
1378 
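/* CLZ is computed with BSR: the index of the highest set bit XOR-ed with 31
   (or 63 in 64-bit mode) gives the leading-zero count. BSR leaves its
   destination undefined for a zero input, so 32 + 31 (or 64 + 63) is loaded
   first via CMOV (or the generic conditional-move fallback), which makes the
   final result 32 (or 64) in that case. */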
1379 static sljit_s32 emit_clz(struct sljit_compiler *compiler, sljit_s32 op_flags,
1380 	sljit_s32 dst, sljit_sw dstw,
1381 	sljit_s32 src, sljit_sw srcw)
1382 {
1383 	sljit_u8* inst;
1384 	sljit_s32 dst_r;
1385 
1386 	SLJIT_UNUSED_ARG(op_flags);
1387 
1388 	if (cpu_has_cmov == -1)
1389 		get_cpu_features();
1390 
1391 	dst_r = FAST_IS_REG(dst) ? dst : TMP_REG1;
1392 
1393 	inst = emit_x86_instruction(compiler, 2, dst_r, 0, src, srcw);
1394 	FAIL_IF(!inst);
1395 	*inst++ = GROUP_0F;
1396 	*inst = BSR_r_rm;
1397 
1398 #if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
1399 	if (cpu_has_cmov) {
1400 		if (dst_r != TMP_REG1) {
1401 			EMIT_MOV(compiler, TMP_REG1, 0, SLJIT_IMM, 32 + 31);
1402 			inst = emit_x86_instruction(compiler, 2, dst_r, 0, TMP_REG1, 0);
1403 		}
1404 		else
1405 			inst = emit_x86_instruction(compiler, 2, dst_r, 0, SLJIT_MEM0(), (sljit_sw)&emit_clz_arg);
1406 
1407 		FAIL_IF(!inst);
1408 		*inst++ = GROUP_0F;
1409 		*inst = CMOVE_r_rm;
1410 	}
1411 	else
1412 		FAIL_IF(sljit_emit_cmov_generic(compiler, SLJIT_EQUAL, dst_r, SLJIT_IMM, 32 + 31));
1413 
1414 	inst = emit_x86_instruction(compiler, 1 | EX86_BIN_INS, SLJIT_IMM, 31, dst_r, 0);
1415 #else
1416 	if (cpu_has_cmov) {
1417 		EMIT_MOV(compiler, TMP_REG2, 0, SLJIT_IMM, !(op_flags & SLJIT_I32_OP) ? (64 + 63) : (32 + 31));
1418 
1419 		inst = emit_x86_instruction(compiler, 2, dst_r, 0, TMP_REG2, 0);
1420 		FAIL_IF(!inst);
1421 		*inst++ = GROUP_0F;
1422 		*inst = CMOVE_r_rm;
1423 	}
1424 	else
1425 		FAIL_IF(sljit_emit_cmov_generic(compiler, SLJIT_EQUAL, dst_r, SLJIT_IMM, !(op_flags & SLJIT_I32_OP) ? (64 + 63) : (32 + 31)));
1426 
1427 	inst = emit_x86_instruction(compiler, 1 | EX86_BIN_INS, SLJIT_IMM, !(op_flags & SLJIT_I32_OP) ? 63 : 31, dst_r, 0);
1428 #endif
1429 
1430 	FAIL_IF(!inst);
1431 	*(inst + 1) |= XOR;
1432 
1433 	if (dst & SLJIT_MEM)
1434 		EMIT_MOV(compiler, dst, dstw, TMP_REG1, 0);
1435 	return SLJIT_SUCCESS;
1436 }
1437 
1438 SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_op1(struct sljit_compiler *compiler, sljit_s32 op,
1439 	sljit_s32 dst, sljit_sw dstw,
1440 	sljit_s32 src, sljit_sw srcw)
1441 {
1442 	sljit_s32 op_flags = GET_ALL_FLAGS(op);
1443 #if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
1444 	sljit_s32 dst_is_ereg = 0;
1445 #endif
1446 
1447 	CHECK_ERROR();
1448 	CHECK(check_sljit_emit_op1(compiler, op, dst, dstw, src, srcw));
1449 	ADJUST_LOCAL_OFFSET(dst, dstw);
1450 	ADJUST_LOCAL_OFFSET(src, srcw);
1451 
1452 	CHECK_EXTRA_REGS(dst, dstw, dst_is_ereg = 1);
1453 	CHECK_EXTRA_REGS(src, srcw, (void)0);
1454 #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
1455 	compiler->mode32 = op_flags & SLJIT_I32_OP;
1456 #endif
1457 
1458 	op = GET_OPCODE(op);
1459 
1460 	if (op >= SLJIT_MOV && op <= SLJIT_MOV_P) {
1461 #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
1462 		compiler->mode32 = 0;
1463 #endif
1464 
1465 		if (FAST_IS_REG(src) && src == dst) {
1466 			if (!TYPE_CAST_NEEDED(op))
1467 				return SLJIT_SUCCESS;
1468 		}
1469 
1470 		if (op_flags & SLJIT_I32_OP) {
1471 #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
1472 			if (src & SLJIT_MEM) {
1473 				if (op == SLJIT_MOV_S32)
1474 					op = SLJIT_MOV_U32;
1475 			}
1476 			else if (src & SLJIT_IMM) {
1477 				if (op == SLJIT_MOV_U32)
1478 					op = SLJIT_MOV_S32;
1479 			}
1480 #endif
1481 		}
1482 
1483 		if (src & SLJIT_IMM) {
1484 			switch (op) {
1485 			case SLJIT_MOV_U8:
1486 				srcw = (sljit_u8)srcw;
1487 				break;
1488 			case SLJIT_MOV_S8:
1489 				srcw = (sljit_s8)srcw;
1490 				break;
1491 			case SLJIT_MOV_U16:
1492 				srcw = (sljit_u16)srcw;
1493 				break;
1494 			case SLJIT_MOV_S16:
1495 				srcw = (sljit_s16)srcw;
1496 				break;
1497 #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
1498 			case SLJIT_MOV_U32:
1499 				srcw = (sljit_u32)srcw;
1500 				break;
1501 			case SLJIT_MOV_S32:
1502 				srcw = (sljit_s32)srcw;
1503 				break;
1504 #endif
1505 			}
1506 #if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
1507 			if (SLJIT_UNLIKELY(dst_is_ereg))
1508 				return emit_mov(compiler, dst, dstw, src, srcw);
1509 #endif
1510 		}
1511 
1512 #if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
1513 		if (SLJIT_UNLIKELY(dst_is_ereg) && (!(op == SLJIT_MOV || op == SLJIT_MOV_U32 || op == SLJIT_MOV_S32 || op == SLJIT_MOV_P) || (src & SLJIT_MEM))) {
1514 			SLJIT_ASSERT(dst == SLJIT_MEM1(SLJIT_SP));
1515 			dst = TMP_REG1;
1516 		}
1517 #endif
1518 
1519 		switch (op) {
1520 		case SLJIT_MOV:
1521 		case SLJIT_MOV_P:
1522 #if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
1523 		case SLJIT_MOV_U32:
1524 		case SLJIT_MOV_S32:
1525 #endif
1526 			FAIL_IF(emit_mov(compiler, dst, dstw, src, srcw));
1527 			break;
1528 		case SLJIT_MOV_U8:
1529 			FAIL_IF(emit_mov_byte(compiler, 0, dst, dstw, src, srcw));
1530 			break;
1531 		case SLJIT_MOV_S8:
1532 			FAIL_IF(emit_mov_byte(compiler, 1, dst, dstw, src, srcw));
1533 			break;
1534 		case SLJIT_MOV_U16:
1535 			FAIL_IF(emit_mov_half(compiler, 0, dst, dstw, src, srcw));
1536 			break;
1537 		case SLJIT_MOV_S16:
1538 			FAIL_IF(emit_mov_half(compiler, 1, dst, dstw, src, srcw));
1539 			break;
1540 #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
1541 		case SLJIT_MOV_U32:
1542 			FAIL_IF(emit_mov_int(compiler, 0, dst, dstw, src, srcw));
1543 			break;
1544 		case SLJIT_MOV_S32:
1545 			FAIL_IF(emit_mov_int(compiler, 1, dst, dstw, src, srcw));
1546 			break;
1547 #endif
1548 		}
1549 
1550 #if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
1551 		if (SLJIT_UNLIKELY(dst_is_ereg) && dst == TMP_REG1)
1552 			return emit_mov(compiler, SLJIT_MEM1(SLJIT_SP), dstw, TMP_REG1, 0);
1553 #endif
1554 		return SLJIT_SUCCESS;
1555 	}
1556 
1557 	switch (op) {
1558 	case SLJIT_NOT:
1559 		if (SLJIT_UNLIKELY(op_flags & SLJIT_SET_Z))
1560 			return emit_not_with_flags(compiler, dst, dstw, src, srcw);
1561 		return emit_unary(compiler, NOT_rm, dst, dstw, src, srcw);
1562 
1563 	case SLJIT_NEG:
1564 		return emit_unary(compiler, NEG_rm, dst, dstw, src, srcw);
1565 
1566 	case SLJIT_CLZ:
1567 		return emit_clz(compiler, op_flags, dst, dstw, src, srcw);
1568 	}
1569 
1570 	return SLJIT_SUCCESS;
1571 }
1572 
1573 #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
1574 
1575 #define BINARY_IMM(op_imm, op_mr, immw, arg, argw) \
1576 	if (IS_HALFWORD(immw) || compiler->mode32) { \
1577 		inst = emit_x86_instruction(compiler, 1 | EX86_BIN_INS, SLJIT_IMM, immw, arg, argw); \
1578 		FAIL_IF(!inst); \
1579 		*(inst + 1) |= (op_imm); \
1580 	} \
1581 	else { \
1582 		FAIL_IF(emit_load_imm64(compiler, (arg == TMP_REG1) ? TMP_REG2 : TMP_REG1, immw)); \
1583 		inst = emit_x86_instruction(compiler, 1, (arg == TMP_REG1) ? TMP_REG2 : TMP_REG1, 0, arg, argw); \
1584 		FAIL_IF(!inst); \
1585 		*inst = (op_mr); \
1586 	}
1587 
1588 #define BINARY_EAX_IMM(op_eax_imm, immw) \
1589 	FAIL_IF(emit_do_imm32(compiler, (!compiler->mode32) ? REX_W : 0, (op_eax_imm), immw))
1590 
1591 #else
1592 
1593 #define BINARY_IMM(op_imm, op_mr, immw, arg, argw) \
1594 	inst = emit_x86_instruction(compiler, 1 | EX86_BIN_INS, SLJIT_IMM, immw, arg, argw); \
1595 	FAIL_IF(!inst); \
1596 	*(inst + 1) |= (op_imm);
1597 
1598 #define BINARY_EAX_IMM(op_eax_imm, immw) \
1599 	FAIL_IF(emit_do_imm(compiler, (op_eax_imm), immw))
1600 
1601 #endif
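/* On x86-64 the 0x81/0x83 group only takes a sign-extended 32-bit immediate,
   so BINARY_IMM above falls back to loading a non-halfword immediate into a
   temporary register and using the r/m,reg form of the instruction instead. */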
1602 
1603 static sljit_s32 emit_cum_binary(struct sljit_compiler *compiler,
1604 	sljit_u32 op_types,
1605 	sljit_s32 dst, sljit_sw dstw,
1606 	sljit_s32 src1, sljit_sw src1w,
1607 	sljit_s32 src2, sljit_sw src2w)
1608 {
1609 	sljit_u8* inst;
1610 	sljit_u8 op_eax_imm = (op_types >> 24);
1611 	sljit_u8 op_rm = (op_types >> 16) & 0xff;
1612 	sljit_u8 op_mr = (op_types >> 8) & 0xff;
1613 	sljit_u8 op_imm = op_types & 0xff;
1614 
1615 	if (dst == SLJIT_UNUSED) {
1616 		EMIT_MOV(compiler, TMP_REG1, 0, src1, src1w);
1617 		if (src2 & SLJIT_IMM) {
1618 			BINARY_IMM(op_imm, op_mr, src2w, TMP_REG1, 0);
1619 		}
1620 		else {
1621 			inst = emit_x86_instruction(compiler, 1, TMP_REG1, 0, src2, src2w);
1622 			FAIL_IF(!inst);
1623 			*inst = op_rm;
1624 		}
1625 		return SLJIT_SUCCESS;
1626 	}
1627 
1628 	if (dst == src1 && dstw == src1w) {
1629 		if (src2 & SLJIT_IMM) {
1630 #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
1631 			if ((dst == SLJIT_R0) && (src2w > 127 || src2w < -128) && (compiler->mode32 || IS_HALFWORD(src2w))) {
1632 #else
1633 			if ((dst == SLJIT_R0) && (src2w > 127 || src2w < -128)) {
1634 #endif
1635 				BINARY_EAX_IMM(op_eax_imm, src2w);
1636 			}
1637 			else {
1638 				BINARY_IMM(op_imm, op_mr, src2w, dst, dstw);
1639 			}
1640 		}
1641 		else if (FAST_IS_REG(dst)) {
1642 			inst = emit_x86_instruction(compiler, 1, dst, dstw, src2, src2w);
1643 			FAIL_IF(!inst);
1644 			*inst = op_rm;
1645 		}
1646 		else if (FAST_IS_REG(src2)) {
1647 			/* Special exception for sljit_emit_op_flags. */
1648 			inst = emit_x86_instruction(compiler, 1, src2, src2w, dst, dstw);
1649 			FAIL_IF(!inst);
1650 			*inst = op_mr;
1651 		}
1652 		else {
1653 			EMIT_MOV(compiler, TMP_REG1, 0, src2, src2w);
1654 			inst = emit_x86_instruction(compiler, 1, TMP_REG1, 0, dst, dstw);
1655 			FAIL_IF(!inst);
1656 			*inst = op_mr;
1657 		}
1658 		return SLJIT_SUCCESS;
1659 	}
1660 
1661 	/* Only for cumulative operations. */
1662 	if (dst == src2 && dstw == src2w) {
1663 		if (src1 & SLJIT_IMM) {
1664 #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
1665 			if ((dst == SLJIT_R0) && (src1w > 127 || src1w < -128) && (compiler->mode32 || IS_HALFWORD(src1w))) {
1666 #else
1667 			if ((dst == SLJIT_R0) && (src1w > 127 || src1w < -128)) {
1668 #endif
1669 				BINARY_EAX_IMM(op_eax_imm, src1w);
1670 			}
1671 			else {
1672 				BINARY_IMM(op_imm, op_mr, src1w, dst, dstw);
1673 			}
1674 		}
1675 		else if (FAST_IS_REG(dst)) {
1676 			inst = emit_x86_instruction(compiler, 1, dst, dstw, src1, src1w);
1677 			FAIL_IF(!inst);
1678 			*inst = op_rm;
1679 		}
1680 		else if (FAST_IS_REG(src1)) {
1681 			inst = emit_x86_instruction(compiler, 1, src1, src1w, dst, dstw);
1682 			FAIL_IF(!inst);
1683 			*inst = op_mr;
1684 		}
1685 		else {
1686 			EMIT_MOV(compiler, TMP_REG1, 0, src1, src1w);
1687 			inst = emit_x86_instruction(compiler, 1, TMP_REG1, 0, dst, dstw);
1688 			FAIL_IF(!inst);
1689 			*inst = op_mr;
1690 		}
1691 		return SLJIT_SUCCESS;
1692 	}
1693 
1694 	/* General version. */
1695 	if (FAST_IS_REG(dst)) {
1696 		EMIT_MOV(compiler, dst, 0, src1, src1w);
1697 		if (src2 & SLJIT_IMM) {
1698 			BINARY_IMM(op_imm, op_mr, src2w, dst, 0);
1699 		}
1700 		else {
1701 			inst = emit_x86_instruction(compiler, 1, dst, 0, src2, src2w);
1702 			FAIL_IF(!inst);
1703 			*inst = op_rm;
1704 		}
1705 	}
1706 	else {
1707 		/* This version requires fewer memory writes. */
1708 		EMIT_MOV(compiler, TMP_REG1, 0, src1, src1w);
1709 		if (src2 & SLJIT_IMM) {
1710 			BINARY_IMM(op_imm, op_mr, src2w, TMP_REG1, 0);
1711 		}
1712 		else {
1713 			inst = emit_x86_instruction(compiler, 1, TMP_REG1, 0, src2, src2w);
1714 			FAIL_IF(!inst);
1715 			*inst = op_rm;
1716 		}
1717 		EMIT_MOV(compiler, dst, dstw, TMP_REG1, 0);
1718 	}
1719 
1720 	return SLJIT_SUCCESS;
1721 }
1722 
1723 static sljit_s32 emit_non_cum_binary(struct sljit_compiler *compiler,
1724 	sljit_u32 op_types,
1725 	sljit_s32 dst, sljit_sw dstw,
1726 	sljit_s32 src1, sljit_sw src1w,
1727 	sljit_s32 src2, sljit_sw src2w)
1728 {
1729 	sljit_u8* inst;
1730 	sljit_u8 op_eax_imm = (op_types >> 24);
1731 	sljit_u8 op_rm = (op_types >> 16) & 0xff;
1732 	sljit_u8 op_mr = (op_types >> 8) & 0xff;
1733 	sljit_u8 op_imm = op_types & 0xff;
1734 
1735 	if (dst == SLJIT_UNUSED) {
1736 		EMIT_MOV(compiler, TMP_REG1, 0, src1, src1w);
1737 		if (src2 & SLJIT_IMM) {
1738 			BINARY_IMM(op_imm, op_mr, src2w, TMP_REG1, 0);
1739 		}
1740 		else {
1741 			inst = emit_x86_instruction(compiler, 1, TMP_REG1, 0, src2, src2w);
1742 			FAIL_IF(!inst);
1743 			*inst = op_rm;
1744 		}
1745 		return SLJIT_SUCCESS;
1746 	}
1747 
1748 	if (dst == src1 && dstw == src1w) {
1749 		if (src2 & SLJIT_IMM) {
1750 #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
1751 			if ((dst == SLJIT_R0) && (src2w > 127 || src2w < -128) && (compiler->mode32 || IS_HALFWORD(src2w))) {
1752 #else
1753 			if ((dst == SLJIT_R0) && (src2w > 127 || src2w < -128)) {
1754 #endif
1755 				BINARY_EAX_IMM(op_eax_imm, src2w);
1756 			}
1757 			else {
1758 				BINARY_IMM(op_imm, op_mr, src2w, dst, dstw);
1759 			}
1760 		}
1761 		else if (FAST_IS_REG(dst)) {
1762 			inst = emit_x86_instruction(compiler, 1, dst, dstw, src2, src2w);
1763 			FAIL_IF(!inst);
1764 			*inst = op_rm;
1765 		}
1766 		else if (FAST_IS_REG(src2)) {
1767 			inst = emit_x86_instruction(compiler, 1, src2, src2w, dst, dstw);
1768 			FAIL_IF(!inst);
1769 			*inst = op_mr;
1770 		}
1771 		else {
1772 			EMIT_MOV(compiler, TMP_REG1, 0, src2, src2w);
1773 			inst = emit_x86_instruction(compiler, 1, TMP_REG1, 0, dst, dstw);
1774 			FAIL_IF(!inst);
1775 			*inst = op_mr;
1776 		}
1777 		return SLJIT_SUCCESS;
1778 	}
1779 
1780 	/* General version. */
1781 	if (FAST_IS_REG(dst) && dst != src2) {
1782 		EMIT_MOV(compiler, dst, 0, src1, src1w);
1783 		if (src2 & SLJIT_IMM) {
1784 			BINARY_IMM(op_imm, op_mr, src2w, dst, 0);
1785 		}
1786 		else {
1787 			inst = emit_x86_instruction(compiler, 1, dst, 0, src2, src2w);
1788 			FAIL_IF(!inst);
1789 			*inst = op_rm;
1790 		}
1791 	}
1792 	else {
1793 		/* This version requires fewer memory writes. */
1794 		EMIT_MOV(compiler, TMP_REG1, 0, src1, src1w);
1795 		if (src2 & SLJIT_IMM) {
1796 			BINARY_IMM(op_imm, op_mr, src2w, TMP_REG1, 0);
1797 		}
1798 		else {
1799 			inst = emit_x86_instruction(compiler, 1, TMP_REG1, 0, src2, src2w);
1800 			FAIL_IF(!inst);
1801 			*inst = op_rm;
1802 		}
1803 		EMIT_MOV(compiler, dst, dstw, TMP_REG1, 0);
1804 	}
1805 
1806 	return SLJIT_SUCCESS;
1807 }
1808 
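/* Signed multiplication. Depending on the operand kinds this emits one of
   the three IMUL forms: reg, r/m (two byte 0F opcode), reg, r/m, imm8 or
   reg, r/m, imm32. The product is computed in a register (TMP_REG1 when the
   destination is in memory) and stored afterwards. */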
1809 static sljit_s32 emit_mul(struct sljit_compiler *compiler,
1810 	sljit_s32 dst, sljit_sw dstw,
1811 	sljit_s32 src1, sljit_sw src1w,
1812 	sljit_s32 src2, sljit_sw src2w)
1813 {
1814 	sljit_u8* inst;
1815 	sljit_s32 dst_r;
1816 
1817 	dst_r = SLOW_IS_REG(dst) ? dst : TMP_REG1;
1818 
1819 	/* Register destination. */
1820 	if (dst_r == src1 && !(src2 & SLJIT_IMM)) {
1821 		inst = emit_x86_instruction(compiler, 2, dst_r, 0, src2, src2w);
1822 		FAIL_IF(!inst);
1823 		*inst++ = GROUP_0F;
1824 		*inst = IMUL_r_rm;
1825 	}
1826 	else if (dst_r == src2 && !(src1 & SLJIT_IMM)) {
1827 		inst = emit_x86_instruction(compiler, 2, dst_r, 0, src1, src1w);
1828 		FAIL_IF(!inst);
1829 		*inst++ = GROUP_0F;
1830 		*inst = IMUL_r_rm;
1831 	}
1832 	else if (src1 & SLJIT_IMM) {
1833 		if (src2 & SLJIT_IMM) {
1834 			EMIT_MOV(compiler, dst_r, 0, SLJIT_IMM, src2w);
1835 			src2 = dst_r;
1836 			src2w = 0;
1837 		}
1838 
1839 		if (src1w <= 127 && src1w >= -128) {
1840 			inst = emit_x86_instruction(compiler, 1, dst_r, 0, src2, src2w);
1841 			FAIL_IF(!inst);
1842 			*inst = IMUL_r_rm_i8;
1843 			inst = (sljit_u8*)ensure_buf(compiler, 1 + 1);
1844 			FAIL_IF(!inst);
1845 			INC_SIZE(1);
1846 			*inst = (sljit_s8)src1w;
1847 		}
1848 #if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
1849 		else {
1850 			inst = emit_x86_instruction(compiler, 1, dst_r, 0, src2, src2w);
1851 			FAIL_IF(!inst);
1852 			*inst = IMUL_r_rm_i32;
1853 			inst = (sljit_u8*)ensure_buf(compiler, 1 + 4);
1854 			FAIL_IF(!inst);
1855 			INC_SIZE(4);
1856 			sljit_unaligned_store_sw(inst, src1w);
1857 		}
1858 #else
1859 		else if (IS_HALFWORD(src1w)) {
1860 			inst = emit_x86_instruction(compiler, 1, dst_r, 0, src2, src2w);
1861 			FAIL_IF(!inst);
1862 			*inst = IMUL_r_rm_i32;
1863 			inst = (sljit_u8*)ensure_buf(compiler, 1 + 4);
1864 			FAIL_IF(!inst);
1865 			INC_SIZE(4);
1866 			sljit_unaligned_store_s32(inst, (sljit_s32)src1w);
1867 		}
1868 		else {
1869 			if (dst_r != src2)
1870 				EMIT_MOV(compiler, dst_r, 0, src2, src2w);
1871 			FAIL_IF(emit_load_imm64(compiler, TMP_REG2, src1w));
1872 			inst = emit_x86_instruction(compiler, 2, dst_r, 0, TMP_REG2, 0);
1873 			FAIL_IF(!inst);
1874 			*inst++ = GROUP_0F;
1875 			*inst = IMUL_r_rm;
1876 		}
1877 #endif
1878 	}
1879 	else if (src2 & SLJIT_IMM) {
1880 		/* Note: src1 is NOT immediate. */
1881 
1882 		if (src2w <= 127 && src2w >= -128) {
1883 			inst = emit_x86_instruction(compiler, 1, dst_r, 0, src1, src1w);
1884 			FAIL_IF(!inst);
1885 			*inst = IMUL_r_rm_i8;
1886 			inst = (sljit_u8*)ensure_buf(compiler, 1 + 1);
1887 			FAIL_IF(!inst);
1888 			INC_SIZE(1);
1889 			*inst = (sljit_s8)src2w;
1890 		}
1891 #if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
1892 		else {
1893 			inst = emit_x86_instruction(compiler, 1, dst_r, 0, src1, src1w);
1894 			FAIL_IF(!inst);
1895 			*inst = IMUL_r_rm_i32;
1896 			inst = (sljit_u8*)ensure_buf(compiler, 1 + 4);
1897 			FAIL_IF(!inst);
1898 			INC_SIZE(4);
1899 			sljit_unaligned_store_sw(inst, src2w);
1900 		}
1901 #else
1902 		else if (IS_HALFWORD(src2w)) {
1903 			inst = emit_x86_instruction(compiler, 1, dst_r, 0, src1, src1w);
1904 			FAIL_IF(!inst);
1905 			*inst = IMUL_r_rm_i32;
1906 			inst = (sljit_u8*)ensure_buf(compiler, 1 + 4);
1907 			FAIL_IF(!inst);
1908 			INC_SIZE(4);
1909 			sljit_unaligned_store_s32(inst, (sljit_s32)src2w);
1910 		}
1911 		else {
1912 			if (dst_r != src1)
1913 				EMIT_MOV(compiler, dst_r, 0, src1, src1w);
1914 			FAIL_IF(emit_load_imm64(compiler, TMP_REG2, src2w));
1915 			inst = emit_x86_instruction(compiler, 2, dst_r, 0, TMP_REG2, 0);
1916 			FAIL_IF(!inst);
1917 			*inst++ = GROUP_0F;
1918 			*inst = IMUL_r_rm;
1919 		}
1920 #endif
1921 	}
1922 	else {
1923 		/* Neither argument is immediate. */
1924 		if (ADDRESSING_DEPENDS_ON(src2, dst_r))
1925 			dst_r = TMP_REG1;
1926 		EMIT_MOV(compiler, dst_r, 0, src1, src1w);
1927 		inst = emit_x86_instruction(compiler, 2, dst_r, 0, src2, src2w);
1928 		FAIL_IF(!inst);
1929 		*inst++ = GROUP_0F;
1930 		*inst = IMUL_r_rm;
1931 	}
1932 
1933 	if (dst & SLJIT_MEM)
1934 		EMIT_MOV(compiler, dst, dstw, TMP_REG1, 0);
1935 
1936 	return SLJIT_SUCCESS;
1937 }
1938 
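/* Try to emit an addition as a single LEA, e.g. lea dst, [src1 + src2] or
   lea dst, [src1 + imm]. LEA does not modify the status flags, so it is only
   used when the operation does not need them; SLJIT_ERR_UNSUPPORTED is
   returned when the operands do not fit this pattern. */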
1939 static sljit_s32 emit_lea_binary(struct sljit_compiler *compiler,
1940 	sljit_s32 dst, sljit_sw dstw,
1941 	sljit_s32 src1, sljit_sw src1w,
1942 	sljit_s32 src2, sljit_sw src2w)
1943 {
1944 	sljit_u8* inst;
1945 	sljit_s32 dst_r, done = 0;
1946 
1947 	/* These cases are better handled by the normal code path. */
1948 	if (dst == src1 && dstw == src1w)
1949 		return SLJIT_ERR_UNSUPPORTED;
1950 	if (dst == src2 && dstw == src2w)
1951 		return SLJIT_ERR_UNSUPPORTED;
1952 
1953 	dst_r = FAST_IS_REG(dst) ? dst : TMP_REG1;
1954 
1955 	if (FAST_IS_REG(src1)) {
1956 		if (FAST_IS_REG(src2)) {
1957 			inst = emit_x86_instruction(compiler, 1, dst_r, 0, SLJIT_MEM2(src1, src2), 0);
1958 			FAIL_IF(!inst);
1959 			*inst = LEA_r_m;
1960 			done = 1;
1961 		}
1962 #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
1963 		if ((src2 & SLJIT_IMM) && (compiler->mode32 || IS_HALFWORD(src2w))) {
1964 			inst = emit_x86_instruction(compiler, 1, dst_r, 0, SLJIT_MEM1(src1), (sljit_s32)src2w);
1965 #else
1966 		if (src2 & SLJIT_IMM) {
1967 			inst = emit_x86_instruction(compiler, 1, dst_r, 0, SLJIT_MEM1(src1), src2w);
1968 #endif
1969 			FAIL_IF(!inst);
1970 			*inst = LEA_r_m;
1971 			done = 1;
1972 		}
1973 	}
1974 	else if (FAST_IS_REG(src2)) {
1975 #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
1976 		if ((src1 & SLJIT_IMM) && (compiler->mode32 || IS_HALFWORD(src1w))) {
1977 			inst = emit_x86_instruction(compiler, 1, dst_r, 0, SLJIT_MEM1(src2), (sljit_s32)src1w);
1978 #else
1979 		if (src1 & SLJIT_IMM) {
1980 			inst = emit_x86_instruction(compiler, 1, dst_r, 0, SLJIT_MEM1(src2), src1w);
1981 #endif
1982 			FAIL_IF(!inst);
1983 			*inst = LEA_r_m;
1984 			done = 1;
1985 		}
1986 	}
1987 
1988 	if (done) {
1989 		if (dst_r == TMP_REG1)
1990 			return emit_mov(compiler, dst, dstw, TMP_REG1, 0);
1991 		return SLJIT_SUCCESS;
1992 	}
1993 	return SLJIT_ERR_UNSUPPORTED;
1994 }
1995 
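/* Destinationless subtraction: emits CMP, which only updates the flags.
   Used by sljit_emit_op2 when the result of a SLJIT_SUB is not stored. */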
1996 static sljit_s32 emit_cmp_binary(struct sljit_compiler *compiler,
1997 	sljit_s32 src1, sljit_sw src1w,
1998 	sljit_s32 src2, sljit_sw src2w)
1999 {
2000 	sljit_u8* inst;
2001 
2002 #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
2003 	if (src1 == SLJIT_R0 && (src2 & SLJIT_IMM) && (src2w > 127 || src2w < -128) && (compiler->mode32 || IS_HALFWORD(src2w))) {
2004 #else
2005 	if (src1 == SLJIT_R0 && (src2 & SLJIT_IMM) && (src2w > 127 || src2w < -128)) {
2006 #endif
2007 		BINARY_EAX_IMM(CMP_EAX_i32, src2w);
2008 		return SLJIT_SUCCESS;
2009 	}
2010 
2011 	if (FAST_IS_REG(src1)) {
2012 		if (src2 & SLJIT_IMM) {
2013 			BINARY_IMM(CMP, CMP_rm_r, src2w, src1, 0);
2014 		}
2015 		else {
2016 			inst = emit_x86_instruction(compiler, 1, src1, 0, src2, src2w);
2017 			FAIL_IF(!inst);
2018 			*inst = CMP_r_rm;
2019 		}
2020 		return SLJIT_SUCCESS;
2021 	}
2022 
2023 	if (FAST_IS_REG(src2) && !(src1 & SLJIT_IMM)) {
2024 		inst = emit_x86_instruction(compiler, 1, src2, 0, src1, src1w);
2025 		FAIL_IF(!inst);
2026 		*inst = CMP_rm_r;
2027 		return SLJIT_SUCCESS;
2028 	}
2029 
2030 	if (src2 & SLJIT_IMM) {
2031 		if (src1 & SLJIT_IMM) {
2032 			EMIT_MOV(compiler, TMP_REG1, 0, src1, src1w);
2033 			src1 = TMP_REG1;
2034 			src1w = 0;
2035 		}
2036 		BINARY_IMM(CMP, CMP_rm_r, src2w, src1, src1w);
2037 	}
2038 	else {
2039 		EMIT_MOV(compiler, TMP_REG1, 0, src1, src1w);
2040 		inst = emit_x86_instruction(compiler, 1, TMP_REG1, 0, src2, src2w);
2041 		FAIL_IF(!inst);
2042 		*inst = CMP_r_rm;
2043 	}
2044 	return SLJIT_SUCCESS;
2045 }
2046 
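/* Destinationless bitwise and: emits TEST, which only updates the flags.
   Used by sljit_emit_op2 when the result of a SLJIT_AND is not stored. */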
2047 static sljit_s32 emit_test_binary(struct sljit_compiler *compiler,
2048 	sljit_s32 src1, sljit_sw src1w,
2049 	sljit_s32 src2, sljit_sw src2w)
2050 {
2051 	sljit_u8* inst;
2052 
2053 #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
2054 	if (src1 == SLJIT_R0 && (src2 & SLJIT_IMM) && (src2w > 127 || src2w < -128) && (compiler->mode32 || IS_HALFWORD(src2w))) {
2055 #else
2056 	if (src1 == SLJIT_R0 && (src2 & SLJIT_IMM) && (src2w > 127 || src2w < -128)) {
2057 #endif
2058 		BINARY_EAX_IMM(TEST_EAX_i32, src2w);
2059 		return SLJIT_SUCCESS;
2060 	}
2061 
2062 #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
2063 	if (src2 == SLJIT_R0 && (src1 & SLJIT_IMM) && (src1w > 127 || src1w < -128) && (compiler->mode32 || IS_HALFWORD(src1w))) {
2064 #else
2065 	if (src2 == SLJIT_R0 && (src1 & SLJIT_IMM) && (src1w > 127 || src1w < -128)) {
2066 #endif
2067 		BINARY_EAX_IMM(TEST_EAX_i32, src1w);
2068 		return SLJIT_SUCCESS;
2069 	}
2070 
2071 	if (!(src1 & SLJIT_IMM)) {
2072 		if (src2 & SLJIT_IMM) {
2073 #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
2074 			if (IS_HALFWORD(src2w) || compiler->mode32) {
2075 				inst = emit_x86_instruction(compiler, 1, SLJIT_IMM, src2w, src1, src1w);
2076 				FAIL_IF(!inst);
2077 				*inst = GROUP_F7;
2078 			}
2079 			else {
2080 				FAIL_IF(emit_load_imm64(compiler, TMP_REG1, src2w));
2081 				inst = emit_x86_instruction(compiler, 1, TMP_REG1, 0, src1, src1w);
2082 				FAIL_IF(!inst);
2083 				*inst = TEST_rm_r;
2084 			}
2085 #else
2086 			inst = emit_x86_instruction(compiler, 1, SLJIT_IMM, src2w, src1, src1w);
2087 			FAIL_IF(!inst);
2088 			*inst = GROUP_F7;
2089 #endif
2090 			return SLJIT_SUCCESS;
2091 		}
2092 		else if (FAST_IS_REG(src1)) {
2093 			inst = emit_x86_instruction(compiler, 1, src1, 0, src2, src2w);
2094 			FAIL_IF(!inst);
2095 			*inst = TEST_rm_r;
2096 			return SLJIT_SUCCESS;
2097 		}
2098 	}
2099 
2100 	if (!(src2 & SLJIT_IMM)) {
2101 		if (src1 & SLJIT_IMM) {
2102 #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
2103 			if (IS_HALFWORD(src1w) || compiler->mode32) {
2104 				inst = emit_x86_instruction(compiler, 1, SLJIT_IMM, src1w, src2, src2w);
2105 				FAIL_IF(!inst);
2106 				*inst = GROUP_F7;
2107 			}
2108 			else {
2109 				FAIL_IF(emit_load_imm64(compiler, TMP_REG1, src1w));
2110 				inst = emit_x86_instruction(compiler, 1, TMP_REG1, 0, src2, src2w);
2111 				FAIL_IF(!inst);
2112 				*inst = TEST_rm_r;
2113 			}
2114 #else
2115 			inst = emit_x86_instruction(compiler, 1, src1, src1w, src2, src2w);
2116 			FAIL_IF(!inst);
2117 			*inst = GROUP_F7;
2118 #endif
2119 			return SLJIT_SUCCESS;
2120 		}
2121 		else if (FAST_IS_REG(src2)) {
2122 			inst = emit_x86_instruction(compiler, 1, src2, 0, src1, src1w);
2123 			FAIL_IF(!inst);
2124 			*inst = TEST_rm_r;
2125 			return SLJIT_SUCCESS;
2126 		}
2127 	}
2128 
2129 	EMIT_MOV(compiler, TMP_REG1, 0, src1, src1w);
2130 	if (src2 & SLJIT_IMM) {
2131 #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
2132 		if (IS_HALFWORD(src2w) || compiler->mode32) {
2133 			inst = emit_x86_instruction(compiler, 1, SLJIT_IMM, src2w, TMP_REG1, 0);
2134 			FAIL_IF(!inst);
2135 			*inst = GROUP_F7;
2136 		}
2137 		else {
2138 			FAIL_IF(emit_load_imm64(compiler, TMP_REG2, src2w));
2139 			inst = emit_x86_instruction(compiler, 1, TMP_REG2, 0, TMP_REG1, 0);
2140 			FAIL_IF(!inst);
2141 			*inst = TEST_rm_r;
2142 		}
2143 #else
2144 		inst = emit_x86_instruction(compiler, 1, SLJIT_IMM, src2w, TMP_REG1, 0);
2145 		FAIL_IF(!inst);
2146 		*inst = GROUP_F7;
2147 #endif
2148 	}
2149 	else {
2150 		inst = emit_x86_instruction(compiler, 1, TMP_REG1, 0, src2, src2w);
2151 		FAIL_IF(!inst);
2152 		*inst = TEST_rm_r;
2153 	}
2154 	return SLJIT_SUCCESS;
2155 }
2156 
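/* Variable shift counts must be in cl on x86, so when src2 is neither an
   immediate nor SLJIT_PREF_SHIFT_REG (ecx) the code below moves the count
   into ecx and restores the original ecx value afterwards. */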
2157 static sljit_s32 emit_shift(struct sljit_compiler *compiler,
2158 	sljit_u8 mode,
2159 	sljit_s32 dst, sljit_sw dstw,
2160 	sljit_s32 src1, sljit_sw src1w,
2161 	sljit_s32 src2, sljit_sw src2w)
2162 {
2163 	sljit_u8* inst;
2164 
2165 	if ((src2 & SLJIT_IMM) || (src2 == SLJIT_PREF_SHIFT_REG)) {
2166 		if (dst == src1 && dstw == src1w) {
2167 			inst = emit_x86_instruction(compiler, 1 | EX86_SHIFT_INS, src2, src2w, dst, dstw);
2168 			FAIL_IF(!inst);
2169 			*inst |= mode;
2170 			return SLJIT_SUCCESS;
2171 		}
2172 		if (dst == SLJIT_UNUSED) {
2173 			EMIT_MOV(compiler, TMP_REG1, 0, src1, src1w);
2174 			inst = emit_x86_instruction(compiler, 1 | EX86_SHIFT_INS, src2, src2w, TMP_REG1, 0);
2175 			FAIL_IF(!inst);
2176 			*inst |= mode;
2177 			return SLJIT_SUCCESS;
2178 		}
2179 		if (dst == SLJIT_PREF_SHIFT_REG && src2 == SLJIT_PREF_SHIFT_REG) {
2180 			EMIT_MOV(compiler, TMP_REG1, 0, src1, src1w);
2181 			inst = emit_x86_instruction(compiler, 1 | EX86_SHIFT_INS, SLJIT_PREF_SHIFT_REG, 0, TMP_REG1, 0);
2182 			FAIL_IF(!inst);
2183 			*inst |= mode;
2184 			EMIT_MOV(compiler, SLJIT_PREF_SHIFT_REG, 0, TMP_REG1, 0);
2185 			return SLJIT_SUCCESS;
2186 		}
2187 		if (FAST_IS_REG(dst)) {
2188 			EMIT_MOV(compiler, dst, 0, src1, src1w);
2189 			inst = emit_x86_instruction(compiler, 1 | EX86_SHIFT_INS, src2, src2w, dst, 0);
2190 			FAIL_IF(!inst);
2191 			*inst |= mode;
2192 			return SLJIT_SUCCESS;
2193 		}
2194 
2195 		EMIT_MOV(compiler, TMP_REG1, 0, src1, src1w);
2196 		inst = emit_x86_instruction(compiler, 1 | EX86_SHIFT_INS, src2, src2w, TMP_REG1, 0);
2197 		FAIL_IF(!inst);
2198 		*inst |= mode;
2199 		EMIT_MOV(compiler, dst, dstw, TMP_REG1, 0);
2200 		return SLJIT_SUCCESS;
2201 	}
2202 
2203 	if (dst == SLJIT_PREF_SHIFT_REG) {
2204 		EMIT_MOV(compiler, TMP_REG1, 0, src1, src1w);
2205 		EMIT_MOV(compiler, SLJIT_PREF_SHIFT_REG, 0, src2, src2w);
2206 		inst = emit_x86_instruction(compiler, 1 | EX86_SHIFT_INS, SLJIT_PREF_SHIFT_REG, 0, TMP_REG1, 0);
2207 		FAIL_IF(!inst);
2208 		*inst |= mode;
2209 		EMIT_MOV(compiler, SLJIT_PREF_SHIFT_REG, 0, TMP_REG1, 0);
2210 	}
2211 	else if (SLOW_IS_REG(dst) && dst != src2 && !ADDRESSING_DEPENDS_ON(src2, dst)) {
2212 		if (src1 != dst)
2213 			EMIT_MOV(compiler, dst, 0, src1, src1w);
2214 		EMIT_MOV(compiler, TMP_REG1, 0, SLJIT_PREF_SHIFT_REG, 0);
2215 		EMIT_MOV(compiler, SLJIT_PREF_SHIFT_REG, 0, src2, src2w);
2216 		inst = emit_x86_instruction(compiler, 1 | EX86_SHIFT_INS, SLJIT_PREF_SHIFT_REG, 0, dst, 0);
2217 		FAIL_IF(!inst);
2218 		*inst |= mode;
2219 		EMIT_MOV(compiler, SLJIT_PREF_SHIFT_REG, 0, TMP_REG1, 0);
2220 	}
2221 	else {
2222 		/* This case is complex since ecx itself may be used for
2223 		   addressing, and that case must be supported as well. */
2224 		EMIT_MOV(compiler, TMP_REG1, 0, src1, src1w);
2225 #if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
2226 		EMIT_MOV(compiler, SLJIT_MEM1(SLJIT_SP), 0, SLJIT_PREF_SHIFT_REG, 0);
2227 		EMIT_MOV(compiler, SLJIT_PREF_SHIFT_REG, 0, src2, src2w);
2228 		inst = emit_x86_instruction(compiler, 1 | EX86_SHIFT_INS, SLJIT_PREF_SHIFT_REG, 0, TMP_REG1, 0);
2229 		FAIL_IF(!inst);
2230 		*inst |= mode;
2231 		EMIT_MOV(compiler, SLJIT_PREF_SHIFT_REG, 0, SLJIT_MEM1(SLJIT_SP), 0);
2232 #else
2233 		EMIT_MOV(compiler, TMP_REG2, 0, SLJIT_PREF_SHIFT_REG, 0);
2234 		EMIT_MOV(compiler, SLJIT_PREF_SHIFT_REG, 0, src2, src2w);
2235 		inst = emit_x86_instruction(compiler, 1 | EX86_SHIFT_INS, SLJIT_PREF_SHIFT_REG, 0, TMP_REG1, 0);
2236 		FAIL_IF(!inst);
2237 		*inst |= mode;
2238 		EMIT_MOV(compiler, SLJIT_PREF_SHIFT_REG, 0, TMP_REG2, 0);
2239 #endif
2240 		if (dst != SLJIT_UNUSED)
2241 			return emit_mov(compiler, dst, dstw, TMP_REG1, 0);
2242 	}
2243 
2244 	return SLJIT_SUCCESS;
2245 }
2246 
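/* Shift that also has to produce the status flags. The x86 shift
   instructions leave the flags unchanged when the count is zero, so a zero
   immediate count is emitted as a move (or as an OR with zero when the
   flags are required), and variable counts are paired with an explicit
   compare against zero. */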
2247 static sljit_s32 emit_shift_with_flags(struct sljit_compiler *compiler,
2248 	sljit_u8 mode, sljit_s32 set_flags,
2249 	sljit_s32 dst, sljit_sw dstw,
2250 	sljit_s32 src1, sljit_sw src1w,
2251 	sljit_s32 src2, sljit_sw src2w)
2252 {
2253 	/* The CPU does not set flags if the shift count is 0. */
2254 	if (src2 & SLJIT_IMM) {
2255 #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
2256 		if ((src2w & 0x3f) != 0 || (compiler->mode32 && (src2w & 0x1f) != 0))
2257 			return emit_shift(compiler, mode, dst, dstw, src1, src1w, src2, src2w);
2258 #else
2259 		if ((src2w & 0x1f) != 0)
2260 			return emit_shift(compiler, mode, dst, dstw, src1, src1w, src2, src2w);
2261 #endif
2262 		if (!set_flags)
2263 			return emit_mov(compiler, dst, dstw, src1, src1w);
2264 		/* OR dst, src, 0 */
2265 		return emit_cum_binary(compiler, BINARY_OPCODE(OR),
2266 			dst, dstw, src1, src1w, SLJIT_IMM, 0);
2267 	}
2268 
2269 	if (!set_flags)
2270 		return emit_shift(compiler, mode, dst, dstw, src1, src1w, src2, src2w);
2271 
2272 	if (!FAST_IS_REG(dst))
2273 		FAIL_IF(emit_cmp_binary(compiler, src1, src1w, SLJIT_IMM, 0));
2274 
2275 	FAIL_IF(emit_shift(compiler, mode, dst, dstw, src1, src1w, src2, src2w));
2276 
2277 	if (FAST_IS_REG(dst))
2278 		return emit_cmp_binary(compiler, (dst == SLJIT_UNUSED) ? TMP_REG1 : dst, dstw, SLJIT_IMM, 0);
2279 	return SLJIT_SUCCESS;
2280 }
2281 
2282 SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_op2(struct sljit_compiler *compiler, sljit_s32 op,
2283 	sljit_s32 dst, sljit_sw dstw,
2284 	sljit_s32 src1, sljit_sw src1w,
2285 	sljit_s32 src2, sljit_sw src2w)
2286 {
2287 	CHECK_ERROR();
2288 	CHECK(check_sljit_emit_op2(compiler, op, dst, dstw, src1, src1w, src2, src2w));
2289 	ADJUST_LOCAL_OFFSET(dst, dstw);
2290 	ADJUST_LOCAL_OFFSET(src1, src1w);
2291 	ADJUST_LOCAL_OFFSET(src2, src2w);
2292 
2293 	CHECK_EXTRA_REGS(dst, dstw, (void)0);
2294 	CHECK_EXTRA_REGS(src1, src1w, (void)0);
2295 	CHECK_EXTRA_REGS(src2, src2w, (void)0);
2296 #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
2297 	compiler->mode32 = op & SLJIT_I32_OP;
2298 #endif
2299 
2300 	if (dst == SLJIT_UNUSED && !HAS_FLAGS(op))
2301 		return SLJIT_SUCCESS;
2302 
2303 	switch (GET_OPCODE(op)) {
2304 	case SLJIT_ADD:
2305 		if (!HAS_FLAGS(op)) {
2306 			if (emit_lea_binary(compiler, dst, dstw, src1, src1w, src2, src2w) != SLJIT_ERR_UNSUPPORTED)
2307 				return compiler->error;
2308 		}
2309 		return emit_cum_binary(compiler, BINARY_OPCODE(ADD),
2310 			dst, dstw, src1, src1w, src2, src2w);
2311 	case SLJIT_ADDC:
2312 		return emit_cum_binary(compiler, BINARY_OPCODE(ADC),
2313 			dst, dstw, src1, src1w, src2, src2w);
2314 	case SLJIT_SUB:
2315 		if (!HAS_FLAGS(op)) {
2316 			if ((src2 & SLJIT_IMM) && emit_lea_binary(compiler, dst, dstw, src1, src1w, SLJIT_IMM, -src2w) != SLJIT_ERR_UNSUPPORTED)
2317 				return compiler->error;
2318 			if (SLOW_IS_REG(dst) && src2 == dst) {
2319 				FAIL_IF(emit_non_cum_binary(compiler, BINARY_OPCODE(SUB), dst, 0, dst, 0, src1, src1w));
2320 				return emit_unary(compiler, NEG_rm, dst, 0, dst, 0);
2321 			}
2322 		}
2323 
2324 		if (dst == SLJIT_UNUSED)
2325 			return emit_cmp_binary(compiler, src1, src1w, src2, src2w);
2326 		return emit_non_cum_binary(compiler, BINARY_OPCODE(SUB),
2327 			dst, dstw, src1, src1w, src2, src2w);
2328 	case SLJIT_SUBC:
2329 		return emit_non_cum_binary(compiler, BINARY_OPCODE(SBB),
2330 			dst, dstw, src1, src1w, src2, src2w);
2331 	case SLJIT_MUL:
2332 		return emit_mul(compiler, dst, dstw, src1, src1w, src2, src2w);
2333 	case SLJIT_AND:
2334 		if (dst == SLJIT_UNUSED)
2335 			return emit_test_binary(compiler, src1, src1w, src2, src2w);
2336 		return emit_cum_binary(compiler, BINARY_OPCODE(AND),
2337 			dst, dstw, src1, src1w, src2, src2w);
2338 	case SLJIT_OR:
2339 		return emit_cum_binary(compiler, BINARY_OPCODE(OR),
2340 			dst, dstw, src1, src1w, src2, src2w);
2341 	case SLJIT_XOR:
2342 		return emit_cum_binary(compiler, BINARY_OPCODE(XOR),
2343 			dst, dstw, src1, src1w, src2, src2w);
2344 	case SLJIT_SHL:
2345 		return emit_shift_with_flags(compiler, SHL, HAS_FLAGS(op),
2346 			dst, dstw, src1, src1w, src2, src2w);
2347 	case SLJIT_LSHR:
2348 		return emit_shift_with_flags(compiler, SHR, HAS_FLAGS(op),
2349 			dst, dstw, src1, src1w, src2, src2w);
2350 	case SLJIT_ASHR:
2351 		return emit_shift_with_flags(compiler, SAR, HAS_FLAGS(op),
2352 			dst, dstw, src1, src1w, src2, src2w);
2353 	}
2354 
2355 	return SLJIT_SUCCESS;
2356 }
2357 
2358 SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_op_src(struct sljit_compiler *compiler, sljit_s32 op,
2359 	sljit_s32 src, sljit_sw srcw)
2360 {
2361 	CHECK_ERROR();
2362 	CHECK(check_sljit_emit_op_src(compiler, op, src, srcw));
2363 	ADJUST_LOCAL_OFFSET(src, srcw);
2364 
2365 	CHECK_EXTRA_REGS(src, srcw, (void)0);
2366 
2367 	switch (op) {
2368 	case SLJIT_FAST_RETURN:
2369 		return emit_fast_return(compiler, src, srcw);
2370 	case SLJIT_SKIP_FRAMES_BEFORE_FAST_RETURN:
2371 		/* Don't adjust shadow stack if it isn't enabled.  */
2372 		if (!cpu_has_shadow_stack ())
2373 			return SLJIT_SUCCESS;
2374 		return adjust_shadow_stack(compiler, src, srcw, SLJIT_UNUSED, 0);
2375 	case SLJIT_PREFETCH_L1:
2376 	case SLJIT_PREFETCH_L2:
2377 	case SLJIT_PREFETCH_L3:
2378 	case SLJIT_PREFETCH_ONCE:
2379 		return emit_prefetch(compiler, op, src, srcw);
2380 	}
2381 
2382 	return SLJIT_SUCCESS;
2383 }
2384 
2385 SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_get_register_index(sljit_s32 reg)
2386 {
2387 	CHECK_REG_INDEX(check_sljit_get_register_index(reg));
2388 #if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
2389 	if (reg >= SLJIT_R3 && reg <= SLJIT_R8)
2390 		return -1;
2391 #endif
2392 	return reg_map[reg];
2393 }
2394 
2395 SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_get_float_register_index(sljit_s32 reg)
2396 {
2397 	CHECK_REG_INDEX(check_sljit_get_float_register_index(reg));
2398 #if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
2399 	return reg;
2400 #else
2401 	return freg_map[reg];
2402 #endif
2403 }
2404 
2405 SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_op_custom(struct sljit_compiler *compiler,
2406 	void *instruction, sljit_s32 size)
2407 {
2408 	sljit_u8 *inst;
2409 
2410 	CHECK_ERROR();
2411 	CHECK(check_sljit_emit_op_custom(compiler, instruction, size));
2412 
2413 	inst = (sljit_u8*)ensure_buf(compiler, 1 + size);
2414 	FAIL_IF(!inst);
2415 	INC_SIZE(size);
2416 	SLJIT_MEMCPY(inst, instruction, size);
2417 	return SLJIT_SUCCESS;
2418 }
2419 
2420 /* --------------------------------------------------------------------- */
2421 /*  Floating point operators                                             */
2422 /* --------------------------------------------------------------------- */
2423 
2424 /* Alignment (3 words) + 4 constants * 16 bytes. */
2425 static sljit_s32 sse2_data[3 + (4 * 4)];
2426 static sljit_s32 *sse2_buffer;
2427 
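/* init_compiler fills the aligned buffer with four 16 byte constants: the
   single and double precision sign bit masks used by SLJIT_NEG_F64 (XORPD)
   and the inverted masks used by SLJIT_ABS_F64 (ANDPD). */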
2428 static void init_compiler(void)
2429 {
2430 	/* Align to 16 bytes. */
2431 	sse2_buffer = (sljit_s32*)(((sljit_uw)sse2_data + 15) & ~0xf);
2432 
2433 	/* Single precision constants (each constant is 16 bytes long). */
2434 	sse2_buffer[0] = 0x80000000;
2435 	sse2_buffer[4] = 0x7fffffff;
2436 	/* Double precision constants (each constant is 16 bytes long). */
2437 	sse2_buffer[8] = 0;
2438 	sse2_buffer[9] = 0x80000000;
2439 	sse2_buffer[12] = 0xffffffff;
2440 	sse2_buffer[13] = 0x7fffffff;
2441 }
2442 
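/* Scalar SSE2 arithmetic: the F3 prefix selects the single precision (ss)
   form and the F2 prefix the double precision (sd) form of the 0F opcode.
   The emit_sse2_logic helper below uses the 66 prefix (double precision
   packed forms) or no prefix (single precision packed forms) instead. */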
2443 static sljit_s32 emit_sse2(struct sljit_compiler *compiler, sljit_u8 opcode,
2444 	sljit_s32 single, sljit_s32 xmm1, sljit_s32 xmm2, sljit_sw xmm2w)
2445 {
2446 	sljit_u8 *inst;
2447 
2448 	inst = emit_x86_instruction(compiler, 2 | (single ? EX86_PREF_F3 : EX86_PREF_F2) | EX86_SSE2, xmm1, 0, xmm2, xmm2w);
2449 	FAIL_IF(!inst);
2450 	*inst++ = GROUP_0F;
2451 	*inst = opcode;
2452 	return SLJIT_SUCCESS;
2453 }
2454 
2455 static sljit_s32 emit_sse2_logic(struct sljit_compiler *compiler, sljit_u8 opcode,
2456 	sljit_s32 pref66, sljit_s32 xmm1, sljit_s32 xmm2, sljit_sw xmm2w)
2457 {
2458 	sljit_u8 *inst;
2459 
2460 	inst = emit_x86_instruction(compiler, 2 | (pref66 ? EX86_PREF_66 : 0) | EX86_SSE2, xmm1, 0, xmm2, xmm2w);
2461 	FAIL_IF(!inst);
2462 	*inst++ = GROUP_0F;
2463 	*inst = opcode;
2464 	return SLJIT_SUCCESS;
2465 }
2466 
2467 static SLJIT_INLINE sljit_s32 emit_sse2_load(struct sljit_compiler *compiler,
2468 	sljit_s32 single, sljit_s32 dst, sljit_s32 src, sljit_sw srcw)
2469 {
2470 	return emit_sse2(compiler, MOVSD_x_xm, single, dst, src, srcw);
2471 }
2472 
2473 static SLJIT_INLINE sljit_s32 emit_sse2_store(struct sljit_compiler *compiler,
2474 	sljit_s32 single, sljit_s32 dst, sljit_sw dstw, sljit_s32 src)
2475 {
2476 	return emit_sse2(compiler, MOVSD_xm_x, single, src, dst, dstw);
2477 }
2478 
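/* Float to integer conversion uses CVTTSD2SI/CVTTSS2SI, which truncate
   towards zero. */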
2479 static SLJIT_INLINE sljit_s32 sljit_emit_fop1_conv_sw_from_f64(struct sljit_compiler *compiler, sljit_s32 op,
2480 	sljit_s32 dst, sljit_sw dstw,
2481 	sljit_s32 src, sljit_sw srcw)
2482 {
2483 	sljit_s32 dst_r = FAST_IS_REG(dst) ? dst : TMP_REG1;
2484 	sljit_u8 *inst;
2485 
2486 #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
2487 	if (GET_OPCODE(op) == SLJIT_CONV_SW_FROM_F64)
2488 		compiler->mode32 = 0;
2489 #endif
2490 
2491 	inst = emit_x86_instruction(compiler, 2 | ((op & SLJIT_F32_OP) ? EX86_PREF_F3 : EX86_PREF_F2) | EX86_SSE2_OP2, dst_r, 0, src, srcw);
2492 	FAIL_IF(!inst);
2493 	*inst++ = GROUP_0F;
2494 	*inst = CVTTSD2SI_r_xm;
2495 
2496 	if (dst & SLJIT_MEM)
2497 		return emit_mov(compiler, dst, dstw, TMP_REG1, 0);
2498 	return SLJIT_SUCCESS;
2499 }
2500 
2501 static SLJIT_INLINE sljit_s32 sljit_emit_fop1_conv_f64_from_sw(struct sljit_compiler *compiler, sljit_s32 op,
2502 	sljit_s32 dst, sljit_sw dstw,
2503 	sljit_s32 src, sljit_sw srcw)
2504 {
2505 	sljit_s32 dst_r = FAST_IS_REG(dst) ? dst : TMP_FREG;
2506 	sljit_u8 *inst;
2507 
2508 #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
2509 	if (GET_OPCODE(op) == SLJIT_CONV_F64_FROM_SW)
2510 		compiler->mode32 = 0;
2511 #endif
2512 
2513 	if (src & SLJIT_IMM) {
2514 #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
2515 		if (GET_OPCODE(op) == SLJIT_CONV_F64_FROM_S32)
2516 			srcw = (sljit_s32)srcw;
2517 #endif
2518 		EMIT_MOV(compiler, TMP_REG1, 0, src, srcw);
2519 		src = TMP_REG1;
2520 		srcw = 0;
2521 	}
2522 
2523 	inst = emit_x86_instruction(compiler, 2 | ((op & SLJIT_F32_OP) ? EX86_PREF_F3 : EX86_PREF_F2) | EX86_SSE2_OP1, dst_r, 0, src, srcw);
2524 	FAIL_IF(!inst);
2525 	*inst++ = GROUP_0F;
2526 	*inst = CVTSI2SD_x_rm;
2527 
2528 #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
2529 	compiler->mode32 = 1;
2530 #endif
2531 	if (dst_r == TMP_FREG)
2532 		return emit_sse2_store(compiler, op & SLJIT_F32_OP, dst, dstw, TMP_FREG);
2533 	return SLJIT_SUCCESS;
2534 }
2535 
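/* Floating point compare: UCOMISD/UCOMISS sets ZF, PF and CF according to
   the comparison result. */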
2536 static SLJIT_INLINE sljit_s32 sljit_emit_fop1_cmp(struct sljit_compiler *compiler, sljit_s32 op,
2537 	sljit_s32 src1, sljit_sw src1w,
2538 	sljit_s32 src2, sljit_sw src2w)
2539 {
2540 	if (!FAST_IS_REG(src1)) {
2541 		FAIL_IF(emit_sse2_load(compiler, op & SLJIT_F32_OP, TMP_FREG, src1, src1w));
2542 		src1 = TMP_FREG;
2543 	}
2544 
2545 	return emit_sse2_logic(compiler, UCOMISD_x_xm, !(op & SLJIT_F32_OP), src1, src2, src2w);
2546 }
2547 
2548 SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_fop1(struct sljit_compiler *compiler, sljit_s32 op,
2549 	sljit_s32 dst, sljit_sw dstw,
2550 	sljit_s32 src, sljit_sw srcw)
2551 {
2552 	sljit_s32 dst_r;
2553 
2554 #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
2555 	compiler->mode32 = 1;
2556 #endif
2557 
2558 	CHECK_ERROR();
2559 	SELECT_FOP1_OPERATION_WITH_CHECKS(compiler, op, dst, dstw, src, srcw);
2560 
2561 	if (GET_OPCODE(op) == SLJIT_MOV_F64) {
2562 		if (FAST_IS_REG(dst))
2563 			return emit_sse2_load(compiler, op & SLJIT_F32_OP, dst, src, srcw);
2564 		if (FAST_IS_REG(src))
2565 			return emit_sse2_store(compiler, op & SLJIT_F32_OP, dst, dstw, src);
2566 		FAIL_IF(emit_sse2_load(compiler, op & SLJIT_F32_OP, TMP_FREG, src, srcw));
2567 		return emit_sse2_store(compiler, op & SLJIT_F32_OP, dst, dstw, TMP_FREG);
2568 	}
2569 
2570 	if (GET_OPCODE(op) == SLJIT_CONV_F64_FROM_F32) {
2571 		dst_r = FAST_IS_REG(dst) ? dst : TMP_FREG;
2572 		if (FAST_IS_REG(src)) {
2573 			/* We overwrite the high bits of the source. From the SLJIT point of view,
2574 			   this is not an issue.
2575 			   Note: In SSE3, we could also use MOVDDUP and MOVSLDUP. */
2576 			FAIL_IF(emit_sse2_logic(compiler, UNPCKLPD_x_xm, op & SLJIT_F32_OP, src, src, 0));
2577 		}
2578 		else {
2579 			FAIL_IF(emit_sse2_load(compiler, !(op & SLJIT_F32_OP), TMP_FREG, src, srcw));
2580 			src = TMP_FREG;
2581 		}
2582 
2583 		FAIL_IF(emit_sse2_logic(compiler, CVTPD2PS_x_xm, op & SLJIT_F32_OP, dst_r, src, 0));
2584 		if (dst_r == TMP_FREG)
2585 			return emit_sse2_store(compiler, op & SLJIT_F32_OP, dst, dstw, TMP_FREG);
2586 		return SLJIT_SUCCESS;
2587 	}
2588 
2589 	if (FAST_IS_REG(dst)) {
2590 		dst_r = dst;
2591 		if (dst != src)
2592 			FAIL_IF(emit_sse2_load(compiler, op & SLJIT_F32_OP, dst_r, src, srcw));
2593 	}
2594 	else {
2595 		dst_r = TMP_FREG;
2596 		FAIL_IF(emit_sse2_load(compiler, op & SLJIT_F32_OP, dst_r, src, srcw));
2597 	}
2598 
2599 	switch (GET_OPCODE(op)) {
2600 	case SLJIT_NEG_F64:
2601 		FAIL_IF(emit_sse2_logic(compiler, XORPD_x_xm, 1, dst_r, SLJIT_MEM0(), (sljit_sw)(op & SLJIT_F32_OP ? sse2_buffer : sse2_buffer + 8)));
2602 		break;
2603 
2604 	case SLJIT_ABS_F64:
2605 		FAIL_IF(emit_sse2_logic(compiler, ANDPD_x_xm, 1, dst_r, SLJIT_MEM0(), (sljit_sw)(op & SLJIT_F32_OP ? sse2_buffer + 4 : sse2_buffer + 12)));
2606 		break;
2607 	}
2608 
2609 	if (dst_r == TMP_FREG)
2610 		return emit_sse2_store(compiler, op & SLJIT_F32_OP, dst, dstw, TMP_FREG);
2611 	return SLJIT_SUCCESS;
2612 }
2613 
2614 SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_fop2(struct sljit_compiler *compiler, sljit_s32 op,
2615 	sljit_s32 dst, sljit_sw dstw,
2616 	sljit_s32 src1, sljit_sw src1w,
2617 	sljit_s32 src2, sljit_sw src2w)
2618 {
2619 	sljit_s32 dst_r;
2620 
2621 	CHECK_ERROR();
2622 	CHECK(check_sljit_emit_fop2(compiler, op, dst, dstw, src1, src1w, src2, src2w));
2623 	ADJUST_LOCAL_OFFSET(dst, dstw);
2624 	ADJUST_LOCAL_OFFSET(src1, src1w);
2625 	ADJUST_LOCAL_OFFSET(src2, src2w);
2626 
2627 #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
2628 	compiler->mode32 = 1;
2629 #endif
2630 
2631 	if (FAST_IS_REG(dst)) {
2632 		dst_r = dst;
2633 		if (dst == src1)
2634 			; /* Do nothing here. */
2635 		else if (dst == src2 && (op == SLJIT_ADD_F64 || op == SLJIT_MUL_F64)) {
2636 			/* Swap arguments. */
2637 			src2 = src1;
2638 			src2w = src1w;
2639 		}
2640 		else if (dst != src2)
2641 			FAIL_IF(emit_sse2_load(compiler, op & SLJIT_F32_OP, dst_r, src1, src1w));
2642 		else {
2643 			dst_r = TMP_FREG;
2644 			FAIL_IF(emit_sse2_load(compiler, op & SLJIT_F32_OP, TMP_FREG, src1, src1w));
2645 		}
2646 	}
2647 	else {
2648 		dst_r = TMP_FREG;
2649 		FAIL_IF(emit_sse2_load(compiler, op & SLJIT_F32_OP, TMP_FREG, src1, src1w));
2650 	}
2651 
2652 	switch (GET_OPCODE(op)) {
2653 	case SLJIT_ADD_F64:
2654 		FAIL_IF(emit_sse2(compiler, ADDSD_x_xm, op & SLJIT_F32_OP, dst_r, src2, src2w));
2655 		break;
2656 
2657 	case SLJIT_SUB_F64:
2658 		FAIL_IF(emit_sse2(compiler, SUBSD_x_xm, op & SLJIT_F32_OP, dst_r, src2, src2w));
2659 		break;
2660 
2661 	case SLJIT_MUL_F64:
2662 		FAIL_IF(emit_sse2(compiler, MULSD_x_xm, op & SLJIT_F32_OP, dst_r, src2, src2w));
2663 		break;
2664 
2665 	case SLJIT_DIV_F64:
2666 		FAIL_IF(emit_sse2(compiler, DIVSD_x_xm, op & SLJIT_F32_OP, dst_r, src2, src2w));
2667 		break;
2668 	}
2669 
2670 	if (dst_r == TMP_FREG)
2671 		return emit_sse2_store(compiler, op & SLJIT_F32_OP, dst, dstw, TMP_FREG);
2672 	return SLJIT_SUCCESS;
2673 }
2674 
2675 /* --------------------------------------------------------------------- */
2676 /*  Conditional instructions                                             */
2677 /* --------------------------------------------------------------------- */
2678 
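/* Note: the two byte 0, n sequences emitted into the buffer below are zero
   length marker records, not instructions: 0 marks a label, 1 a jump, 2 a
   constant and 3 a put_label. They are resolved later by
   sljit_generate_code. */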
2679 SLJIT_API_FUNC_ATTRIBUTE struct sljit_label* sljit_emit_label(struct sljit_compiler *compiler)
2680 {
2681 	sljit_u8 *inst;
2682 	struct sljit_label *label;
2683 
2684 	CHECK_ERROR_PTR();
2685 	CHECK_PTR(check_sljit_emit_label(compiler));
2686 
2687 	if (compiler->last_label && compiler->last_label->size == compiler->size)
2688 		return compiler->last_label;
2689 
2690 	label = (struct sljit_label*)ensure_abuf(compiler, sizeof(struct sljit_label));
2691 	PTR_FAIL_IF(!label);
2692 	set_label(label, compiler);
2693 
2694 	inst = (sljit_u8*)ensure_buf(compiler, 2);
2695 	PTR_FAIL_IF(!inst);
2696 
2697 	*inst++ = 0;
2698 	*inst++ = 0;
2699 
2700 	return label;
2701 }
2702 
2703 SLJIT_API_FUNC_ATTRIBUTE struct sljit_jump* sljit_emit_jump(struct sljit_compiler *compiler, sljit_s32 type)
2704 {
2705 	sljit_u8 *inst;
2706 	struct sljit_jump *jump;
2707 
2708 	CHECK_ERROR_PTR();
2709 	CHECK_PTR(check_sljit_emit_jump(compiler, type));
2710 
2711 	jump = (struct sljit_jump*)ensure_abuf(compiler, sizeof(struct sljit_jump));
2712 	PTR_FAIL_IF_NULL(jump);
2713 	set_jump(jump, compiler, (type & SLJIT_REWRITABLE_JUMP) | ((type & 0xff) << TYPE_SHIFT));
2714 	type &= 0xff;
2715 
2716 	/* Worst case size. */
2717 #if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
2718 	compiler->size += (type >= SLJIT_JUMP) ? 5 : 6;
2719 #else
2720 	compiler->size += (type >= SLJIT_JUMP) ? (10 + 3) : (2 + 10 + 3);
2721 #endif
2722 
2723 	inst = (sljit_u8*)ensure_buf(compiler, 2);
2724 	PTR_FAIL_IF_NULL(inst);
2725 
2726 	*inst++ = 0;
2727 	*inst++ = 1;
2728 	return jump;
2729 }
2730 
2731 SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_ijump(struct sljit_compiler *compiler, sljit_s32 type, sljit_s32 src, sljit_sw srcw)
2732 {
2733 	sljit_u8 *inst;
2734 	struct sljit_jump *jump;
2735 
2736 	CHECK_ERROR();
2737 	CHECK(check_sljit_emit_ijump(compiler, type, src, srcw));
2738 	ADJUST_LOCAL_OFFSET(src, srcw);
2739 
2740 	CHECK_EXTRA_REGS(src, srcw, (void)0);
2741 
2742 	if (src == SLJIT_IMM) {
2743 		jump = (struct sljit_jump*)ensure_abuf(compiler, sizeof(struct sljit_jump));
2744 		FAIL_IF_NULL(jump);
2745 		set_jump(jump, compiler, JUMP_ADDR | (type << TYPE_SHIFT));
2746 		jump->u.target = srcw;
2747 
2748 		/* Worst case size. */
2749 #if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
2750 		compiler->size += 5;
2751 #else
2752 		compiler->size += 10 + 3;
2753 #endif
2754 
2755 		inst = (sljit_u8*)ensure_buf(compiler, 2);
2756 		FAIL_IF_NULL(inst);
2757 
2758 		*inst++ = 0;
2759 		*inst++ = 1;
2760 	}
2761 	else {
2762 #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
2763 		/* REX_W is not necessary (src is not immediate). */
2764 		compiler->mode32 = 1;
2765 #endif
2766 		inst = emit_x86_instruction(compiler, 1, 0, 0, src, srcw);
2767 		FAIL_IF(!inst);
2768 		*inst++ = GROUP_FF;
2769 		*inst |= (type >= SLJIT_FAST_CALL) ? CALL_rm : JMP_rm;
2770 	}
2771 	return SLJIT_SUCCESS;
2772 }
2773 
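/* Materialize a condition flag as a 0/1 value: SETcc into a byte register
   followed by MOVZX (or an 8 bit OR for the SLJIT_OR fast path). On x86-32
   not every register has an addressable low byte (see the reg_map[dst] <= 4
   checks), so the remaining cases fall back to CMOV or go through eax
   using XCHG. */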
2774 SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_op_flags(struct sljit_compiler *compiler, sljit_s32 op,
2775 	sljit_s32 dst, sljit_sw dstw,
2776 	sljit_s32 type)
2777 {
2778 	sljit_u8 *inst;
2779 	sljit_u8 cond_set = 0;
2780 #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
2781 	sljit_s32 reg;
2782 #endif
2783 	/* ADJUST_LOCAL_OFFSET and CHECK_EXTRA_REGS might overwrite these values. */
2784 	sljit_s32 dst_save = dst;
2785 	sljit_sw dstw_save = dstw;
2786 
2787 	CHECK_ERROR();
2788 	CHECK(check_sljit_emit_op_flags(compiler, op, dst, dstw, type));
2789 
2790 	ADJUST_LOCAL_OFFSET(dst, dstw);
2791 	CHECK_EXTRA_REGS(dst, dstw, (void)0);
2792 
2793 	type &= 0xff;
2794 	/* setcc = jcc + 0x10. */
2795 	cond_set = get_jump_code(type) + 0x10;
2796 
2797 #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
2798 	if (GET_OPCODE(op) == SLJIT_OR && !GET_ALL_FLAGS(op) && FAST_IS_REG(dst)) {
2799 		inst = (sljit_u8*)ensure_buf(compiler, 1 + 4 + 3);
2800 		FAIL_IF(!inst);
2801 		INC_SIZE(4 + 3);
2802 		/* Set low register to conditional flag. */
2803 		*inst++ = (reg_map[TMP_REG1] <= 7) ? REX : REX_B;
2804 		*inst++ = GROUP_0F;
2805 		*inst++ = cond_set;
2806 		*inst++ = MOD_REG | reg_lmap[TMP_REG1];
2807 		*inst++ = REX | (reg_map[TMP_REG1] <= 7 ? 0 : REX_R) | (reg_map[dst] <= 7 ? 0 : REX_B);
2808 		*inst++ = OR_rm8_r8;
2809 		*inst++ = MOD_REG | (reg_lmap[TMP_REG1] << 3) | reg_lmap[dst];
2810 		return SLJIT_SUCCESS;
2811 	}
2812 
2813 	reg = (GET_OPCODE(op) < SLJIT_ADD && FAST_IS_REG(dst)) ? dst : TMP_REG1;
2814 
2815 	inst = (sljit_u8*)ensure_buf(compiler, 1 + 4 + 4);
2816 	FAIL_IF(!inst);
2817 	INC_SIZE(4 + 4);
2818 	/* Set low register to conditional flag. */
2819 	*inst++ = (reg_map[reg] <= 7) ? REX : REX_B;
2820 	*inst++ = GROUP_0F;
2821 	*inst++ = cond_set;
2822 	*inst++ = MOD_REG | reg_lmap[reg];
2823 	*inst++ = REX_W | (reg_map[reg] <= 7 ? 0 : (REX_B | REX_R));
2824 	/* The movzx instruction does not affect flags. */
2825 	*inst++ = GROUP_0F;
2826 	*inst++ = MOVZX_r_rm8;
2827 	*inst = MOD_REG | (reg_lmap[reg] << 3) | reg_lmap[reg];
2828 
2829 	if (reg != TMP_REG1)
2830 		return SLJIT_SUCCESS;
2831 
2832 	if (GET_OPCODE(op) < SLJIT_ADD) {
2833 		compiler->mode32 = GET_OPCODE(op) != SLJIT_MOV;
2834 		return emit_mov(compiler, dst, dstw, TMP_REG1, 0);
2835 	}
2836 
2837 #if (defined SLJIT_VERBOSE && SLJIT_VERBOSE) \
2838 		|| (defined SLJIT_ARGUMENT_CHECKS && SLJIT_ARGUMENT_CHECKS)
2839 	compiler->skip_checks = 1;
2840 #endif
2841 	return sljit_emit_op2(compiler, op, dst_save, dstw_save, dst_save, dstw_save, TMP_REG1, 0);
2842 
2843 #else
2844 	/* The SLJIT_CONFIG_X86_32 code path starts here. */
2845 	if (GET_OPCODE(op) < SLJIT_ADD && FAST_IS_REG(dst)) {
2846 		if (reg_map[dst] <= 4) {
2847 			/* Low byte is accessible. */
2848 			inst = (sljit_u8*)ensure_buf(compiler, 1 + 3 + 3);
2849 			FAIL_IF(!inst);
2850 			INC_SIZE(3 + 3);
2851 			/* Set low byte to conditional flag. */
2852 			*inst++ = GROUP_0F;
2853 			*inst++ = cond_set;
2854 			*inst++ = MOD_REG | reg_map[dst];
2855 
2856 			*inst++ = GROUP_0F;
2857 			*inst++ = MOVZX_r_rm8;
2858 			*inst = MOD_REG | (reg_map[dst] << 3) | reg_map[dst];
2859 			return SLJIT_SUCCESS;
2860 		}
2861 
2862 		/* Low byte is not accessible. */
2863 		if (cpu_has_cmov == -1)
2864 			get_cpu_features();
2865 
2866 		if (cpu_has_cmov) {
2867 			EMIT_MOV(compiler, TMP_REG1, 0, SLJIT_IMM, 1);
2868 			/* An xor reg, reg operation would overwrite the flags. */
2869 			EMIT_MOV(compiler, dst, 0, SLJIT_IMM, 0);
2870 
2871 			inst = (sljit_u8*)ensure_buf(compiler, 1 + 3);
2872 			FAIL_IF(!inst);
2873 			INC_SIZE(3);
2874 
2875 			*inst++ = GROUP_0F;
2876 			/* cmovcc = setcc - 0x50. */
2877 			*inst++ = cond_set - 0x50;
2878 			*inst++ = MOD_REG | (reg_map[dst] << 3) | reg_map[TMP_REG1];
2879 			return SLJIT_SUCCESS;
2880 		}
2881 
2882 		inst = (sljit_u8*)ensure_buf(compiler, 1 + 1 + 3 + 3 + 1);
2883 		FAIL_IF(!inst);
2884 		INC_SIZE(1 + 3 + 3 + 1);
2885 		*inst++ = XCHG_EAX_r + reg_map[TMP_REG1];
2886 		/* Set al to conditional flag. */
2887 		*inst++ = GROUP_0F;
2888 		*inst++ = cond_set;
2889 		*inst++ = MOD_REG | 0 /* eax */;
2890 
2891 		*inst++ = GROUP_0F;
2892 		*inst++ = MOVZX_r_rm8;
2893 		*inst++ = MOD_REG | (reg_map[dst] << 3) | 0 /* eax */;
2894 		*inst++ = XCHG_EAX_r + reg_map[TMP_REG1];
2895 		return SLJIT_SUCCESS;
2896 	}
2897 
2898 	if (GET_OPCODE(op) == SLJIT_OR && !GET_ALL_FLAGS(op) && FAST_IS_REG(dst) && reg_map[dst] <= 4) {
2899 		SLJIT_ASSERT(reg_map[SLJIT_R0] == 0);
2900 
2901 		if (dst != SLJIT_R0) {
2902 			inst = (sljit_u8*)ensure_buf(compiler, 1 + 1 + 3 + 2 + 1);
2903 			FAIL_IF(!inst);
2904 			INC_SIZE(1 + 3 + 2 + 1);
2905 			/* Set low register to conditional flag. */
2906 			*inst++ = XCHG_EAX_r + reg_map[TMP_REG1];
2907 			*inst++ = GROUP_0F;
2908 			*inst++ = cond_set;
2909 			*inst++ = MOD_REG | 0 /* eax */;
2910 			*inst++ = OR_rm8_r8;
2911 			*inst++ = MOD_REG | (0 /* eax */ << 3) | reg_map[dst];
2912 			*inst++ = XCHG_EAX_r + reg_map[TMP_REG1];
2913 		}
2914 		else {
2915 			inst = (sljit_u8*)ensure_buf(compiler, 1 + 2 + 3 + 2 + 2);
2916 			FAIL_IF(!inst);
2917 			INC_SIZE(2 + 3 + 2 + 2);
2918 			/* Set low register to conditional flag. */
2919 			*inst++ = XCHG_r_rm;
2920 			*inst++ = MOD_REG | (1 /* ecx */ << 3) | reg_map[TMP_REG1];
2921 			*inst++ = GROUP_0F;
2922 			*inst++ = cond_set;
2923 			*inst++ = MOD_REG | 1 /* ecx */;
2924 			*inst++ = OR_rm8_r8;
2925 			*inst++ = MOD_REG | (1 /* ecx */ << 3) | 0 /* eax */;
2926 			*inst++ = XCHG_r_rm;
2927 			*inst++ = MOD_REG | (1 /* ecx */ << 3) | reg_map[TMP_REG1];
2928 		}
2929 		return SLJIT_SUCCESS;
2930 	}
2931 
2932 	/* Set TMP_REG1 to the condition bit (0 or 1). */
2933 	inst = (sljit_u8*)ensure_buf(compiler, 1 + 1 + 3 + 3 + 1);
2934 	FAIL_IF(!inst);
2935 	INC_SIZE(1 + 3 + 3 + 1);
2936 	*inst++ = XCHG_EAX_r + reg_map[TMP_REG1];
2937 	/* Set al to conditional flag. */
2938 	*inst++ = GROUP_0F;
2939 	*inst++ = cond_set;
2940 	*inst++ = MOD_REG | 0 /* eax */;
2941 
2942 	*inst++ = GROUP_0F;
2943 	*inst++ = MOVZX_r_rm8;
2944 	*inst++ = MOD_REG | (0 << 3) /* eax */ | 0 /* eax */;
2945 
2946 	*inst++ = XCHG_EAX_r + reg_map[TMP_REG1];
2947 
2948 	if (GET_OPCODE(op) < SLJIT_ADD)
2949 		return emit_mov(compiler, dst, dstw, TMP_REG1, 0);
2950 
2951 #if (defined SLJIT_VERBOSE && SLJIT_VERBOSE) \
2952 		|| (defined SLJIT_ARGUMENT_CHECKS && SLJIT_ARGUMENT_CHECKS)
2953 	compiler->skip_checks = 1;
2954 #endif
2955 	return sljit_emit_op2(compiler, op, dst_save, dstw_save, dst_save, dstw_save, TMP_REG1, 0);
2956 #endif /* SLJIT_CONFIG_X86_64 */
2957 }
2958 
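/* Conditional move. CMOVcc shares its condition encoding with Jcc and
   SETcc: get_jump_code() returns the 0F 8x Jcc opcode, SETcc is that value
   plus 0x10 and CMOVcc is that value minus 0x40. */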
2959 SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_cmov(struct sljit_compiler *compiler, sljit_s32 type,
2960 	sljit_s32 dst_reg,
2961 	sljit_s32 src, sljit_sw srcw)
2962 {
2963 	sljit_u8* inst;
2964 
2965 	CHECK_ERROR();
2966 	CHECK(check_sljit_emit_cmov(compiler, type, dst_reg, src, srcw));
2967 
2968 #if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
2969 	dst_reg &= ~SLJIT_I32_OP;
2970 
2971 	if (!sljit_has_cpu_feature(SLJIT_HAS_CMOV) || (dst_reg >= SLJIT_R3 && dst_reg <= SLJIT_S3))
2972 		return sljit_emit_cmov_generic(compiler, type, dst_reg, src, srcw);
2973 #else
2974 	if (!sljit_has_cpu_feature(SLJIT_HAS_CMOV))
2975 		return sljit_emit_cmov_generic(compiler, type, dst_reg, src, srcw);
2976 #endif
2977 
2978 	/* ADJUST_LOCAL_OFFSET is not needed. */
2979 	CHECK_EXTRA_REGS(src, srcw, (void)0);
2980 
2981 #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
2982 	compiler->mode32 = dst_reg & SLJIT_I32_OP;
2983 	dst_reg &= ~SLJIT_I32_OP;
2984 #endif
2985 
2986 	if (SLJIT_UNLIKELY(src & SLJIT_IMM)) {
2987 		EMIT_MOV(compiler, TMP_REG1, 0, SLJIT_IMM, srcw);
2988 		src = TMP_REG1;
2989 		srcw = 0;
2990 	}
2991 
2992 	inst = emit_x86_instruction(compiler, 2, dst_reg, 0, src, srcw);
2993 	FAIL_IF(!inst);
2994 	*inst++ = GROUP_0F;
2995 	*inst = get_jump_code(type & 0xff) - 0x40;
2996 	return SLJIT_SUCCESS;
2997 }
2998 
2999 SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_get_local_base(struct sljit_compiler *compiler, sljit_s32 dst, sljit_sw dstw, sljit_sw offset)
3000 {
3001 	CHECK_ERROR();
3002 	CHECK(check_sljit_get_local_base(compiler, dst, dstw, offset));
3003 	ADJUST_LOCAL_OFFSET(dst, dstw);
3004 
3005 	CHECK_EXTRA_REGS(dst, dstw, (void)0);
3006 
3007 #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
3008 	compiler->mode32 = 0;
3009 #endif
3010 
3011 	ADJUST_LOCAL_OFFSET(SLJIT_MEM1(SLJIT_SP), offset);
3012 
3013 #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
3014 	if (NOT_HALFWORD(offset)) {
3015 		FAIL_IF(emit_load_imm64(compiler, TMP_REG1, offset));
3016 #if (defined SLJIT_DEBUG && SLJIT_DEBUG)
3017 		SLJIT_ASSERT(emit_lea_binary(compiler, dst, dstw, SLJIT_SP, 0, TMP_REG1, 0) != SLJIT_ERR_UNSUPPORTED);
3018 		return compiler->error;
3019 #else
3020 		return emit_lea_binary(compiler, dst, dstw, SLJIT_SP, 0, TMP_REG1, 0);
3021 #endif
3022 	}
3023 #endif
3024 
3025 	if (offset != 0)
3026 		return emit_lea_binary(compiler, dst, dstw, SLJIT_SP, 0, SLJIT_IMM, offset);
3027 	return emit_mov(compiler, dst, dstw, SLJIT_SP, 0);
3028 }
3029 
3030 SLJIT_API_FUNC_ATTRIBUTE struct sljit_const* sljit_emit_const(struct sljit_compiler *compiler, sljit_s32 dst, sljit_sw dstw, sljit_sw init_value)
3031 {
3032 	sljit_u8 *inst;
3033 	struct sljit_const *const_;
3034 #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
3035 	sljit_s32 reg;
3036 #endif
3037 
3038 	CHECK_ERROR_PTR();
3039 	CHECK_PTR(check_sljit_emit_const(compiler, dst, dstw, init_value));
3040 	ADJUST_LOCAL_OFFSET(dst, dstw);
3041 
3042 	CHECK_EXTRA_REGS(dst, dstw, (void)0);
3043 
3044 	const_ = (struct sljit_const*)ensure_abuf(compiler, sizeof(struct sljit_const));
3045 	PTR_FAIL_IF(!const_);
3046 	set_const(const_, compiler);
3047 
3048 #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
3049 	compiler->mode32 = 0;
3050 	reg = FAST_IS_REG(dst) ? dst : TMP_REG1;
3051 
3052 	if (emit_load_imm64(compiler, reg, init_value))
3053 		return NULL;
3054 #else
3055 	if (emit_mov(compiler, dst, dstw, SLJIT_IMM, init_value))
3056 		return NULL;
3057 #endif
3058 
3059 	inst = (sljit_u8*)ensure_buf(compiler, 2);
3060 	PTR_FAIL_IF(!inst);
3061 
3062 	*inst++ = 0;
3063 	*inst++ = 2;
3064 
3065 #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
3066 	if (dst & SLJIT_MEM)
3067 		if (emit_mov(compiler, dst, dstw, TMP_REG1, 0))
3068 			return NULL;
3069 #endif
3070 
3071 	return const_;
3072 }
3073 
3074 SLJIT_API_FUNC_ATTRIBUTE struct sljit_put_label* sljit_emit_put_label(struct sljit_compiler *compiler, sljit_s32 dst, sljit_sw dstw)
3075 {
3076 	struct sljit_put_label *put_label;
3077 	sljit_u8 *inst;
3078 #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
3079 	sljit_s32 reg;
3080 	sljit_uw start_size;
3081 #endif
3082 
3083 	CHECK_ERROR_PTR();
3084 	CHECK_PTR(check_sljit_emit_put_label(compiler, dst, dstw));
3085 	ADJUST_LOCAL_OFFSET(dst, dstw);
3086 
3087 	CHECK_EXTRA_REGS(dst, dstw, (void)0);
3088 
3089 	put_label = (struct sljit_put_label*)ensure_abuf(compiler, sizeof(struct sljit_put_label));
3090 	PTR_FAIL_IF(!put_label);
3091 	set_put_label(put_label, compiler, 0);
3092 
3093 #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
3094 	compiler->mode32 = 0;
3095 	reg = FAST_IS_REG(dst) ? dst : TMP_REG1;
3096 
3097 	if (emit_load_imm64(compiler, reg, 0))
3098 		return NULL;
3099 #else
3100 	if (emit_mov(compiler, dst, dstw, SLJIT_IMM, 0))
3101 		return NULL;
3102 #endif
3103 
3104 #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
3105 	if (dst & SLJIT_MEM) {
3106 		start_size = compiler->size;
3107 		if (emit_mov(compiler, dst, dstw, TMP_REG1, 0))
3108 			return NULL;
3109 		put_label->flags = compiler->size - start_size;
3110 	}
3111 #endif
3112 
3113 	inst = (sljit_u8*)ensure_buf(compiler, 2);
3114 	PTR_FAIL_IF(!inst);
3115 
3116 	*inst++ = 0;
3117 	*inst++ = 3;
3118 
3119 	return put_label;
3120 }
3121 
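/* Patch the target of a rewritable jump or call. On x86-32 the stored value
   is the rel32 displacement relative to the end of the 4 byte operand; on
   x86-64 the absolute target is stored, since rewritable jumps load the
   address into a register before an indirect jump. */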
3122 SLJIT_API_FUNC_ATTRIBUTE void sljit_set_jump_addr(sljit_uw addr, sljit_uw new_target, sljit_sw executable_offset)
3123 {
3124 	SLJIT_UNUSED_ARG(executable_offset);
3125 
3126 	SLJIT_UPDATE_WX_FLAGS((void*)addr, (void*)(addr + sizeof(sljit_uw)), 0);
3127 #if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
3128 	sljit_unaligned_store_sw((void*)addr, new_target - (addr + 4) - (sljit_uw)executable_offset);
3129 #else
3130 	sljit_unaligned_store_sw((void*)addr, (sljit_sw) new_target);
3131 #endif
3132 	SLJIT_UPDATE_WX_FLAGS((void*)addr, (void*)(addr + sizeof(sljit_uw)), 1);
3133 }
3134 
3135 SLJIT_API_FUNC_ATTRIBUTE void sljit_set_const(sljit_uw addr, sljit_sw new_constant, sljit_sw executable_offset)
3136 {
3137 	SLJIT_UNUSED_ARG(executable_offset);
3138 
3139 	SLJIT_UPDATE_WX_FLAGS((void*)addr, (void*)(addr + sizeof(sljit_sw)), 0);
3140 	sljit_unaligned_store_sw((void*)addr, new_constant);
3141 	SLJIT_UPDATE_WX_FLAGS((void*)addr, (void*)(addr + sizeof(sljit_sw)), 1);
3142 }
3143