1 /*
2  * compiler/codegen_x86.cpp - IA-32 and AMD64 code generator
3  *
4  * Copyright (c) 2001-2004 Milan Jurik of ARAnyM dev team (see AUTHORS)
5  *
6  * Inspired by Christian Bauer's Basilisk II
7  *
8  * This file is part of the ARAnyM project which builds a new and powerful
9  * TOS/FreeMiNT compatible virtual machine running on almost any hardware.
10  *
11  * JIT compiler m68k -> IA-32 and AMD64
12  *
13  * Original 68040 JIT compiler for UAE, copyright 2000-2002 Bernd Meyer
14  * Adaptation for Basilisk II and improvements, copyright 2000-2004 Gwenole Beauchesne
15  * Portions related to CPU detection come from linux/arch/i386/kernel/setup.c
16  *
17  * This program is free software; you can redistribute it and/or modify
18  * it under the terms of the GNU General Public License as published by
19  * the Free Software Foundation; either version 2 of the License, or
20  * (at your option) any later version.
21  *
22  * This program is distributed in the hope that it will be useful,
23  * but WITHOUT ANY WARRANTY; without even the implied warranty of
24  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
25  * GNU General Public License for more details.
26  *
27  * You should have received a copy of the GNU General Public License
28  * along with this program; if not, write to the Free Software
29  * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
30  */
31 
32 /* This should eventually end up in machdep/, but for now, x86 is the
33 only target, and it's easier this way... */
34 
35 #include "flags_x86.h"
36 
37 /*************************************************************************
* Some basic information about the target CPU                               *
39 *************************************************************************/
40 
41 #define R1 RR1
42 #define R2 RR2
43 #define R4 RR4
44 
45 #define EAX_INDEX 0
46 #define ECX_INDEX 1
47 #define EDX_INDEX 2
48 #define EBX_INDEX 3
49 #define ESP_INDEX 4
50 #define EBP_INDEX 5
51 #define ESI_INDEX 6
52 #define EDI_INDEX 7
53 #if defined(CPU_x86_64)
54 #define R8_INDEX  8
55 #define R9_INDEX  9
56 #define R10_INDEX 10
57 #define R11_INDEX 11
58 #define R12_INDEX 12
59 #define R13_INDEX 13
60 #define R14_INDEX 14
61 #define R15_INDEX 15
62 #endif
63 /* XXX this has to match X86_Reg8H_Base + 4 */
64 #define AH_INDEX (0x10+4+EAX_INDEX)
65 #define CH_INDEX (0x10+4+ECX_INDEX)
66 #define DH_INDEX (0x10+4+EDX_INDEX)
67 #define BH_INDEX (0x10+4+EBX_INDEX)
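/* For example, AH_INDEX comes out as 0x10+4+0 = 0x14: the +4 lands on the
 * hardware encoding of AH (the legacy high-byte registers AH/CH/DH/BH occupy
 * codes 4-7), and the 0x10 bias presumably keeps these pseudo-indices clearly
 * apart from the plain 32-bit register indices above. */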
68 
69 /* The register in which subroutines return an integer return value */
70 #define REG_RESULT EAX_INDEX
71 
72 /* The registers subroutines take their first and second argument in */
73 #ifdef _WIN32
74 /* Handle the _fastcall parameters of ECX and EDX */
75 #define REG_PAR1 ECX_INDEX
76 #define REG_PAR2 EDX_INDEX
77 #elif defined(CPU_x86_64)
78 #define REG_PAR1 EDI_INDEX
79 #define REG_PAR2 ESI_INDEX
80 #else
81 #define REG_PAR1 EAX_INDEX
82 #define REG_PAR2 EDX_INDEX
83 #endif
84 
85 #define REG_PC_PRE EAX_INDEX /* The register we use for preloading regs.pc_p */
86 #ifdef _WIN32
87 #define REG_PC_TMP ECX_INDEX
88 #else
89 #define REG_PC_TMP ECX_INDEX /* Another register that is not the above */
90 #endif
91 
92 #define SHIFTCOUNT_NREG ECX_INDEX  /* Register that can be used for shiftcount.
93 			      -1 if any reg will do */
94 #define MUL_NREG1 EAX_INDEX /* %eax will hold the low 32 bits after a 32x32 mul */
95 #define MUL_NREG2 EDX_INDEX /* %edx will hold the high 32 bits */
96 
97 #define STACK_ALIGN		16
98 #define STACK_OFFSET	sizeof(void *)
99 #ifdef _WIN64
100 /* In the Microsoft x64 calling convention, it's the caller's responsibility
101  * to allocate 32 bytes of "shadow space" on the stack right before calling
102  * the function (regardless of the actual number of parameters used). */
103 #define STACK_SHADOW_SPACE 32
104 #else
105 #define STACK_SHADOW_SPACE 0
106 #endif
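/* Illustrative sketch (compiled out, not called by the JIT): how the three
 * constants above would typically combine when carving out a call frame.
 * The helper and its name are assumptions for illustration only. */
#if 0
static inline int example_stack_adjust(int pushed_bytes)
{
	int used = STACK_OFFSET + pushed_bytes;      /* return address + our own pushes */
	int need = used + STACK_SHADOW_SPACE;        /* plus the Win64 shadow area, if any */
	int frame = (need + STACK_ALIGN - 1) & ~(STACK_ALIGN - 1);
	return frame - used;                         /* bytes to subtract from ESP/RSP before the CALL */
}
#endif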
107 
108 #if defined(CPU_x86_64)
/* Registers R12 and ESP cannot be used with simple [r/m + disp32] addressing,
 * since r/m bits of 100 imply a SIB byte. The simplest fix is not to use these
 * registers. Note that these registers are also listed in the freescratch
 * function. */
113 uae_s8 always_used[] = { 4, 12, -1 };
114 uae_s8 can_byte[]={0,1,2,3,5,6,7,8,9,10,11,12,13,14,15,-1};
115 uae_s8 can_word[]={0,1,2,3,5,6,7,8,9,10,11,12,13,14,15,-1};
116 #else
117 uae_s8 always_used[] = { 4, -1 };
118 uae_s8 can_byte[]={0,1,2,3,-1};
119 uae_s8 can_word[]={0,1,2,3,5,6,7,-1};
120 #endif
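/* For instance, a plain MOV eax,[esp+4] cannot be encoded with mod=01 r/m=100
 * and a disp8 alone: r/m=100 announces a SIB byte, so the encoder would have
 * to emit 8B 44 24 04 (ModRM 0x44 followed by SIB 0x24).  On x86-64, R12
 * shares the same low three bits as ESP, hence both are simply kept out of
 * the allocator via always_used above. */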
121 
122 #if USE_OPTIMIZED_CALLS
123 /* Make sure interpretive core does not use cpuopti */
124 uae_u8 call_saved[]={0,0,0,1,1,1,1,1};
125 #error FIXME: code not ready
126 #else
/* cpuopti mutates instruction handlers to assume registers are saved
   by the caller */
129 uae_u8 call_saved[]={0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0};
130 #endif
131 
132 /* This *should* be the same as call_saved. But:
133    - We might not really know which registers are saved, and which aren't,
134      so we need to preserve some, but don't want to rely on everyone else
135      also saving those registers
   - Special registers (such as the stack pointer) should not be "preserved"
137      by pushing, even though they are "saved" across function calls
138 */
139 #if defined(CPU_x86_64)
140 #ifdef _WIN64
141 /* https://msdn.microsoft.com/en-us/library/6t169e9c.aspx:
142  * "The registers RBX, RBP, RDI, RSI, RSP, R12, R13, R14, and R15 are
143  * considered nonvolatile and must be saved and restored by a function that
144  * uses them". Also saving r11 for now (see comment below). */
145 static const uae_u8 need_to_preserve[]={0,0,0,1,0,1,1,1,0,0,0,1,1,1,1,1};
146 #else
147 /* callee-saved registers as defined by Linux AMD64 ABI: rbx, rbp, rsp, r12 - r15 */
148 /* preserve r11 because it's generally used to hold pointers to functions */
/* FIXME: not really sure what the point of saving r11 is. If functions
 * cannot assume the callee preserves it, it will not be used across calls anyway. */
151 static const uae_u8 need_to_preserve[]={0,0,0,1,0,1,0,0,0,0,0,1,1,1,1,1};
152 #endif
153 #else
154 /* callee-saved registers as defined by System V IA-32 ABI: edi, esi, ebx, ebp */
155 static const uae_u8 need_to_preserve[]={0,0,0,1,0,1,1,1};
156 #endif
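/* Illustrative sketch (compiled out): a table like need_to_preserve can be
 * walked to push/pop the callee-saved registers around a helper call.  The
 * loop below is an assumption for illustration; the real prologue/epilogue
 * generation lives elsewhere in the JIT. */
#if 0
static void example_preserve_around_call(void)
{
	int i;
	for (i = 0; i < (int)sizeof(need_to_preserve); i++)
		if (need_to_preserve[i])
			raw_push_l_r(i);
	/* ... emit the actual call here ... */
	for (i = (int)sizeof(need_to_preserve) - 1; i >= 0; i--)
		if (need_to_preserve[i])
			raw_pop_l_r(i);
}
#endif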
157 
158 /* Whether classes of instructions do or don't clobber the native flags */
159 #define CLOBBER_MOV
160 #define CLOBBER_LEA
161 #define CLOBBER_CMOV
162 #define CLOBBER_POP
163 #define CLOBBER_PUSH
164 #define CLOBBER_SUB  clobber_flags()
165 #define CLOBBER_SBB  clobber_flags()
166 #define CLOBBER_CMP  clobber_flags()
167 #define CLOBBER_ADD  clobber_flags()
168 #define CLOBBER_ADC  clobber_flags()
169 #define CLOBBER_AND  clobber_flags()
170 #define CLOBBER_OR   clobber_flags()
171 #define CLOBBER_XOR  clobber_flags()
172 
173 #define CLOBBER_ROL  clobber_flags()
174 #define CLOBBER_ROR  clobber_flags()
175 #define CLOBBER_SHLL clobber_flags()
176 #define CLOBBER_SHRL clobber_flags()
177 #define CLOBBER_SHRA clobber_flags()
178 #define CLOBBER_TEST clobber_flags()
179 #define CLOBBER_CL16
180 #define CLOBBER_CL8
181 #define CLOBBER_SE32
182 #define CLOBBER_SE16
183 #define CLOBBER_SE8
184 #define CLOBBER_ZE32
185 #define CLOBBER_ZE16
186 #define CLOBBER_ZE8
187 #define CLOBBER_SW16 clobber_flags()
188 #define CLOBBER_SW32
189 #define CLOBBER_SETCC
190 #define CLOBBER_MUL  clobber_flags()
191 #define CLOBBER_BT   clobber_flags()
192 #define CLOBBER_BSF  clobber_flags()
193 
194 /* The older code generator is now deprecated.  */
195 #define USE_NEW_RTASM 1
196 
197 #if USE_NEW_RTASM
198 
199 #if defined(CPU_x86_64)
200 #define X86_TARGET_64BIT		1
/* The address-size override prefix causes a 5-cycle penalty on Intel Core
   processors. Another solution would be to decompose the load into an LEA,
   a MOV (to zero-extend) and a MOV (from memory): would that be better? */
204 #define ADDR32					x86_emit_byte(0x67),
205 #else
206 #define ADDR32
207 #endif
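/* Note the trailing comma in the 64-bit definition: "ADDR32 MOVLmr(...)"
 * expands to "x86_emit_byte(0x67), MOVLmr(...)", i.e. the address-size
 * override byte is emitted right before the instruction via the comma
 * operator, and the whole thing still reads as a single statement. */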
208 #define X86_FLAT_REGISTERS		0
209 #define X86_OPTIMIZE_ALU		1
210 #define X86_OPTIMIZE_ROTSHI		1
211 #include "codegen_x86.h"
212 
213 #define x86_emit_byte(B)		emit_byte(B)
214 #define x86_emit_word(W)		emit_word(W)
215 #define x86_emit_long(L)		emit_long(L)
216 #define x86_emit_quad(Q)		emit_quad(Q)
217 #define x86_get_target()		get_target()
218 #define x86_emit_failure(MSG)	jit_fail(MSG, __FILE__, __LINE__, __FUNCTION__)
219 
static inline void x86_64_addr32(void)
221 {
222 #ifdef CPU_x86_64
223 	emit_byte(0x67);
224 #endif
225 }
226 
static inline void x86_64_rex(bool w, uae_u32 *r, uae_u32 *x, uae_u32 *b)
228 {
229 #ifdef CPU_x86_64
230 	int rex_byte = 0x40;
231 	if (*b >= R8_INDEX) {
232 		*b -= R8_INDEX;
233 		rex_byte |= 1;
234 	}
235 	if (rex_byte != 0x40) {
236 		emit_byte(rex_byte);
237 	}
238 #endif
239 }
240 
static inline void x86_64_prefix(
242 	bool addr32, bool w, uae_u32 *r, uae_u32 *x, uae_u32 *b)
243 {
244 	if (addr32) {
245 		x86_64_addr32();
246 	}
247 	x86_64_rex(w, r, x, b);
248 }
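/* Illustrative sketch (compiled out): the full REX prefix layout, for
 * reference.  x86_64_rex() above only folds in the B bit (extension of the
 * ModRM r/m / opcode register field) for the cases it handles; a complete
 * prefix would also carry W (64-bit operand size), R (ModRM reg extension)
 * and X (SIB index extension).  Hypothetical helper, not called anywhere. */
#if 0
static inline uae_u8 example_rex_byte(bool w, bool r, bool x, bool b)
{
	return (uae_u8)(0x40 | (w << 3) | (r << 2) | (x << 1) | (b << 0));
}
#endif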
249 
// Some mappings to mark compemu_support calls as only used by compemu.
// These are still mainly x86-minded; they should be made more CPU-independent in the future.
252 #define compemu_raw_add_l_mi(a,b)		raw_add_l_mi(a,b)
253 #define compemu_raw_and_l_ri(a,b)		raw_and_l_ri(a,b)
254 #define compemu_raw_bswap_32(a)			raw_bswap_32(a)
255 #define compemu_raw_bt_l_ri(a,b)		raw_bt_l_ri(a,b)
256 #define compemu_raw_call(a)				raw_call(a)
257 #define compemu_raw_cmov_l_rm_indexed(a,b,c,d,e)	raw_cmov_l_rm_indexed(a,b,c,d,e)
258 #define compemu_raw_cmp_l_mi(a,b)		raw_cmp_l_mi(a,b)
259 #define compemu_raw_cmp_l_mi8(a,b)		raw_cmp_l_mi(a,b)
260 #define compemu_raw_jcc_b_oponly(a)		raw_jcc_b_oponly(a)
261 #define compemu_raw_jcc_l_oponly(a)		raw_jcc_l_oponly(a)
262 #define compemu_raw_jl(a)				raw_jl(a)
263 #define compemu_raw_jmp(a)				raw_jmp(a)
264 #define compemu_raw_jmp_m_indexed(a,b,c)	raw_jmp_m_indexed(a,b,c)
265 #define compemu_raw_jmp_r(a)			raw_jmp_r(a)
266 #define compemu_raw_jnz(a)				raw_jnz(a)
267 #define compemu_raw_jz_b_oponly()		raw_jz_b_oponly()
268 #define compemu_raw_lea_l_brr(a,b,c) 	raw_lea_l_brr(a,b,c)
269 #define compemu_raw_lea_l_brr_indexed(a,b,c,d,e)	raw_lea_l_brr_indexed(a,b,c,d,e)
270 #define compemu_raw_mov_b_mr(a,b)		raw_mov_b_mr(a,b)
271 #define compemu_raw_mov_l_mi(a,b)		raw_mov_l_mi(a,b)
272 #define compemu_raw_mov_l_mr(a,b)		raw_mov_l_mr(a,b)
273 #define compemu_raw_mov_l_ri(a,b)		raw_mov_l_ri(a,b)
274 #define compemu_raw_mov_l_rm(a,b)		raw_mov_l_rm(a,b)
275 #define compemu_raw_mov_l_rr(a,b)		raw_mov_l_rr(a,b)
276 #define compemu_raw_mov_w_mr(a,b)		raw_mov_w_mr(a,b)
277 #define compemu_raw_sub_l_mi(a,b)		raw_sub_l_mi(a,b)
278 #define compemu_raw_test_l_rr(a,b) 		raw_test_l_rr(a,b)
279 #define compemu_raw_zero_extend_16_rr(a,b)	raw_zero_extend_16_rr(a,b)
280 #define compemu_raw_lea_l_rr_indexed(a,b,c,d)	raw_lea_l_rr_indexed(a,b,c,d)
281 
static void jit_fail(const char *msg, const char *file, int line, const char *function)
283 {
284 	jit_abort("failure in function %s from file %s at line %d: %s",
285 			function, file, line, msg);
286 }
287 
288 LOWFUNC(NONE,WRITE,1,raw_push_l_r,(R4 r))
289 {
290 #if defined(CPU_x86_64)
291 	PUSHQr(r);
292 #else
293 	PUSHLr(r);
294 #endif
295 }
296 LENDFUNC(NONE,WRITE,1,raw_push_l_r,(R4 r))
297 
298 LOWFUNC(NONE,READ,1,raw_pop_l_r,(R4 r))
299 {
300 #if defined(CPU_x86_64)
301 	POPQr(r);
302 #else
303 	POPLr(r);
304 #endif
305 }
306 LENDFUNC(NONE,READ,1,raw_pop_l_r,(R4 r))
307 
308 LOWFUNC(NONE,READ,1,raw_pop_l_m,(MEMW d))
309 {
310 #if defined(CPU_x86_64)
311 	POPQm(d, X86_NOREG, X86_NOREG, 1);
312 #else
313 	POPLm(d, X86_NOREG, X86_NOREG, 1);
314 #endif
315 }
316 LENDFUNC(NONE,READ,1,raw_pop_l_m,(MEMW d))
317 
318 LOWFUNC(WRITE,NONE,2,raw_bt_l_ri,(R4 r, IMM i))
319 {
320 	BTLir(i, r);
321 }
322 LENDFUNC(WRITE,NONE,2,raw_bt_l_ri,(R4 r, IMM i))
323 
324 LOWFUNC(WRITE,NONE,2,raw_bt_l_rr,(R4 r, R4 b))
325 {
326 	BTLrr(b, r);
327 }
328 LENDFUNC(WRITE,NONE,2,raw_bt_l_rr,(R4 r, R4 b))
329 
330 LOWFUNC(WRITE,NONE,2,raw_btc_l_ri,(RW4 r, IMM i))
331 {
332 	BTCLir(i, r);
333 }
334 LENDFUNC(WRITE,NONE,2,raw_btc_l_ri,(RW4 r, IMM i))
335 
336 LOWFUNC(WRITE,NONE,2,raw_btc_l_rr,(RW4 r, R4 b))
337 {
338 	BTCLrr(b, r);
339 }
340 LENDFUNC(WRITE,NONE,2,raw_btc_l_rr,(RW4 r, R4 b))
341 
342 LOWFUNC(WRITE,NONE,2,raw_btr_l_ri,(RW4 r, IMM i))
343 {
344 	BTRLir(i, r);
345 }
346 LENDFUNC(WRITE,NONE,2,raw_btr_l_ri,(RW4 r, IMM i))
347 
348 LOWFUNC(WRITE,NONE,2,raw_btr_l_rr,(RW4 r, R4 b))
349 {
350 	BTRLrr(b, r);
351 }
352 LENDFUNC(WRITE,NONE,2,raw_btr_l_rr,(RW4 r, R4 b))
353 
354 LOWFUNC(WRITE,NONE,2,raw_bts_l_ri,(RW4 r, IMM i))
355 {
356 	BTSLir(i, r);
357 }
358 LENDFUNC(WRITE,NONE,2,raw_bts_l_ri,(RW4 r, IMM i))
359 
360 LOWFUNC(WRITE,NONE,2,raw_bts_l_rr,(RW4 r, R4 b))
361 {
362 	BTSLrr(b, r);
363 }
364 LENDFUNC(WRITE,NONE,2,raw_bts_l_rr,(RW4 r, R4 b))
365 
366 LOWFUNC(WRITE,NONE,2,raw_sub_w_ri,(RW2 d, IMM i))
367 {
368 	SUBWir(i, d);
369 }
370 LENDFUNC(WRITE,NONE,2,raw_sub_w_ri,(RW2 d, IMM i))
371 
372 LOWFUNC(NONE,READ,2,raw_mov_l_rm,(W4 d, MEMR s))
373 {
374 	ADDR32 MOVLmr(s, X86_NOREG, X86_NOREG, 1, d);
375 }
376 LENDFUNC(NONE,READ,2,raw_mov_l_rm,(W4 d, MEMR s))
377 
378 LOWFUNC(NONE,WRITE,2,raw_mov_l_mi,(MEMW d, IMM s))
379 {
380 	ADDR32 MOVLim(s, d, X86_NOREG, X86_NOREG, 1);
381 }
382 LENDFUNC(NONE,WRITE,2,raw_mov_l_mi,(MEMW d, IMM s))
383 
384 LOWFUNC(NONE,WRITE,2,raw_mov_w_mi,(MEMW d, IMM s))
385 {
386 	ADDR32 MOVWim(s, d, X86_NOREG, X86_NOREG, 1);
387 }
388 LENDFUNC(NONE,WRITE,2,raw_mov_w_mi,(MEMW d, IMM s))
389 
390 LOWFUNC(NONE,WRITE,2,raw_mov_b_mi,(MEMW d, IMM s))
391 {
392 	ADDR32 MOVBim(s, d, X86_NOREG, X86_NOREG, 1);
393 }
394 LENDFUNC(NONE,WRITE,2,raw_mov_b_mi,(MEMW d, IMM s))
395 
396 LOWFUNC(WRITE,RMW,2,raw_rol_b_mi,(MEMRW d, IMM i))
397 {
398 	ADDR32 ROLBim(i, d, X86_NOREG, X86_NOREG, 1);
399 }
400 LENDFUNC(WRITE,RMW,2,raw_rol_b_mi,(MEMRW d, IMM i))
401 
402 LOWFUNC(WRITE,NONE,2,raw_rol_b_ri,(RW1 r, IMM i))
403 {
404 	ROLBir(i, r);
405 }
406 LENDFUNC(WRITE,NONE,2,raw_rol_b_ri,(RW1 r, IMM i))
407 
408 LOWFUNC(WRITE,NONE,2,raw_rol_w_ri,(RW2 r, IMM i))
409 {
410 	ROLWir(i, r);
411 }
412 LENDFUNC(WRITE,NONE,2,raw_rol_w_ri,(RW2 r, IMM i))
413 
414 LOWFUNC(WRITE,NONE,2,raw_rol_l_ri,(RW4 r, IMM i))
415 {
416 	ROLLir(i, r);
417 }
418 LENDFUNC(WRITE,NONE,2,raw_rol_l_ri,(RW4 r, IMM i))
419 
420 LOWFUNC(WRITE,NONE,2,raw_rol_l_rr,(RW4 d, R1 r))
421 {
422 	ROLLrr(r, d);
423 }
424 LENDFUNC(WRITE,NONE,2,raw_rol_l_rr,(RW4 d, R1 r))
425 
426 LOWFUNC(WRITE,NONE,2,raw_rol_w_rr,(RW2 d, R1 r))
427 {
428 	ROLWrr(r, d);
429 }
430 LENDFUNC(WRITE,NONE,2,raw_rol_w_rr,(RW2 d, R1 r))
431 
432 LOWFUNC(WRITE,NONE,2,raw_rol_b_rr,(RW1 d, R1 r))
433 {
434 	ROLBrr(r, d);
435 }
436 LENDFUNC(WRITE,NONE,2,raw_rol_b_rr,(RW1 d, R1 r))
437 
438 LOWFUNC(WRITE,NONE,2,raw_shll_l_rr,(RW4 d, R1 r))
439 {
440 	SHLLrr(r, d);
441 }
442 LENDFUNC(WRITE,NONE,2,raw_shll_l_rr,(RW4 d, R1 r))
443 
444 LOWFUNC(WRITE,NONE,2,raw_shll_w_rr,(RW2 d, R1 r))
445 {
446 	SHLWrr(r, d);
447 }
448 LENDFUNC(WRITE,NONE,2,raw_shll_w_rr,(RW2 d, R1 r))
449 
450 LOWFUNC(WRITE,NONE,2,raw_shll_b_rr,(RW1 d, R1 r))
451 {
452 	SHLBrr(r, d);
453 }
454 LENDFUNC(WRITE,NONE,2,raw_shll_b_rr,(RW1 d, R1 r))
455 
456 LOWFUNC(WRITE,NONE,2,raw_ror_b_ri,(RW1 r, IMM i))
457 {
458 	RORBir(i, r);
459 }
460 LENDFUNC(WRITE,NONE,2,raw_ror_b_ri,(RW1 r, IMM i))
461 
462 LOWFUNC(WRITE,NONE,2,raw_ror_w_ri,(RW2 r, IMM i))
463 {
464 	RORWir(i, r);
465 }
466 LENDFUNC(WRITE,NONE,2,raw_ror_w_ri,(RW2 r, IMM i))
467 
468 LOWFUNC(WRITE,READ,2,raw_or_l_rm,(RW4 d, MEMR s))
469 {
470 	ADDR32 ORLmr(s, X86_NOREG, X86_NOREG, 1, d);
471 }
472 LENDFUNC(WRITE,READ,2,raw_or_l_rm,(RW4 d, MEMR s))
473 
474 LOWFUNC(WRITE,NONE,2,raw_ror_l_ri,(RW4 r, IMM i))
475 {
476 	RORLir(i, r);
477 }
478 LENDFUNC(WRITE,NONE,2,raw_ror_l_ri,(RW4 r, IMM i))
479 
480 LOWFUNC(WRITE,NONE,2,raw_ror_l_rr,(RW4 d, R1 r))
481 {
482 	RORLrr(r, d);
483 }
484 LENDFUNC(WRITE,NONE,2,raw_ror_l_rr,(RW4 d, R1 r))
485 
486 LOWFUNC(WRITE,NONE,2,raw_ror_w_rr,(RW2 d, R1 r))
487 {
488 	RORWrr(r, d);
489 }
490 LENDFUNC(WRITE,NONE,2,raw_ror_w_rr,(RW2 d, R1 r))
491 
492 LOWFUNC(WRITE,NONE,2,raw_ror_b_rr,(RW1 d, R1 r))
493 {
494 	RORBrr(r, d);
495 }
496 LENDFUNC(WRITE,NONE,2,raw_ror_b_rr,(RW1 d, R1 r))
497 
498 LOWFUNC(WRITE,NONE,2,raw_shrl_l_rr,(RW4 d, R1 r))
499 {
500 	SHRLrr(r, d);
501 }
502 LENDFUNC(WRITE,NONE,2,raw_shrl_l_rr,(RW4 d, R1 r))
503 
504 LOWFUNC(WRITE,NONE,2,raw_shrl_w_rr,(RW2 d, R1 r))
505 {
506 	SHRWrr(r, d);
507 }
508 LENDFUNC(WRITE,NONE,2,raw_shrl_w_rr,(RW2 d, R1 r))
509 
510 LOWFUNC(WRITE,NONE,2,raw_shrl_b_rr,(RW1 d, R1 r))
511 {
512 	SHRBrr(r, d);
513 }
514 LENDFUNC(WRITE,NONE,2,raw_shrl_b_rr,(RW1 d, R1 r))
515 
516 LOWFUNC(WRITE,NONE,2,raw_shra_l_rr,(RW4 d, R1 r))
517 {
518 	SARLrr(r, d);
519 }
520 LENDFUNC(WRITE,NONE,2,raw_shra_l_rr,(RW4 d, R1 r))
521 
522 LOWFUNC(WRITE,NONE,2,raw_shra_w_rr,(RW2 d, R1 r))
523 {
524 	SARWrr(r, d);
525 }
526 LENDFUNC(WRITE,NONE,2,raw_shra_w_rr,(RW2 d, R1 r))
527 
528 LOWFUNC(WRITE,NONE,2,raw_shra_b_rr,(RW1 d, R1 r))
529 {
530 	SARBrr(r, d);
531 }
532 LENDFUNC(WRITE,NONE,2,raw_shra_b_rr,(RW1 d, R1 r))
533 
534 LOWFUNC(WRITE,NONE,2,raw_shll_l_ri,(RW4 r, IMM i))
535 {
536 	SHLLir(i, r);
537 }
538 LENDFUNC(WRITE,NONE,2,raw_shll_l_ri,(RW4 r, IMM i))
539 
540 LOWFUNC(WRITE,NONE,2,raw_shll_w_ri,(RW2 r, IMM i))
541 {
542 	SHLWir(i, r);
543 }
544 LENDFUNC(WRITE,NONE,2,raw_shll_w_ri,(RW2 r, IMM i))
545 
546 LOWFUNC(WRITE,NONE,2,raw_shll_b_ri,(RW1 r, IMM i))
547 {
548 	SHLBir(i, r);
549 }
550 LENDFUNC(WRITE,NONE,2,raw_shll_b_ri,(RW1 r, IMM i))
551 
552 LOWFUNC(WRITE,NONE,2,raw_shrl_l_ri,(RW4 r, IMM i))
553 {
554 	SHRLir(i, r);
555 }
556 LENDFUNC(WRITE,NONE,2,raw_shrl_l_ri,(RW4 r, IMM i))
557 
558 LOWFUNC(WRITE,NONE,2,raw_shrl_w_ri,(RW2 r, IMM i))
559 {
560 	SHRWir(i, r);
561 }
562 LENDFUNC(WRITE,NONE,2,raw_shrl_w_ri,(RW2 r, IMM i))
563 
564 LOWFUNC(WRITE,NONE,2,raw_shrl_b_ri,(RW1 r, IMM i))
565 {
566 	SHRBir(i, r);
567 }
568 LENDFUNC(WRITE,NONE,2,raw_shrl_b_ri,(RW1 r, IMM i))
569 
570 LOWFUNC(WRITE,NONE,2,raw_shra_l_ri,(RW4 r, IMM i))
571 {
572 	SARLir(i, r);
573 }
574 LENDFUNC(WRITE,NONE,2,raw_shra_l_ri,(RW4 r, IMM i))
575 
576 LOWFUNC(WRITE,NONE,2,raw_shra_w_ri,(RW2 r, IMM i))
577 {
578 	SARWir(i, r);
579 }
580 LENDFUNC(WRITE,NONE,2,raw_shra_w_ri,(RW2 r, IMM i))
581 
582 LOWFUNC(WRITE,NONE,2,raw_shra_b_ri,(RW1 r, IMM i))
583 {
584 	SARBir(i, r);
585 }
586 LENDFUNC(WRITE,NONE,2,raw_shra_b_ri,(RW1 r, IMM i))
587 
588 LOWFUNC(WRITE,NONE,1,raw_sahf,(R2))
589 {
590 	SAHF();
591 }
592 LENDFUNC(WRITE,NONE,1,raw_sahf,(R2 dummy_ah))
593 
594 LOWFUNC(NONE,NONE,1,raw_cpuid,(R4))
595 {
596 	CPUID();
597 }
598 LENDFUNC(NONE,NONE,1,raw_cpuid,(R4 dummy_eax))
599 
600 LOWFUNC(READ,NONE,1,raw_lahf,(W2))
601 {
602 	LAHF();
603 }
604 LENDFUNC(READ,NONE,1,raw_lahf,(W2 dummy_ah))
605 
606 LOWFUNC(READ,NONE,2,raw_setcc,(W1 d, IMM cc))
607 {
608 	SETCCir(cc, d);
609 }
610 LENDFUNC(READ,NONE,2,raw_setcc,(W1 d, IMM cc))
611 
612 LOWFUNC(READ,WRITE,2,raw_setcc_m,(MEMW d, IMM cc))
613 {
614 	ADDR32 SETCCim(cc, d, X86_NOREG, X86_NOREG, 1);
615 }
616 LENDFUNC(READ,WRITE,2,raw_setcc_m,(MEMW d, IMM cc))
617 
618 LOWFUNC(READ,NONE,3,raw_cmov_l_rr,(RW4 d, R4 s, IMM cc))
619 {
620 	if (have_cmov)
621 		CMOVLrr(cc, s, d);
622 	else { /* replacement using branch and mov */
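		/* Emit a short Jcc on the inverted condition with a 0 displacement,
		 * remember where that displacement byte lives (target_p), and patch
		 * it after the MOV so the branch skips exactly over it. */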
623 		uae_s8 *target_p = (uae_s8 *)x86_get_target() + 1;
624 		JCCSii(cc^1, 0);
625 		MOVLrr(s, d);
626 		*target_p = (uintptr)x86_get_target() - ((uintptr)target_p + 1);
627 	}
628 }
629 LENDFUNC(READ,NONE,3,raw_cmov_l_rr,(RW4 d, R4 s, IMM cc))
630 
631 LOWFUNC(WRITE,NONE,2,raw_bsf_l_rr,(W4 d, R4 s))
632 {
633 	BSFLrr(s, d);
634 }
635 LENDFUNC(WRITE,NONE,2,raw_bsf_l_rr,(W4 d, R4 s))
636 
637 LOWFUNC(NONE,NONE,2,raw_sign_extend_32_rr,(W4 d, R4 s))
638 {
639 	MOVSLQrr(s, d);
640 }
641 LENDFUNC(NONE,NONE,2,raw_sign_extend_32_rr,(W4 d, R4 s))
642 
643 LOWFUNC(NONE,NONE,2,raw_sign_extend_16_rr,(W4 d, R2 s))
644 {
645 	MOVSWLrr(s, d);
646 }
647 LENDFUNC(NONE,NONE,2,raw_sign_extend_16_rr,(W4 d, R2 s))
648 
649 LOWFUNC(NONE,NONE,2,raw_sign_extend_8_rr,(W4 d, R1 s))
650 {
651 	MOVSBLrr(s, d);
652 }
653 LENDFUNC(NONE,NONE,2,raw_sign_extend_8_rr,(W4 d, R1 s))
654 
655 LOWFUNC(NONE,NONE,2,raw_zero_extend_16_rr,(W4 d, R2 s))
656 {
657 	MOVZWLrr(s, d);
658 }
659 LENDFUNC(NONE,NONE,2,raw_zero_extend_16_rr,(W4 d, R2 s))
660 
661 LOWFUNC(NONE,NONE,2,raw_zero_extend_8_rr,(W4 d, R1 s))
662 {
663 	MOVZBLrr(s, d);
664 }
665 LENDFUNC(NONE,NONE,2,raw_zero_extend_8_rr,(W4 d, R1 s))
666 
667 LOWFUNC(NONE,NONE,2,raw_imul_32_32,(RW4 d, R4 s))
668 {
669 	IMULLrr(s, d);
670 }
671 LENDFUNC(NONE,NONE,2,raw_imul_32_32,(RW4 d, R4 s))
672 
673 LOWFUNC(NONE,NONE,2,raw_imul_64_32,(RW4 d, RW4 s))
674 {
675 	if (d!=MUL_NREG1 || s!=MUL_NREG2) {
676 		jit_abort("Bad register in IMUL: d=%d, s=%d",d,s);
677 	}
678 	IMULLr(s);
679 }
680 LENDFUNC(NONE,NONE,2,raw_imul_64_32,(RW4 d, RW4 s))
681 
682 LOWFUNC(NONE,NONE,2,raw_mul_64_32,(RW4 d, RW4 s))
683 {
684 	if (d!=MUL_NREG1 || s!=MUL_NREG2) {
685 		jit_abort("Bad register in MUL: d=%d, s=%d",d,s);
686 	}
687 	MULLr(s);
688 }
689 LENDFUNC(NONE,NONE,2,raw_mul_64_32,(RW4 d, RW4 s))
690 
691 LOWFUNC(NONE,NONE,2,raw_mul_32_32,(RW4, R4))
692 {
693 	abort(); /* %^$&%^$%#^ x86! */
694 }
695 LENDFUNC(NONE,NONE,2,raw_mul_32_32,(RW4 d, R4 s))
696 
697 LOWFUNC(NONE,NONE,2,raw_mov_b_rr,(W1 d, R1 s))
698 {
699 	MOVBrr(s, d);
700 }
701 LENDFUNC(NONE,NONE,2,raw_mov_b_rr,(W1 d, R1 s))
702 
703 LOWFUNC(NONE,NONE,2,raw_mov_w_rr,(W2 d, R2 s))
704 {
705 	MOVWrr(s, d);
706 }
707 LENDFUNC(NONE,NONE,2,raw_mov_w_rr,(W2 d, R2 s))
708 
709 LOWFUNC(NONE,READ,4,raw_mov_l_rrm_indexed,(W4 d,R4 baser, R4 index, IMM factor))
710 {
711 	ADDR32 MOVLmr(0, baser, index, factor, d);
712 }
713 LENDFUNC(NONE,READ,4,raw_mov_l_rrm_indexed,(W4 d,R4 baser, R4 index, IMM factor))
714 
715 LOWFUNC(NONE,READ,4,raw_mov_w_rrm_indexed,(W2 d, R4 baser, R4 index, IMM factor))
716 {
717 	ADDR32 MOVWmr(0, baser, index, factor, d);
718 }
719 LENDFUNC(NONE,READ,4,raw_mov_w_rrm_indexed,(W2 d, R4 baser, R4 index, IMM factor))
720 
721 LOWFUNC(NONE,READ,4,raw_mov_b_rrm_indexed,(W1 d, R4 baser, R4 index, IMM factor))
722 {
723 	ADDR32 MOVBmr(0, baser, index, factor, d);
724 }
725 LENDFUNC(NONE,READ,4,raw_mov_b_rrm_indexed,(W1 d, R4 baser, R4 index, IMM factor))
726 
727 LOWFUNC(NONE,WRITE,4,raw_mov_l_mrr_indexed,(R4 baser, R4 index, IMM factor, R4 s))
728 {
729 	ADDR32 MOVLrm(s, 0, baser, index, factor);
730 }
731 LENDFUNC(NONE,WRITE,4,raw_mov_l_mrr_indexed,(R4 baser, R4 index, IMM factor, R4 s))
732 
733 LOWFUNC(NONE,WRITE,4,raw_mov_w_mrr_indexed,(R4 baser, R4 index, IMM factor, R2 s))
734 {
735 	ADDR32 MOVWrm(s, 0, baser, index, factor);
736 }
737 LENDFUNC(NONE,WRITE,4,raw_mov_w_mrr_indexed,(R4 baser, R4 index, IMM factor, R2 s))
738 
739 LOWFUNC(NONE,WRITE,4,raw_mov_b_mrr_indexed,(R4 baser, R4 index, IMM factor, R1 s))
740 {
741 	ADDR32 MOVBrm(s, 0, baser, index, factor);
742 }
743 LENDFUNC(NONE,WRITE,4,raw_mov_b_mrr_indexed,(R4 baser, R4 index, IMM factor, R1 s))
744 
745 LOWFUNC(NONE,WRITE,5,raw_mov_l_bmrr_indexed,(IMM base, R4 baser, R4 index, IMM factor, R4 s))
746 {
747 	ADDR32 MOVLrm(s, base, baser, index, factor);
748 }
749 LENDFUNC(NONE,WRITE,5,raw_mov_l_bmrr_indexed,(IMM base, R4 baser, R4 index, IMM factor, R4 s))
750 
751 LOWFUNC(NONE,WRITE,5,raw_mov_w_bmrr_indexed,(IMM base, R4 baser, R4 index, IMM factor, R2 s))
752 {
753 	ADDR32 MOVWrm(s, base, baser, index, factor);
754 }
755 LENDFUNC(NONE,WRITE,5,raw_mov_w_bmrr_indexed,(IMM base, R4 baser, R4 index, IMM factor, R2 s))
756 
757 LOWFUNC(NONE,WRITE,5,raw_mov_b_bmrr_indexed,(IMM base, R4 baser, R4 index, IMM factor, R1 s))
758 {
759 	ADDR32 MOVBrm(s, base, baser, index, factor);
760 }
761 LENDFUNC(NONE,WRITE,5,raw_mov_b_bmrr_indexed,(IMM base, R4 baser, R4 index, IMM factor, R1 s))
762 
763 LOWFUNC(NONE,READ,5,raw_mov_l_brrm_indexed,(W4 d, IMM base, R4 baser, R4 index, IMM factor))
764 {
765 	ADDR32 MOVLmr(base, baser, index, factor, d);
766 }
767 LENDFUNC(NONE,READ,5,raw_mov_l_brrm_indexed,(W4 d, IMM base, R4 baser, R4 index, IMM factor))
768 
769 LOWFUNC(NONE,READ,5,raw_mov_w_brrm_indexed,(W2 d, IMM base, R4 baser, R4 index, IMM factor))
770 {
771 	ADDR32 MOVWmr(base, baser, index, factor, d);
772 }
773 LENDFUNC(NONE,READ,5,raw_mov_w_brrm_indexed,(W2 d, IMM base, R4 baser, R4 index, IMM factor))
774 
775 LOWFUNC(NONE,READ,5,raw_mov_b_brrm_indexed,(W1 d, IMM base, R4 baser, R4 index, IMM factor))
776 {
777 	ADDR32 MOVBmr(base, baser, index, factor, d);
778 }
779 LENDFUNC(NONE,READ,5,raw_mov_b_brrm_indexed,(W1 d, IMM base, R4 baser, R4 index, IMM factor))
780 
781 LOWFUNC(NONE,READ,4,raw_mov_l_rm_indexed,(W4 d, IMM base, R4 index, IMM factor))
782 {
783 	ADDR32 MOVLmr(base, X86_NOREG, index, factor, d);
784 }
785 LENDFUNC(NONE,READ,4,raw_mov_l_rm_indexed,(W4 d, IMM base, R4 index, IMM factor))
786 
787 LOWFUNC(NONE,READ,5,raw_cmov_l_rm_indexed,(W4 d, IMM base, R4 index, IMM factor, IMM cond))
788 {
789 	if (have_cmov)
790 		ADDR32 CMOVLmr(cond, base, X86_NOREG, index, factor, d);
791 	else { /* replacement using branch and mov */
792 		uae_s8 *target_p = (uae_s8 *)x86_get_target() + 1;
793 		JCCSii(cond^1, 0);
794 		ADDR32 MOVLmr(base, X86_NOREG, index, factor, d);
795 		*target_p = (uintptr)x86_get_target() - ((uintptr)target_p + 1);
796 	}
797 }
798 LENDFUNC(NONE,READ,5,raw_cmov_l_rm_indexed,(W4 d, IMM base, R4 index, IMM factor, IMM cond))
799 
800 LOWFUNC(NONE,READ,3,raw_cmov_l_rm,(W4 d, IMM mem, IMM cond))
801 {
802 	if (have_cmov)
803 		CMOVLmr(cond, mem, X86_NOREG, X86_NOREG, 1, d);
804 	else { /* replacement using branch and mov */
805 		uae_s8 *target_p = (uae_s8 *)x86_get_target() + 1;
806 		JCCSii(cond^1, 0);
807 		ADDR32 MOVLmr(mem, X86_NOREG, X86_NOREG, 1, d);
808 		*target_p = (uintptr)x86_get_target() - ((uintptr)target_p + 1);
809 	}
810 }
811 LENDFUNC(NONE,READ,3,raw_cmov_l_rm,(W4 d, IMM mem, IMM cond))
812 
813 LOWFUNC(NONE,READ,3,raw_mov_l_rR,(W4 d, R4 s, IMM offset))
814 {
815 	ADDR32 MOVLmr(offset, s, X86_NOREG, 1, d);
816 }
817 LENDFUNC(NONE,READ,3,raw_mov_l_rR,(W4 d, R4 s, IMM offset))
818 
819 LOWFUNC(NONE,READ,3,raw_mov_w_rR,(W2 d, R4 s, IMM offset))
820 {
821 	ADDR32 MOVWmr(offset, s, X86_NOREG, 1, d);
822 }
823 LENDFUNC(NONE,READ,3,raw_mov_w_rR,(W2 d, R4 s, IMM offset))
824 
825 LOWFUNC(NONE,READ,3,raw_mov_b_rR,(W1 d, R4 s, IMM offset))
826 {
827 	ADDR32 MOVBmr(offset, s, X86_NOREG, 1, d);
828 }
829 LENDFUNC(NONE,READ,3,raw_mov_b_rR,(W1 d, R4 s, IMM offset))
830 
831 LOWFUNC(NONE,READ,3,raw_mov_l_brR,(W4 d, R4 s, IMM offset))
832 {
833 	ADDR32 MOVLmr(offset, s, X86_NOREG, 1, d);
834 }
835 LENDFUNC(NONE,READ,3,raw_mov_l_brR,(W4 d, R4 s, IMM offset))
836 
837 LOWFUNC(NONE,READ,3,raw_mov_w_brR,(W2 d, R4 s, IMM offset))
838 {
839 	ADDR32 MOVWmr(offset, s, X86_NOREG, 1, d);
840 }
841 LENDFUNC(NONE,READ,3,raw_mov_w_brR,(W2 d, R4 s, IMM offset))
842 
843 LOWFUNC(NONE,READ,3,raw_mov_b_brR,(W1 d, R4 s, IMM offset))
844 {
845 	ADDR32 MOVBmr(offset, s, X86_NOREG, 1, d);
846 }
847 LENDFUNC(NONE,READ,3,raw_mov_b_brR,(W1 d, R4 s, IMM offset))
848 
849 LOWFUNC(NONE,WRITE,3,raw_mov_l_Ri,(R4 d, IMM i, IMM offset))
850 {
851 	ADDR32 MOVLim(i, offset, d, X86_NOREG, 1);
852 }
853 LENDFUNC(NONE,WRITE,3,raw_mov_l_Ri,(R4 d, IMM i, IMM offset))
854 
855 LOWFUNC(NONE,WRITE,3,raw_mov_w_Ri,(R4 d, IMM i, IMM offset))
856 {
857 	ADDR32 MOVWim(i, offset, d, X86_NOREG, 1);
858 }
859 LENDFUNC(NONE,WRITE,3,raw_mov_w_Ri,(R4 d, IMM i, IMM offset))
860 
861 LOWFUNC(NONE,WRITE,3,raw_mov_b_Ri,(R4 d, IMM i, IMM offset))
862 {
863 	ADDR32 MOVBim(i, offset, d, X86_NOREG, 1);
864 }
865 LENDFUNC(NONE,WRITE,3,raw_mov_b_Ri,(R4 d, IMM i, IMM offset))
866 
867 LOWFUNC(NONE,WRITE,3,raw_mov_l_Rr,(R4 d, R4 s, IMM offset))
868 {
869 	ADDR32 MOVLrm(s, offset, d, X86_NOREG, 1);
870 }
871 LENDFUNC(NONE,WRITE,3,raw_mov_l_Rr,(R4 d, R4 s, IMM offset))
872 
873 LOWFUNC(NONE,WRITE,3,raw_mov_w_Rr,(R4 d, R2 s, IMM offset))
874 {
875 	ADDR32 MOVWrm(s, offset, d, X86_NOREG, 1);
876 }
877 LENDFUNC(NONE,WRITE,3,raw_mov_w_Rr,(R4 d, R2 s, IMM offset))
878 
879 LOWFUNC(NONE,WRITE,3,raw_mov_b_Rr,(R4 d, R1 s, IMM offset))
880 {
881 	ADDR32 MOVBrm(s, offset, d, X86_NOREG, 1);
882 }
883 LENDFUNC(NONE,WRITE,3,raw_mov_b_Rr,(R4 d, R1 s, IMM offset))
884 
885 LOWFUNC(NONE,NONE,3,raw_lea_l_brr,(W4 d, R4 s, IMM offset))
886 {
887 	ADDR32 LEALmr(offset, s, X86_NOREG, 1, d);
888 }
889 LENDFUNC(NONE,NONE,3,raw_lea_l_brr,(W4 d, R4 s, IMM offset))
890 
891 LOWFUNC(NONE,NONE,5,raw_lea_l_brr_indexed,(W4 d, R4 s, R4 index, IMM factor, IMM offset))
892 {
893 	ADDR32 LEALmr(offset, s, index, factor, d);
894 }
895 LENDFUNC(NONE,NONE,5,raw_lea_l_brr_indexed,(W4 d, R4 s, R4 index, IMM factor, IMM offset))
896 
897 LOWFUNC(NONE,NONE,4,raw_lea_l_rr_indexed,(W4 d, R4 s, R4 index, IMM factor))
898 {
899 	ADDR32 LEALmr(0, s, index, factor, d);
900 }
901 LENDFUNC(NONE,NONE,4,raw_lea_l_rr_indexed,(W4 d, R4 s, R4 index, IMM factor))
902 
903 LOWFUNC(NONE,NONE,4,raw_lea_l_r_scaled,(W4 d, R4 index, IMM factor))
904 {
905 	ADDR32 LEALmr(0, X86_NOREG, index, factor, d);
906 }
907 LENDFUNC(NONE,NONE,4,raw_lea_l_r_scaled,(W4 d, R4 index, IMM factor))
908 
909 LOWFUNC(NONE,WRITE,3,raw_mov_l_bRr,(R4 d, R4 s, IMM offset))
910 {
911 	ADDR32 MOVLrm(s, offset, d, X86_NOREG, 1);
912 }
913 LENDFUNC(NONE,WRITE,3,raw_mov_l_bRr,(R4 d, R4 s, IMM offset))
914 
915 LOWFUNC(NONE,WRITE,3,raw_mov_w_bRr,(R4 d, R2 s, IMM offset))
916 {
917 	ADDR32 MOVWrm(s, offset, d, X86_NOREG, 1);
918 }
919 LENDFUNC(NONE,WRITE,3,raw_mov_w_bRr,(R4 d, R2 s, IMM offset))
920 
921 LOWFUNC(NONE,WRITE,3,raw_mov_b_bRr,(R4 d, R1 s, IMM offset))
922 {
923 	ADDR32 MOVBrm(s, offset, d, X86_NOREG, 1);
924 }
925 LENDFUNC(NONE,WRITE,3,raw_mov_b_bRr,(R4 d, R1 s, IMM offset))
926 
927 LOWFUNC(NONE,NONE,1,raw_bswap_32,(RW4 r))
928 {
929 	BSWAPLr(r);
930 }
931 LENDFUNC(NONE,NONE,1,raw_bswap_32,(RW4 r))
932 
933 LOWFUNC(WRITE,NONE,1,raw_bswap_16,(RW2 r))
934 {
935 	ROLWir(8, r);
936 }
937 LENDFUNC(WRITE,NONE,1,raw_bswap_16,(RW2 r))
938 
939 LOWFUNC(NONE,NONE,2,raw_mov_l_rr,(W4 d, R4 s))
940 {
941 	MOVLrr(s, d);
942 }
943 LENDFUNC(NONE,NONE,2,raw_mov_l_rr,(W4 d, R4 s))
944 
945 LOWFUNC(NONE,WRITE,2,raw_mov_l_mr,(IMM d, R4 s))
946 {
947 	ADDR32 MOVLrm(s, d, X86_NOREG, X86_NOREG, 1);
948 }
949 LENDFUNC(NONE,WRITE,2,raw_mov_l_mr,(IMM d, R4 s))
950 
951 LOWFUNC(NONE,WRITE,2,raw_mov_w_mr,(IMM d, R2 s))
952 {
953 	ADDR32 MOVWrm(s, d, X86_NOREG, X86_NOREG, 1);
954 }
955 LENDFUNC(NONE,WRITE,2,raw_mov_w_mr,(IMM d, R2 s))
956 
957 LOWFUNC(NONE,READ,2,raw_mov_w_rm,(W2 d, IMM s))
958 {
959 	ADDR32 MOVWmr(s, X86_NOREG, X86_NOREG, 1, d);
960 }
961 LENDFUNC(NONE,READ,2,raw_mov_w_rm,(W2 d, IMM s))
962 
963 LOWFUNC(NONE,WRITE,2,raw_mov_b_mr,(IMM d, R1 s))
964 {
965 	ADDR32 MOVBrm(s, d, X86_NOREG, X86_NOREG, 1);
966 }
967 LENDFUNC(NONE,WRITE,2,raw_mov_b_mr,(IMM d, R1 s))
968 
969 LOWFUNC(NONE,READ,2,raw_mov_b_rm,(W1 d, IMM s))
970 {
971 	ADDR32 MOVBmr(s, X86_NOREG, X86_NOREG, 1, d);
972 }
973 LENDFUNC(NONE,READ,2,raw_mov_b_rm,(W1 d, IMM s))
974 
975 LOWFUNC(NONE,NONE,2,raw_mov_l_ri,(W4 d, IMM s))
976 {
977 	MOVLir(s, d);
978 }
979 LENDFUNC(NONE,NONE,2,raw_mov_l_ri,(W4 d, IMM s))
980 
981 LOWFUNC(NONE,NONE,2,raw_mov_w_ri,(W2 d, IMM s))
982 {
983 	MOVWir(s, d);
984 }
985 LENDFUNC(NONE,NONE,2,raw_mov_w_ri,(W2 d, IMM s))
986 
987 LOWFUNC(NONE,NONE,2,raw_mov_b_ri,(W1 d, IMM s))
988 {
989 	MOVBir(s, d);
990 }
991 LENDFUNC(NONE,NONE,2,raw_mov_b_ri,(W1 d, IMM s))
992 
993 LOWFUNC(RMW,RMW,2,raw_adc_l_mi,(MEMRW d, IMM s))
994 {
995 	ADDR32 ADCLim(s, d, X86_NOREG, X86_NOREG, 1);
996 }
997 LENDFUNC(RMW,RMW,2,raw_adc_l_mi,(MEMRW d, IMM s))
998 
999 LOWFUNC(WRITE,RMW,2,raw_add_l_mi,(IMM d, IMM s))
1000 {
1001 	ADDR32 ADDLim(s, d, X86_NOREG, X86_NOREG, 1);
1002 }
1003 LENDFUNC(WRITE,RMW,2,raw_add_l_mi,(IMM d, IMM s))
1004 
1005 LOWFUNC(WRITE,RMW,2,raw_add_w_mi,(IMM d, IMM s))
1006 {
1007 	ADDR32 ADDWim(s, d, X86_NOREG, X86_NOREG, 1);
1008 }
1009 LENDFUNC(WRITE,RMW,2,raw_add_w_mi,(IMM d, IMM s))
1010 
1011 LOWFUNC(WRITE,RMW,2,raw_add_b_mi,(IMM d, IMM s))
1012 {
1013 	ADDR32 ADDBim(s, d, X86_NOREG, X86_NOREG, 1);
1014 }
1015 LENDFUNC(WRITE,RMW,2,raw_add_b_mi,(IMM d, IMM s))
1016 
1017 LOWFUNC(WRITE,NONE,2,raw_test_l_ri,(R4 d, IMM i))
1018 {
1019 	TESTLir(i, d);
1020 }
1021 LENDFUNC(WRITE,NONE,2,raw_test_l_ri,(R4 d, IMM i))
1022 
1023 LOWFUNC(WRITE,NONE,2,raw_test_l_rr,(R4 d, R4 s))
1024 {
1025 	TESTLrr(s, d);
1026 }
1027 LENDFUNC(WRITE,NONE,2,raw_test_l_rr,(R4 d, R4 s))
1028 
1029 LOWFUNC(WRITE,NONE,2,raw_test_w_rr,(R2 d, R2 s))
1030 {
1031 	TESTWrr(s, d);
1032 }
1033 LENDFUNC(WRITE,NONE,2,raw_test_w_rr,(R2 d, R2 s))
1034 
1035 LOWFUNC(WRITE,NONE,2,raw_test_b_rr,(R1 d, R1 s))
1036 {
1037 	TESTBrr(s, d);
1038 }
1039 LENDFUNC(WRITE,NONE,2,raw_test_b_rr,(R1 d, R1 s))
1040 
1041 LOWFUNC(WRITE,NONE,2,raw_xor_l_ri,(RW4 d, IMM i))
1042 {
1043 	XORLir(i, d);
1044 }
1045 LENDFUNC(WRITE,NONE,2,raw_xor_l_ri,(RW4 d, IMM i))
1046 
1047 LOWFUNC(WRITE,NONE,2,raw_and_l_ri,(RW4 d, IMM i))
1048 {
1049 	ANDLir(i, d);
1050 }
1051 LENDFUNC(WRITE,NONE,2,raw_and_l_ri,(RW4 d, IMM i))
1052 
1053 LOWFUNC(WRITE,NONE,2,raw_and_w_ri,(RW2 d, IMM i))
1054 {
1055 	ANDWir(i, d);
1056 }
1057 LENDFUNC(WRITE,NONE,2,raw_and_w_ri,(RW2 d, IMM i))
1058 
1059 LOWFUNC(WRITE,NONE,2,raw_and_l,(RW4 d, R4 s))
1060 {
1061 	ANDLrr(s, d);
1062 }
1063 LENDFUNC(WRITE,NONE,2,raw_and_l,(RW4 d, R4 s))
1064 
1065 LOWFUNC(WRITE,NONE,2,raw_and_w,(RW2 d, R2 s))
1066 {
1067 	ANDWrr(s, d);
1068 }
1069 LENDFUNC(WRITE,NONE,2,raw_and_w,(RW2 d, R2 s))
1070 
1071 LOWFUNC(WRITE,NONE,2,raw_and_b,(RW1 d, R1 s))
1072 {
1073 	ANDBrr(s, d);
1074 }
1075 LENDFUNC(WRITE,NONE,2,raw_and_b,(RW1 d, R1 s))
1076 
1077 LOWFUNC(WRITE,NONE,2,raw_or_l_ri,(RW4 d, IMM i))
1078 {
1079 	ORLir(i, d);
1080 }
1081 LENDFUNC(WRITE,NONE,2,raw_or_l_ri,(RW4 d, IMM i))
1082 
1083 LOWFUNC(WRITE,NONE,2,raw_or_l,(RW4 d, R4 s))
1084 {
1085 	ORLrr(s, d);
1086 }
1087 LENDFUNC(WRITE,NONE,2,raw_or_l,(RW4 d, R4 s))
1088 
1089 LOWFUNC(WRITE,NONE,2,raw_or_w,(RW2 d, R2 s))
1090 {
1091 	ORWrr(s, d);
1092 }
1093 LENDFUNC(WRITE,NONE,2,raw_or_w,(RW2 d, R2 s))
1094 
1095 LOWFUNC(WRITE,NONE,2,raw_or_b,(RW1 d, R1 s))
1096 {
1097 	ORBrr(s, d);
1098 }
1099 LENDFUNC(WRITE,NONE,2,raw_or_b,(RW1 d, R1 s))
1100 
1101 LOWFUNC(RMW,NONE,2,raw_adc_l,(RW4 d, R4 s))
1102 {
1103 	ADCLrr(s, d);
1104 }
1105 LENDFUNC(RMW,NONE,2,raw_adc_l,(RW4 d, R4 s))
1106 
1107 LOWFUNC(RMW,NONE,2,raw_adc_w,(RW2 d, R2 s))
1108 {
1109 	ADCWrr(s, d);
1110 }
1111 LENDFUNC(RMW,NONE,2,raw_adc_w,(RW2 d, R2 s))
1112 
1113 LOWFUNC(RMW,NONE,2,raw_adc_b,(RW1 d, R1 s))
1114 {
1115 	ADCBrr(s, d);
1116 }
1117 LENDFUNC(RMW,NONE,2,raw_adc_b,(RW1 d, R1 s))
1118 
1119 LOWFUNC(WRITE,NONE,2,raw_add_l,(RW4 d, R4 s))
1120 {
1121 	ADDLrr(s, d);
1122 }
1123 LENDFUNC(WRITE,NONE,2,raw_add_l,(RW4 d, R4 s))
1124 
1125 LOWFUNC(WRITE,NONE,2,raw_add_w,(RW2 d, R2 s))
1126 {
1127 	ADDWrr(s, d);
1128 }
1129 LENDFUNC(WRITE,NONE,2,raw_add_w,(RW2 d, R2 s))
1130 
1131 LOWFUNC(WRITE,NONE,2,raw_add_b,(RW1 d, R1 s))
1132 {
1133 	ADDBrr(s, d);
1134 }
1135 LENDFUNC(WRITE,NONE,2,raw_add_b,(RW1 d, R1 s))
1136 
1137 LOWFUNC(WRITE,NONE,2,raw_sub_l_ri,(RW4 d, IMM i))
1138 {
1139 	SUBLir(i, d);
1140 }
1141 LENDFUNC(WRITE,NONE,2,raw_sub_l_ri,(RW4 d, IMM i))
1142 
1143 LOWFUNC(WRITE,NONE,2,raw_sub_b_ri,(RW1 d, IMM i))
1144 {
1145 	SUBBir(i, d);
1146 }
1147 LENDFUNC(WRITE,NONE,2,raw_sub_b_ri,(RW1 d, IMM i))
1148 
1149 LOWFUNC(WRITE,NONE,2,raw_add_l_ri,(RW4 d, IMM i))
1150 {
1151 	ADDLir(i, d);
1152 }
1153 LENDFUNC(WRITE,NONE,2,raw_add_l_ri,(RW4 d, IMM i))
1154 
1155 LOWFUNC(WRITE,NONE,2,raw_add_w_ri,(RW2 d, IMM i))
1156 {
1157 	ADDWir(i, d);
1158 }
1159 LENDFUNC(WRITE,NONE,2,raw_add_w_ri,(RW2 d, IMM i))
1160 
1161 LOWFUNC(WRITE,NONE,2,raw_add_b_ri,(RW1 d, IMM i))
1162 {
1163 	ADDBir(i, d);
1164 }
1165 LENDFUNC(WRITE,NONE,2,raw_add_b_ri,(RW1 d, IMM i))
1166 
1167 LOWFUNC(RMW,NONE,2,raw_sbb_l,(RW4 d, R4 s))
1168 {
1169 	SBBLrr(s, d);
1170 }
1171 LENDFUNC(RMW,NONE,2,raw_sbb_l,(RW4 d, R4 s))
1172 
1173 LOWFUNC(RMW,NONE,2,raw_sbb_w,(RW2 d, R2 s))
1174 {
1175 	SBBWrr(s, d);
1176 }
1177 LENDFUNC(RMW,NONE,2,raw_sbb_w,(RW2 d, R2 s))
1178 
1179 LOWFUNC(RMW,NONE,2,raw_sbb_b,(RW1 d, R1 s))
1180 {
1181 	SBBBrr(s, d);
1182 }
1183 LENDFUNC(RMW,NONE,2,raw_sbb_b,(RW1 d, R1 s))
1184 
1185 LOWFUNC(WRITE,NONE,2,raw_sub_l,(RW4 d, R4 s))
1186 {
1187 	SUBLrr(s, d);
1188 }
1189 LENDFUNC(WRITE,NONE,2,raw_sub_l,(RW4 d, R4 s))
1190 
1191 LOWFUNC(WRITE,NONE,2,raw_sub_w,(RW2 d, R2 s))
1192 {
1193 	SUBWrr(s, d);
1194 }
1195 LENDFUNC(WRITE,NONE,2,raw_sub_w,(RW2 d, R2 s))
1196 
1197 LOWFUNC(WRITE,NONE,2,raw_sub_b,(RW1 d, R1 s))
1198 {
1199 	SUBBrr(s, d);
1200 }
1201 LENDFUNC(WRITE,NONE,2,raw_sub_b,(RW1 d, R1 s))
1202 
1203 LOWFUNC(WRITE,NONE,2,raw_cmp_l,(R4 d, R4 s))
1204 {
1205 	CMPLrr(s, d);
1206 }
1207 LENDFUNC(WRITE,NONE,2,raw_cmp_l,(R4 d, R4 s))
1208 
1209 LOWFUNC(WRITE,NONE,2,raw_cmp_l_ri,(R4 r, IMM i))
1210 {
1211 	CMPLir(i, r);
1212 }
1213 LENDFUNC(WRITE,NONE,2,raw_cmp_l_ri,(R4 r, IMM i))
1214 
1215 LOWFUNC(WRITE,NONE,2,raw_cmp_w,(R2 d, R2 s))
1216 {
1217 	CMPWrr(s, d);
1218 }
1219 LENDFUNC(WRITE,NONE,2,raw_cmp_w,(R2 d, R2 s))
1220 
1221 LOWFUNC(WRITE,READ,2,raw_cmp_b_mi,(MEMR d, IMM s))
1222 {
1223 	ADDR32 CMPBim(s, d, X86_NOREG, X86_NOREG, 1);
1224 }
1225 LENDFUNC(WRITE,READ,2,raw_cmp_b_mi,(MEMR d, IMM s))
1226 
1227 LOWFUNC(WRITE,NONE,2,raw_cmp_b_ri,(R1 d, IMM i))
1228 {
1229 	CMPBir(i, d);
1230 }
1231 LENDFUNC(WRITE,NONE,2,raw_cmp_b_ri,(R1 d, IMM i))
1232 
1233 LOWFUNC(WRITE,NONE,2,raw_cmp_b,(R1 d, R1 s))
1234 {
1235 	CMPBrr(s, d);
1236 }
1237 LENDFUNC(WRITE,NONE,2,raw_cmp_b,(R1 d, R1 s))
1238 
1239 LOWFUNC(WRITE,READ,4,raw_cmp_l_rm_indexed,(R4 d, IMM offset, R4 index, IMM factor))
1240 {
1241 	ADDR32 CMPLmr(offset, X86_NOREG, index, factor, d);
1242 }
1243 LENDFUNC(WRITE,READ,4,raw_cmp_l_rm_indexed,(R4 d, IMM offset, R4 index, IMM factor))
1244 
1245 LOWFUNC(WRITE,NONE,2,raw_xor_l,(RW4 d, R4 s))
1246 {
1247 	XORLrr(s, d);
1248 }
1249 LENDFUNC(WRITE,NONE,2,raw_xor_l,(RW4 d, R4 s))
1250 
1251 LOWFUNC(WRITE,NONE,2,raw_xor_w,(RW2 d, R2 s))
1252 {
1253 	XORWrr(s, d);
1254 }
1255 LENDFUNC(WRITE,NONE,2,raw_xor_w,(RW2 d, R2 s))
1256 
1257 LOWFUNC(WRITE,NONE,2,raw_xor_b,(RW1 d, R1 s))
1258 {
1259 	XORBrr(s, d);
1260 }
1261 LENDFUNC(WRITE,NONE,2,raw_xor_b,(RW1 d, R1 s))
1262 
1263 LOWFUNC(WRITE,RMW,2,raw_sub_l_mi,(MEMRW d, IMM s))
1264 {
1265 	ADDR32 SUBLim(s, d, X86_NOREG, X86_NOREG, 1);
1266 }
1267 LENDFUNC(WRITE,RMW,2,raw_sub_l_mi,(MEMRW d, IMM s))
1268 
1269 LOWFUNC(WRITE,READ,2,raw_cmp_l_mi,(MEMR d, IMM s))
1270 {
1271 	ADDR32 CMPLim(s, d, X86_NOREG, X86_NOREG, 1);
1272 }
1273 LENDFUNC(WRITE,READ,2,raw_cmp_l_mi,(MEMR d, IMM s))
1274 
1275 LOWFUNC(NONE,NONE,2,raw_xchg_l_rr,(RW4 r1, RW4 r2))
1276 {
1277 	XCHGLrr(r2, r1);
1278 }
1279 LENDFUNC(NONE,NONE,2,raw_xchg_l_rr,(RW4 r1, RW4 r2))
1280 
1281 LOWFUNC(NONE,NONE,2,raw_xchg_b_rr,(RW4 r1, RW4 r2))
1282 {
1283 	XCHGBrr(r2, r1);
1284 }
1285 LENDFUNC(NONE,NONE,2,raw_xchg_b_rr,(RW4 r1, RW4 r2))
1286 
1287 LOWFUNC(READ,WRITE,0,raw_pushfl,(void))
1288 {
1289 	PUSHF();
1290 }
1291 LENDFUNC(READ,WRITE,0,raw_pushfl,(void))
1292 
1293 LOWFUNC(WRITE,READ,0,raw_popfl,(void))
1294 {
1295 	POPF();
1296 }
1297 LENDFUNC(WRITE,READ,0,raw_popfl,(void))
1298 
1299 /* Generate floating-point instructions */
static inline void x86_fadd_m(MEMR s)
1301 {
1302 	ADDR32 FADDLm(s,X86_NOREG,X86_NOREG,1);
1303 }
1304 
1305 #else
1306 
1307 const bool optimize_accum	= true;
1308 const bool optimize_imm8	= true;
1309 const bool optimize_shift_once	= true;
1310 
1311 /*************************************************************************
1312 * Actual encoding of the instructions on the target CPU                 *
1313 *************************************************************************/
1314 
static inline int isaccum(int r)
1316 {
1317 	return (r == EAX_INDEX);
1318 }
1319 
static inline int isbyte(uae_s32 x)
1321 {
1322 	return (x>=-128 && x<=127);
1323 }
1324 
static inline int isword(uae_s32 x)
1326 {
1327 	return (x>=-32768 && x<=32767);
1328 }
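/* These predicates let the emitters below pick the shorter encodings: an
 * imm8 form (e.g. 83 /5 ib instead of 81 /5 iw/id) when the immediate fits
 * in a signed byte, and the one-byte-shorter accumulator-specific forms
 * (e.g. 2D for SUB with EAX/AX as destination) when the register is EAX. */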
1329 
1330 LOWFUNC(NONE,WRITE,1,raw_push_l_r,(R4 r))
1331 {
1332 	emit_byte(0x50+r);
1333 }
1334 LENDFUNC(NONE,WRITE,1,raw_push_l_r,(R4 r))
1335 
1336 LOWFUNC(NONE,READ,1,raw_pop_l_r,(R4 r))
1337 {
1338 	emit_byte(0x58+r);
1339 }
1340 LENDFUNC(NONE,READ,1,raw_pop_l_r,(R4 r))
1341 
1342 LOWFUNC(NONE,READ,1,raw_pop_l_m,(MEMW d))
1343 {
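	/* 8F /0 with ModRM 0x05: POP r/m32 into an absolute disp32 memory operand. */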
1344 	emit_byte(0x8f);
1345 	emit_byte(0x05);
1346 	emit_long(d);
1347 }
1348 LENDFUNC(NONE,READ,1,raw_pop_l_m,(MEMW d))
1349 
1350 LOWFUNC(WRITE,NONE,2,raw_bt_l_ri,(R4 r, IMM i))
1351 {
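	/* 0F BA /4 ib is BT r/m32,imm8; 0xe0+r gives mod=11, reg=/4, r/m=r. */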
1352 	emit_byte(0x0f);
1353 	emit_byte(0xba);
1354 	emit_byte(0xe0+r);
1355 	emit_byte(i);
1356 }
1357 LENDFUNC(WRITE,NONE,2,raw_bt_l_ri,(R4 r, IMM i))
1358 
1359 LOWFUNC(WRITE,NONE,2,raw_bt_l_rr,(R4 r, R4 b))
1360 {
1361 	emit_byte(0x0f);
1362 	emit_byte(0xa3);
1363 	emit_byte(0xc0+8*b+r);
1364 }
1365 LENDFUNC(WRITE,NONE,2,raw_bt_l_rr,(R4 r, R4 b))
1366 
1367 LOWFUNC(WRITE,NONE,2,raw_btc_l_ri,(RW4 r, IMM i))
1368 {
1369 	emit_byte(0x0f);
1370 	emit_byte(0xba);
1371 	emit_byte(0xf8+r);
1372 	emit_byte(i);
1373 }
1374 LENDFUNC(WRITE,NONE,2,raw_btc_l_ri,(RW4 r, IMM i))
1375 
1376 LOWFUNC(WRITE,NONE,2,raw_btc_l_rr,(RW4 r, R4 b))
1377 {
1378 	emit_byte(0x0f);
1379 	emit_byte(0xbb);
1380 	emit_byte(0xc0+8*b+r);
1381 }
1382 LENDFUNC(WRITE,NONE,2,raw_btc_l_rr,(RW4 r, R4 b))
1383 
1384 
1385 LOWFUNC(WRITE,NONE,2,raw_btr_l_ri,(RW4 r, IMM i))
1386 {
1387 	emit_byte(0x0f);
1388 	emit_byte(0xba);
1389 	emit_byte(0xf0+r);
1390 	emit_byte(i);
1391 }
1392 LENDFUNC(WRITE,NONE,2,raw_btr_l_ri,(RW4 r, IMM i))
1393 
1394 LOWFUNC(WRITE,NONE,2,raw_btr_l_rr,(RW4 r, R4 b))
1395 {
1396 	emit_byte(0x0f);
1397 	emit_byte(0xb3);
1398 	emit_byte(0xc0+8*b+r);
1399 }
1400 LENDFUNC(WRITE,NONE,2,raw_btr_l_rr,(RW4 r, R4 b))
1401 
1402 LOWFUNC(WRITE,NONE,2,raw_bts_l_ri,(RW4 r, IMM i))
1403 {
1404 	emit_byte(0x0f);
1405 	emit_byte(0xba);
1406 	emit_byte(0xe8+r);
1407 	emit_byte(i);
1408 }
1409 LENDFUNC(WRITE,NONE,2,raw_bts_l_ri,(RW4 r, IMM i))
1410 
1411 LOWFUNC(WRITE,NONE,2,raw_bts_l_rr,(RW4 r, R4 b))
1412 {
1413 	emit_byte(0x0f);
1414 	emit_byte(0xab);
1415 	emit_byte(0xc0+8*b+r);
1416 }
1417 LENDFUNC(WRITE,NONE,2,raw_bts_l_rr,(RW4 r, R4 b))
1418 
1419 LOWFUNC(WRITE,NONE,2,raw_sub_w_ri,(RW2 d, IMM i))
1420 {
1421 	emit_byte(0x66);
1422 	if (isbyte(i)) {
1423 		emit_byte(0x83);
1424 		emit_byte(0xe8+d);
1425 		emit_byte(i);
1426 	}
1427 	else {
1428 		if (optimize_accum && isaccum(d))
1429 			emit_byte(0x2d);
1430 		else {
1431 			emit_byte(0x81);
1432 			emit_byte(0xe8+d);
1433 		}
1434 		emit_word(i);
1435 	}
1436 }
1437 LENDFUNC(WRITE,NONE,2,raw_sub_w_ri,(RW2 d, IMM i))
1438 
1439 
1440 LOWFUNC(NONE,READ,2,raw_mov_l_rm,(W4 d, MEMR s))
1441 {
1442 	emit_byte(0x8b);
1443 	emit_byte(0x05+8*d);
1444 	emit_long(s);
1445 }
1446 LENDFUNC(NONE,READ,2,raw_mov_l_rm,(W4 d, MEMR s))
1447 
1448 LOWFUNC(NONE,WRITE,2,raw_mov_l_mi,(MEMW d, IMM s))
1449 {
1450 	emit_byte(0xc7);
1451 	emit_byte(0x05);
1452 	emit_long(d);
1453 	emit_long(s);
1454 }
1455 LENDFUNC(NONE,WRITE,2,raw_mov_l_mi,(MEMW d, IMM s))
1456 
1457 LOWFUNC(NONE,WRITE,2,raw_mov_w_mi,(MEMW d, IMM s))
1458 {
1459 	emit_byte(0x66);
1460 	emit_byte(0xc7);
1461 	emit_byte(0x05);
1462 	emit_long(d);
1463 	emit_word(s);
1464 }
1465 LENDFUNC(NONE,WRITE,2,raw_mov_w_mi,(MEMW d, IMM s))
1466 
1467 LOWFUNC(NONE,WRITE,2,raw_mov_b_mi,(MEMW d, IMM s))
1468 {
1469 	emit_byte(0xc6);
1470 	emit_byte(0x05);
1471 	emit_long(d);
1472 	emit_byte(s);
1473 }
1474 LENDFUNC(NONE,WRITE,2,raw_mov_b_mi,(MEMW d, IMM s))
1475 
1476 LOWFUNC(WRITE,RMW,2,raw_rol_b_mi,(MEMRW d, IMM i))
1477 {
1478 	if (optimize_shift_once && (i == 1)) {
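		/* D0 /0 is the dedicated rotate-by-1 form, one byte shorter than
		 * the generic C0 /0 ib used in the else branch below. */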
1479 		emit_byte(0xd0);
1480 		emit_byte(0x05);
1481 		emit_long(d);
1482 	}
1483 	else {
1484 		emit_byte(0xc0);
1485 		emit_byte(0x05);
1486 		emit_long(d);
1487 		emit_byte(i);
1488 	}
1489 }
1490 LENDFUNC(WRITE,RMW,2,raw_rol_b_mi,(MEMRW d, IMM i))
1491 
1492 LOWFUNC(WRITE,NONE,2,raw_rol_b_ri,(RW1 r, IMM i))
1493 {
1494 	if (optimize_shift_once && (i == 1)) {
1495 		emit_byte(0xd0);
1496 		emit_byte(0xc0+r);
1497 	}
1498 	else {
1499 		emit_byte(0xc0);
1500 		emit_byte(0xc0+r);
1501 		emit_byte(i);
1502 	}
1503 }
1504 LENDFUNC(WRITE,NONE,2,raw_rol_b_ri,(RW1 r, IMM i))
1505 
1506 LOWFUNC(WRITE,NONE,2,raw_rol_w_ri,(RW2 r, IMM i))
1507 {
1508 	emit_byte(0x66);
1509 	emit_byte(0xc1);
1510 	emit_byte(0xc0+r);
1511 	emit_byte(i);
1512 }
1513 LENDFUNC(WRITE,NONE,2,raw_rol_w_ri,(RW2 r, IMM i))
1514 
1515 LOWFUNC(WRITE,NONE,2,raw_rol_l_ri,(RW4 r, IMM i))
1516 {
1517 	if (optimize_shift_once && (i == 1)) {
1518 		emit_byte(0xd1);
1519 		emit_byte(0xc0+r);
1520 	}
1521 	else {
1522 		emit_byte(0xc1);
1523 		emit_byte(0xc0+r);
1524 		emit_byte(i);
1525 	}
1526 }
1527 LENDFUNC(WRITE,NONE,2,raw_rol_l_ri,(RW4 r, IMM i))
1528 
1529 LOWFUNC(WRITE,NONE,2,raw_rol_l_rr,(RW4 d, R1 r))
1530 {
1531 	emit_byte(0xd3);
1532 	emit_byte(0xc0+d);
1533 }
1534 LENDFUNC(WRITE,NONE,2,raw_rol_l_rr,(RW4 d, R1 r))
1535 
1536 LOWFUNC(WRITE,NONE,2,raw_rol_w_rr,(RW2 d, R1 r))
1537 {
1538 	emit_byte(0x66);
1539 	emit_byte(0xd3);
1540 	emit_byte(0xc0+d);
1541 }
1542 LENDFUNC(WRITE,NONE,2,raw_rol_w_rr,(RW2 d, R1 r))
1543 
1544 LOWFUNC(WRITE,NONE,2,raw_rol_b_rr,(RW1 d, R1 r))
1545 {
1546 	emit_byte(0xd2);
1547 	emit_byte(0xc0+d);
1548 }
1549 LENDFUNC(WRITE,NONE,2,raw_rol_b_rr,(RW1 d, R1 r))
1550 
1551 LOWFUNC(WRITE,NONE,2,raw_shll_l_rr,(RW4 d, R1 r))
1552 {
1553 	emit_byte(0xd3);
1554 	emit_byte(0xe0+d);
1555 }
1556 LENDFUNC(WRITE,NONE,2,raw_shll_l_rr,(RW4 d, R1 r))
1557 
1558 LOWFUNC(WRITE,NONE,2,raw_shll_w_rr,(RW2 d, R1 r))
1559 {
1560 	emit_byte(0x66);
1561 	emit_byte(0xd3);
1562 	emit_byte(0xe0+d);
1563 }
1564 LENDFUNC(WRITE,NONE,2,raw_shll_w_rr,(RW2 d, R1 r))
1565 
1566 LOWFUNC(WRITE,NONE,2,raw_shll_b_rr,(RW1 d, R1 r))
1567 {
1568 	emit_byte(0xd2);
1569 	emit_byte(0xe0+d);
1570 }
1571 LENDFUNC(WRITE,NONE,2,raw_shll_b_rr,(RW1 d, R1 r))
1572 
1573 LOWFUNC(WRITE,NONE,2,raw_ror_b_ri,(RW1 r, IMM i))
1574 {
1575 	if (optimize_shift_once && (i == 1)) {
1576 		emit_byte(0xd0);
1577 		emit_byte(0xc8+r);
1578 	}
1579 	else {
1580 		emit_byte(0xc0);
1581 		emit_byte(0xc8+r);
1582 		emit_byte(i);
1583 	}
1584 }
1585 LENDFUNC(WRITE,NONE,2,raw_ror_b_ri,(RW1 r, IMM i))
1586 
1587 LOWFUNC(WRITE,NONE,2,raw_ror_w_ri,(RW2 r, IMM i))
1588 {
1589 	emit_byte(0x66);
1590 	emit_byte(0xc1);
1591 	emit_byte(0xc8+r);
1592 	emit_byte(i);
1593 }
1594 LENDFUNC(WRITE,NONE,2,raw_ror_w_ri,(RW2 r, IMM i))
1595 
1596 // gb-- used for making an fpcr value in compemu_fpp.cpp
1597 LOWFUNC(WRITE,READ,2,raw_or_l_rm,(RW4 d, MEMR s))
1598 {
	emit_byte(0x0b);
	emit_byte(0x05+8*d);
	emit_long(s);
1602 }
1603 LENDFUNC(WRITE,READ,2,raw_or_l_rm,(RW4 d, MEMR s))
1604 
1605 LOWFUNC(WRITE,NONE,2,raw_ror_l_ri,(RW4 r, IMM i))
1606 {
1607 	if (optimize_shift_once && (i == 1)) {
1608 		emit_byte(0xd1);
1609 		emit_byte(0xc8+r);
1610 	}
1611 	else {
1612 		emit_byte(0xc1);
1613 		emit_byte(0xc8+r);
1614 		emit_byte(i);
1615 	}
1616 }
1617 LENDFUNC(WRITE,NONE,2,raw_ror_l_ri,(RW4 r, IMM i))
1618 
1619 LOWFUNC(WRITE,NONE,2,raw_ror_l_rr,(RW4 d, R1 r))
1620 {
1621 	emit_byte(0xd3);
1622 	emit_byte(0xc8+d);
1623 }
1624 LENDFUNC(WRITE,NONE,2,raw_ror_l_rr,(RW4 d, R1 r))
1625 
1626 LOWFUNC(WRITE,NONE,2,raw_ror_w_rr,(RW2 d, R1 r))
1627 {
1628 	emit_byte(0x66);
1629 	emit_byte(0xd3);
1630 	emit_byte(0xc8+d);
1631 }
1632 LENDFUNC(WRITE,NONE,2,raw_ror_w_rr,(RW2 d, R1 r))
1633 
1634 LOWFUNC(WRITE,NONE,2,raw_ror_b_rr,(RW1 d, R1 r))
1635 {
1636 	emit_byte(0xd2);
1637 	emit_byte(0xc8+d);
1638 }
1639 LENDFUNC(WRITE,NONE,2,raw_ror_b_rr,(RW1 d, R1 r))
1640 
1641 LOWFUNC(WRITE,NONE,2,raw_shrl_l_rr,(RW4 d, R1 r))
1642 {
1643 	emit_byte(0xd3);
1644 	emit_byte(0xe8+d);
1645 }
1646 LENDFUNC(WRITE,NONE,2,raw_shrl_l_rr,(RW4 d, R1 r))
1647 
1648 LOWFUNC(WRITE,NONE,2,raw_shrl_w_rr,(RW2 d, R1 r))
1649 {
1650 	emit_byte(0x66);
1651 	emit_byte(0xd3);
1652 	emit_byte(0xe8+d);
1653 }
1654 LENDFUNC(WRITE,NONE,2,raw_shrl_w_rr,(RW2 d, R1 r))
1655 
1656 LOWFUNC(WRITE,NONE,2,raw_shrl_b_rr,(RW1 d, R1 r))
1657 {
1658 	emit_byte(0xd2);
1659 	emit_byte(0xe8+d);
1660 }
1661 LENDFUNC(WRITE,NONE,2,raw_shrl_b_rr,(RW1 d, R1 r))
1662 
1663 LOWFUNC(WRITE,NONE,2,raw_shra_l_rr,(RW4 d, R1 r))
1664 {
1665 	emit_byte(0xd3);
1666 	emit_byte(0xf8+d);
1667 }
1668 LENDFUNC(WRITE,NONE,2,raw_shra_l_rr,(RW4 d, R1 r))
1669 
1670 LOWFUNC(WRITE,NONE,2,raw_shra_w_rr,(RW2 d, R1 r))
1671 {
1672 	emit_byte(0x66);
1673 	emit_byte(0xd3);
1674 	emit_byte(0xf8+d);
1675 }
1676 LENDFUNC(WRITE,NONE,2,raw_shra_w_rr,(RW2 d, R1 r))
1677 
1678 LOWFUNC(WRITE,NONE,2,raw_shra_b_rr,(RW1 d, R1 r))
1679 {
1680 	emit_byte(0xd2);
1681 	emit_byte(0xf8+d);
1682 }
1683 LENDFUNC(WRITE,NONE,2,raw_shra_b_rr,(RW1 d, R1 r))
1684 
1685 LOWFUNC(WRITE,NONE,2,raw_shll_l_ri,(RW4 r, IMM i))
1686 {
1687 	if (optimize_shift_once && (i == 1)) {
1688 		emit_byte(0xd1);
1689 		emit_byte(0xe0+r);
1690 	}
1691 	else {
1692 		emit_byte(0xc1);
1693 		emit_byte(0xe0+r);
1694 		emit_byte(i);
1695 	}
1696 }
1697 LENDFUNC(WRITE,NONE,2,raw_shll_l_ri,(RW4 r, IMM i))
1698 
1699 LOWFUNC(WRITE,NONE,2,raw_shll_w_ri,(RW2 r, IMM i))
1700 {
1701 	emit_byte(0x66);
1702 	emit_byte(0xc1);
1703 	emit_byte(0xe0+r);
1704 	emit_byte(i);
1705 }
1706 LENDFUNC(WRITE,NONE,2,raw_shll_w_ri,(RW2 r, IMM i))
1707 
1708 LOWFUNC(WRITE,NONE,2,raw_shll_b_ri,(RW1 r, IMM i))
1709 {
1710 	if (optimize_shift_once && (i == 1)) {
1711 		emit_byte(0xd0);
1712 		emit_byte(0xe0+r);
1713 	}
1714 	else {
1715 		emit_byte(0xc0);
1716 		emit_byte(0xe0+r);
1717 		emit_byte(i);
1718 	}
1719 }
1720 LENDFUNC(WRITE,NONE,2,raw_shll_b_ri,(RW1 r, IMM i))
1721 
1722 LOWFUNC(WRITE,NONE,2,raw_shrl_l_ri,(RW4 r, IMM i))
1723 {
1724 	if (optimize_shift_once && (i == 1)) {
1725 		emit_byte(0xd1);
1726 		emit_byte(0xe8+r);
1727 	}
1728 	else {
1729 		emit_byte(0xc1);
1730 		emit_byte(0xe8+r);
1731 		emit_byte(i);
1732 	}
1733 }
1734 LENDFUNC(WRITE,NONE,2,raw_shrl_l_ri,(RW4 r, IMM i))
1735 
1736 LOWFUNC(WRITE,NONE,2,raw_shrl_w_ri,(RW2 r, IMM i))
1737 {
1738 	emit_byte(0x66);
1739 	emit_byte(0xc1);
1740 	emit_byte(0xe8+r);
1741 	emit_byte(i);
1742 }
1743 LENDFUNC(WRITE,NONE,2,raw_shrl_w_ri,(RW2 r, IMM i))
1744 
1745 LOWFUNC(WRITE,NONE,2,raw_shrl_b_ri,(RW1 r, IMM i))
1746 {
1747 	if (optimize_shift_once && (i == 1)) {
1748 		emit_byte(0xd0);
1749 		emit_byte(0xe8+r);
1750 	}
1751 	else {
1752 		emit_byte(0xc0);
1753 		emit_byte(0xe8+r);
1754 		emit_byte(i);
1755 	}
1756 }
1757 LENDFUNC(WRITE,NONE,2,raw_shrl_b_ri,(RW1 r, IMM i))
1758 
1759 LOWFUNC(WRITE,NONE,2,raw_shra_l_ri,(RW4 r, IMM i))
1760 {
1761 	if (optimize_shift_once && (i == 1)) {
1762 		emit_byte(0xd1);
1763 		emit_byte(0xf8+r);
1764 	}
1765 	else {
1766 		emit_byte(0xc1);
1767 		emit_byte(0xf8+r);
1768 		emit_byte(i);
1769 	}
1770 }
1771 LENDFUNC(WRITE,NONE,2,raw_shra_l_ri,(RW4 r, IMM i))
1772 
1773 LOWFUNC(WRITE,NONE,2,raw_shra_w_ri,(RW2 r, IMM i))
1774 {
1775 	emit_byte(0x66);
1776 	emit_byte(0xc1);
1777 	emit_byte(0xf8+r);
1778 	emit_byte(i);
1779 }
1780 LENDFUNC(WRITE,NONE,2,raw_shra_w_ri,(RW2 r, IMM i))
1781 
1782 LOWFUNC(WRITE,NONE,2,raw_shra_b_ri,(RW1 r, IMM i))
1783 {
1784 	if (optimize_shift_once && (i == 1)) {
1785 		emit_byte(0xd0);
1786 		emit_byte(0xf8+r);
1787 	}
1788 	else {
1789 		emit_byte(0xc0);
1790 		emit_byte(0xf8+r);
1791 		emit_byte(i);
1792 	}
1793 }
1794 LENDFUNC(WRITE,NONE,2,raw_shra_b_ri,(RW1 r, IMM i))
1795 
1796 LOWFUNC(WRITE,NONE,1,raw_sahf,(R2 dummy_ah))
1797 {
1798 	emit_byte(0x9e);
1799 }
1800 LENDFUNC(WRITE,NONE,1,raw_sahf,(R2 dummy_ah))
1801 
1802 LOWFUNC(NONE,NONE,1,raw_cpuid,(R4 dummy_eax))
1803 {
1804 	emit_byte(0x0f);
1805 	emit_byte(0xa2);
1806 }
1807 LENDFUNC(NONE,NONE,1,raw_cpuid,(R4 dummy_eax))
1808 
1809 LOWFUNC(READ,NONE,1,raw_lahf,(W2 dummy_ah))
1810 {
1811 	emit_byte(0x9f);
1812 }
1813 LENDFUNC(READ,NONE,1,raw_lahf,(W2 dummy_ah))
1814 
1815 LOWFUNC(READ,NONE,2,raw_setcc,(W1 d, IMM cc))
1816 {
1817 	emit_byte(0x0f);
1818 	emit_byte(0x90+cc);
1819 	emit_byte(0xc0+d);
1820 }
1821 LENDFUNC(READ,NONE,2,raw_setcc,(W1 d, IMM cc))
1822 
1823 LOWFUNC(READ,WRITE,2,raw_setcc_m,(MEMW d, IMM cc))
1824 {
1825 	emit_byte(0x0f);
1826 	emit_byte(0x90+cc);
1827 	emit_byte(0x05);
1828 	emit_long(d);
1829 }
1830 LENDFUNC(READ,WRITE,2,raw_setcc_m,(MEMW d, IMM cc))
1831 
1832 LOWFUNC(READ,NONE,3,raw_cmov_l_rr,(RW4 d, R4 s, IMM cc))
1833 {
1834 	if (have_cmov) {
1835 		emit_byte(0x0f);
1836 		emit_byte(0x40+cc);
1837 		emit_byte(0xc0+8*d+s);
1838 	}
1839 	else { /* replacement using branch and mov */
1840 		int uncc=(cc^1);
1841 		emit_byte(0x70+uncc);
	emit_byte(2);  /* branch over the 2-byte MOV below when the condition is false */
1843 		emit_byte(0x89);
1844 		emit_byte(0xc0+8*s+d);
1845 	}
1846 }
1847 LENDFUNC(READ,NONE,3,raw_cmov_l_rr,(RW4 d, R4 s, IMM cc))
1848 
1849 LOWFUNC(WRITE,NONE,2,raw_bsf_l_rr,(W4 d, R4 s))
1850 {
1851 	emit_byte(0x0f);
1852 	emit_byte(0xbc);
1853 	emit_byte(0xc0+8*d+s);
1854 }
1855 LENDFUNC(WRITE,NONE,2,raw_bsf_l_rr,(W4 d, R4 s))
1856 
1857 LOWFUNC(NONE,NONE,2,raw_sign_extend_16_rr,(W4 d, R2 s))
1858 {
1859 	emit_byte(0x0f);
1860 	emit_byte(0xbf);
1861 	emit_byte(0xc0+8*d+s);
1862 }
1863 LENDFUNC(NONE,NONE,2,raw_sign_extend_16_rr,(W4 d, R2 s))
1864 
1865 LOWFUNC(NONE,NONE,2,raw_sign_extend_8_rr,(W4 d, R1 s))
1866 {
1867 	emit_byte(0x0f);
1868 	emit_byte(0xbe);
1869 	emit_byte(0xc0+8*d+s);
1870 }
1871 LENDFUNC(NONE,NONE,2,raw_sign_extend_8_rr,(W4 d, R1 s))
1872 
1873 LOWFUNC(NONE,NONE,2,raw_zero_extend_16_rr,(W4 d, R2 s))
1874 {
1875 	emit_byte(0x0f);
1876 	emit_byte(0xb7);
1877 	emit_byte(0xc0+8*d+s);
1878 }
1879 LENDFUNC(NONE,NONE,2,raw_zero_extend_16_rr,(W4 d, R2 s))
1880 
1881 LOWFUNC(NONE,NONE,2,raw_zero_extend_8_rr,(W4 d, R1 s))
1882 {
1883 	emit_byte(0x0f);
1884 	emit_byte(0xb6);
1885 	emit_byte(0xc0+8*d+s);
1886 }
1887 LENDFUNC(NONE,NONE,2,raw_zero_extend_8_rr,(W4 d, R1 s))
1888 
1889 LOWFUNC(NONE,NONE,2,raw_imul_32_32,(RW4 d, R4 s))
1890 {
1891 	emit_byte(0x0f);
1892 	emit_byte(0xaf);
1893 	emit_byte(0xc0+8*d+s);
1894 }
1895 LENDFUNC(NONE,NONE,2,raw_imul_32_32,(RW4 d, R4 s))
1896 
1897 LOWFUNC(NONE,NONE,2,raw_imul_64_32,(RW4 d, RW4 s))
1898 {
1899 	if (d!=MUL_NREG1 || s!=MUL_NREG2) {
1900 		jit_abort("Bad register in IMUL: d=%d, s=%d\n",d,s);
1901 	}
1902 	emit_byte(0xf7);
1903 	emit_byte(0xea);
1904 }
1905 LENDFUNC(NONE,NONE,2,raw_imul_64_32,(RW4 d, RW4 s))
1906 
1907 LOWFUNC(NONE,NONE,2,raw_mul_64_32,(RW4 d, RW4 s))
1908 {
1909 	if (d!=MUL_NREG1 || s!=MUL_NREG2) {
1910 		jit_abort("Bad register in MUL: d=%d, s=%d",d,s);
1911 	}
1912 	emit_byte(0xf7);
1913 	emit_byte(0xe2);
1914 }
1915 LENDFUNC(NONE,NONE,2,raw_mul_64_32,(RW4 d, RW4 s))
1916 
1917 LOWFUNC(NONE,NONE,2,raw_mul_32_32,(RW4 d, R4 s))
1918 {
	abort(); /* %^$&%^$%#^ x86! */
	emit_byte(0x0f);
	emit_byte(0xaf);
	emit_byte(0xc0+8*d+s);
1923 }
1924 LENDFUNC(NONE,NONE,2,raw_mul_32_32,(RW4 d, R4 s))
1925 
1926 LOWFUNC(NONE,NONE,2,raw_mov_b_rr,(W1 d, R1 s))
1927 {
1928 	emit_byte(0x88);
1929 	emit_byte(0xc0+8*s+d);
1930 }
1931 LENDFUNC(NONE,NONE,2,raw_mov_b_rr,(W1 d, R1 s))
1932 
1933 LOWFUNC(NONE,NONE,2,raw_mov_w_rr,(W2 d, R2 s))
1934 {
1935 	emit_byte(0x66);
1936 	emit_byte(0x89);
1937 	emit_byte(0xc0+8*s+d);
1938 }
1939 LENDFUNC(NONE,NONE,2,raw_mov_w_rr,(W2 d, R2 s))
1940 
1941 LOWFUNC(NONE,READ,4,raw_mov_l_rrm_indexed,(W4 d,R4 baser, R4 index, IMM factor))
1942 {
1943 	int isebp=(baser==5)?0x40:0;
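	/* A base of EBP (101) with mod=00 would mean "disp32, no base", so the
	 * 0x40 forces mod=01 and a zero disp8 is appended below instead. */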
1944 	int fi;
1945 
1946 	switch(factor) {
1947 		case 1: fi=0; break;
1948 		case 2: fi=1; break;
1949 		case 4: fi=2; break;
1950 		case 8: fi=3; break;
1951 		default: abort();
1952 	}
1953 
1954 
1955 	emit_byte(0x8b);
1956 	emit_byte(0x04+8*d+isebp);
1957 	emit_byte(baser+8*index+0x40*fi);
1958 	if (isebp)
1959 		emit_byte(0x00);
1960 }
1961 LENDFUNC(NONE,READ,4,raw_mov_l_rrm_indexed,(W4 d,R4 baser, R4 index, IMM factor))
1962 
1963 LOWFUNC(NONE,READ,4,raw_mov_w_rrm_indexed,(W2 d, R4 baser, R4 index, IMM factor))
1964 {
1965 	int fi;
1966 	int isebp;
1967 
1968 	switch(factor) {
1969 		case 1: fi=0; break;
1970 		case 2: fi=1; break;
1971 		case 4: fi=2; break;
1972 		case 8: fi=3; break;
1973 		default: abort();
1974 	}
1975 	isebp=(baser==5)?0x40:0;
1976 
1977 	emit_byte(0x66);
1978 	emit_byte(0x8b);
1979 	emit_byte(0x04+8*d+isebp);
1980 	emit_byte(baser+8*index+0x40*fi);
1981 	if (isebp)
1982 		emit_byte(0x00);
}
1984 LENDFUNC(NONE,READ,4,raw_mov_w_rrm_indexed,(W2 d, R4 baser, R4 index, IMM factor))
1985 
1986 LOWFUNC(NONE,READ,4,raw_mov_b_rrm_indexed,(W1 d, R4 baser, R4 index, IMM factor))
1987 {
1988 	int fi;
1989 	int isebp;
1990 
1991 	switch(factor) {
1992 		case 1: fi=0; break;
1993 		case 2: fi=1; break;
1994 		case 4: fi=2; break;
1995 		case 8: fi=3; break;
1996 		default: abort();
1997 	}
1998 	isebp=(baser==5)?0x40:0;
1999 
2000 	emit_byte(0x8a);
2001 	emit_byte(0x04+8*d+isebp);
2002 	emit_byte(baser+8*index+0x40*fi);
2003 	if (isebp)
2004 		emit_byte(0x00);
2005 }
2006 LENDFUNC(NONE,READ,4,raw_mov_b_rrm_indexed,(W1 d, R4 baser, R4 index, IMM factor))
2007 
2008 LOWFUNC(NONE,WRITE,4,raw_mov_l_mrr_indexed,(R4 baser, R4 index, IMM factor, R4 s))
2009 {
2010 	int fi;
2011 	int isebp;
2012 
2013 	switch(factor) {
2014 		case 1: fi=0; break;
2015 		case 2: fi=1; break;
2016 		case 4: fi=2; break;
2017 		case 8: fi=3; break;
2018 		default: abort();
2019 	}
2020 
2021 
2022 	isebp=(baser==5)?0x40:0;
2023 
2024 	emit_byte(0x89);
2025 	emit_byte(0x04+8*s+isebp);
2026 	emit_byte(baser+8*index+0x40*fi);
2027 	if (isebp)
2028 		emit_byte(0x00);
2029 }
2030 LENDFUNC(NONE,WRITE,4,raw_mov_l_mrr_indexed,(R4 baser, R4 index, IMM factor, R4 s))
2031 
2032 LOWFUNC(NONE,WRITE,4,raw_mov_w_mrr_indexed,(R4 baser, R4 index, IMM factor, R2 s))
2033 {
2034 	int fi;
2035 	int isebp;
2036 
2037 	switch(factor) {
2038 		case 1: fi=0; break;
2039 		case 2: fi=1; break;
2040 		case 4: fi=2; break;
2041 		case 8: fi=3; break;
2042 		default: abort();
2043 	}
2044 	isebp=(baser==5)?0x40:0;
2045 
2046 	emit_byte(0x66);
2047 	emit_byte(0x89);
2048 	emit_byte(0x04+8*s+isebp);
2049 	emit_byte(baser+8*index+0x40*fi);
2050 	if (isebp)
2051 		emit_byte(0x00);
2052 }
2053 LENDFUNC(NONE,WRITE,4,raw_mov_w_mrr_indexed,(R4 baser, R4 index, IMM factor, R2 s))
2054 
2055 LOWFUNC(NONE,WRITE,4,raw_mov_b_mrr_indexed,(R4 baser, R4 index, IMM factor, R1 s))
2056 {
2057 	int fi;
2058 	int isebp;
2059 
2060 	switch(factor) {
2061 		case 1: fi=0; break;
2062 		case 2: fi=1; break;
2063 		case 4: fi=2; break;
2064 		case 8: fi=3; break;
2065 		default: abort();
2066 	}
2067 	isebp=(baser==5)?0x40:0;
2068 
2069 	emit_byte(0x88);
2070 	emit_byte(0x04+8*s+isebp);
2071 	emit_byte(baser+8*index+0x40*fi);
2072 	if (isebp)
2073 		emit_byte(0x00);
2074 }
2075 LENDFUNC(NONE,WRITE,4,raw_mov_b_mrr_indexed,(R4 baser, R4 index, IMM factor, R1 s))
2076 
2077 LOWFUNC(NONE,WRITE,5,raw_mov_l_bmrr_indexed,(IMM base, R4 baser, R4 index, IMM factor, R4 s))
2078 {
2079 	int fi;
2080 
2081 	switch(factor) {
2082 	case 1: fi=0; break;
2083 	case 2: fi=1; break;
2084 	case 4: fi=2; break;
2085 	case 8: fi=3; break;
2086 	default: abort();
2087 	}
2088 
2089 	emit_byte(0x89);
2090 	emit_byte(0x84+8*s);
2091 	emit_byte(baser+8*index+0x40*fi);
2092 	emit_long(base);
2093 }
2094 LENDFUNC(NONE,WRITE,5,raw_mov_l_bmrr_indexed,(IMM base, R4 baser, R4 index, IMM factor, R4 s))
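
/* Illustrative note (register choices arbitrary): the *_bmrr_indexed forms carry a
 * 32-bit displacement, so they use ModRM 0x84+8*s (mod=10) and never need the %ebp
 * workaround.  For example, raw_mov_l_bmrr_indexed(0x1000, EBX_INDEX, ESI_INDEX, 2,
 * EDX_INDEX) emits 89 94 73 00 10 00 00, a store of %edx to [%ebx + %esi*2 + 0x1000]. */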
2095 
2096 LOWFUNC(NONE,WRITE,5,raw_mov_w_bmrr_indexed,(IMM base, R4 baser, R4 index, IMM factor, R2 s))
2097 {
2098 	int fi;
2099 
2100 	switch(factor) {
2101 	case 1: fi=0; break;
2102 	case 2: fi=1; break;
2103 	case 4: fi=2; break;
2104 	case 8: fi=3; break;
2105 	default: abort();
2106 	}
2107 
2108 	emit_byte(0x66);
2109 	emit_byte(0x89);
2110 	emit_byte(0x84+8*s);
2111 	emit_byte(baser+8*index+0x40*fi);
2112 	emit_long(base);
2113 }
2114 LENDFUNC(NONE,WRITE,5,raw_mov_w_bmrr_indexed,(IMM base, R4 baser, R4 index, IMM factor, R2 s))
2115 
2116 LOWFUNC(NONE,WRITE,5,raw_mov_b_bmrr_indexed,(IMM base, R4 baser, R4 index, IMM factor, R1 s))
2117 {
2118 	int fi;
2119 
2120 	switch(factor) {
2121 	case 1: fi=0; break;
2122 	case 2: fi=1; break;
2123 	case 4: fi=2; break;
2124 	case 8: fi=3; break;
2125 	default: abort();
2126 	}
2127 
2128 	emit_byte(0x88);
2129 	emit_byte(0x84+8*s);
2130 	emit_byte(baser+8*index+0x40*fi);
2131 	emit_long(base);
2132 }
2133 LENDFUNC(NONE,WRITE,5,raw_mov_b_bmrr_indexed,(IMM base, R4 baser, R4 index, IMM factor, R1 s))
2134 
2135 LOWFUNC(NONE,READ,5,raw_mov_l_brrm_indexed,(W4 d, IMM base, R4 baser, R4 index, IMM factor))
2136 {
2137 	int fi;
2138 
2139 	switch(factor) {
2140 	case 1: fi=0; break;
2141 	case 2: fi=1; break;
2142 	case 4: fi=2; break;
2143 	case 8: fi=3; break;
2144 	default: abort();
2145 	}
2146 
2147 	emit_byte(0x8b);
2148 	emit_byte(0x84+8*d);
2149 	emit_byte(baser+8*index+0x40*fi);
2150 	emit_long(base);
2151 }
2152 LENDFUNC(NONE,READ,5,raw_mov_l_brrm_indexed,(W4 d, IMM base, R4 baser, R4 index, IMM factor))
2153 
2154 LOWFUNC(NONE,READ,5,raw_mov_w_brrm_indexed,(W2 d, IMM base, R4 baser, R4 index, IMM factor))
2155 {
2156 	int fi;
2157 
2158 	switch(factor) {
2159 	case 1: fi=0; break;
2160 	case 2: fi=1; break;
2161 	case 4: fi=2; break;
2162 	case 8: fi=3; break;
2163 	default: abort();
2164 	}
2165 
2166 	emit_byte(0x66);
2167 	emit_byte(0x8b);
2168 	emit_byte(0x84+8*d);
2169 	emit_byte(baser+8*index+0x40*fi);
2170 	emit_long(base);
2171 }
2172 LENDFUNC(NONE,READ,5,raw_mov_w_brrm_indexed,(W2 d, IMM base, R4 baser, R4 index, IMM factor))
2173 
2174 LOWFUNC(NONE,READ,5,raw_mov_b_brrm_indexed,(W1 d, IMM base, R4 baser, R4 index, IMM factor))
2175 {
2176 	int fi;
2177 
2178 	switch(factor) {
2179 	case 1: fi=0; break;
2180 	case 2: fi=1; break;
2181 	case 4: fi=2; break;
2182 	case 8: fi=3; break;
2183 	default: abort();
2184 	}
2185 
2186 	emit_byte(0x8a);
2187 	emit_byte(0x84+8*d);
2188 	emit_byte(baser+8*index+0x40*fi);
2189 	emit_long(base);
2190 }
2191 LENDFUNC(NONE,READ,5,raw_mov_b_brrm_indexed,(W1 d, IMM base, R4 baser, R4 index, IMM factor))
2192 
2193 LOWFUNC(NONE,READ,4,raw_mov_l_rm_indexed,(W4 d, IMM base, R4 index, IMM factor))
2194 {
2195 	int fi;
2196 	switch(factor) {
2197 	case 1: fi=0; break;
2198 	case 2: fi=1; break;
2199 	case 4: fi=2; break;
2200 	case 8: fi=3; break;
2201 	default:
2202 		fprintf(stderr,"Bad factor %d in mov_l_rm_indexed!\n",factor);
2203 		abort();
2204 	}
2205 	emit_byte(0x8b);
2206 	emit_byte(0x04+8*d);
2207 	emit_byte(0x05+8*index+64*fi);
2208 	emit_long(base);
2209 }
2210 LENDFUNC(NONE,READ,4,raw_mov_l_rm_indexed,(W4 d, IMM base, R4 index, IMM factor))
2211 
2212 LOWFUNC(NONE,READ,5,raw_cmov_l_rm_indexed,(W4 d, IMM base, R4 index, IMM factor, IMM cond))
2213 {
2214 	int fi;
2215 	switch(factor) {
2216 	case 1: fi=0; break;
2217 	case 2: fi=1; break;
2218 	case 4: fi=2; break;
2219 	case 8: fi=3; break;
2220 	default:
2221 		fprintf(stderr,"Bad factor %d in cmov_l_rm_indexed!\n",factor);
2222 		abort();
2223 	}
2224 	if (have_cmov) {
2225 		emit_byte(0x0f);
2226 		emit_byte(0x40+cond);
2227 		emit_byte(0x04+8*d);
2228 		emit_byte(0x05+8*index+64*fi);
2229 		emit_long(base);
2230 	}
2231 	else { /* replacement using branch and mov */
2232 		int uncc=(cond^1);
2233 		emit_byte(0x70+uncc);
2234 		emit_byte(7);  /* skip next 7 bytes if not cc=true */
2235 		emit_byte(0x8b);
2236 		emit_byte(0x04+8*d);
2237 		emit_byte(0x05+8*index+64*fi);
2238 		emit_long(base);
2239 	}
2240 }
2241 LENDFUNC(NONE,READ,5,raw_cmov_l_rm_indexed,(W4 d, IMM base, R4 index, IMM factor, IMM cond))
2242 
2243 LOWFUNC(NONE,READ,3,raw_cmov_l_rm,(W4 d, IMM mem, IMM cond))
2244 {
2245 	if (have_cmov) {
2246 		emit_byte(0x0f);
2247 		emit_byte(0x40+cond);
2248 		emit_byte(0x05+8*d);
2249 		emit_long(mem);
2250 	}
2251 	else { /* replacement using branch and mov */
2252 		int uncc=(cond^1);
2253 		emit_byte(0x70+uncc);
2254 		emit_byte(6);  /* skip next 6 bytes if not cc=true */
2255 		emit_byte(0x8b);
2256 		emit_byte(0x05+8*d);
2257 		emit_long(mem);
2258 	}
2259 }
2260 LENDFUNC(NONE,READ,3,raw_cmov_l_rm,(W4 d, IMM mem, IMM cond))
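
/* Illustrative note: cond is the standard x86 condition-code nibble (4 = E/Z,
 * 5 = NE/NZ, ...).  With CMOV available, raw_cmov_l_rm(d, mem, 4) emits 0F 44 /r
 * (cmovz); without it, the fallback inverts the condition and branches over a
 * plain load: 75 06 (jnz +6) followed by the 6-byte 8B 05 <mem>. */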
2261 
2262 LOWFUNC(NONE,READ,3,raw_mov_l_rR,(W4 d, R4 s, IMM offset))
2263 {
2264 	Dif(!isbyte(offset)) abort();
2265 	emit_byte(0x8b);
2266 	emit_byte(0x40+8*d+s);
2267 	emit_byte(offset);
2268 }
2269 LENDFUNC(NONE,READ,3,raw_mov_l_rR,(W4 d, R4 s, IMM offset))
2270 
2271 LOWFUNC(NONE,READ,3,raw_mov_w_rR,(W2 d, R4 s, IMM offset))
2272 {
2273 	Dif(!isbyte(offset)) abort();
2274 	emit_byte(0x66);
2275 	emit_byte(0x8b);
2276 	emit_byte(0x40+8*d+s);
2277 	emit_byte(offset);
2278 }
2279 LENDFUNC(NONE,READ,3,raw_mov_w_rR,(W2 d, R4 s, IMM offset))
2280 
2281 LOWFUNC(NONE,READ,3,raw_mov_b_rR,(W1 d, R4 s, IMM offset))
2282 {
2283 	Dif(!isbyte(offset)) abort();
2284 	emit_byte(0x8a);
2285 	emit_byte(0x40+8*d+s);
2286 	emit_byte(offset);
2287 }
2288 LENDFUNC(NONE,READ,3,raw_mov_b_rR,(W1 d, R4 s, IMM offset))
2289 
2290 LOWFUNC(NONE,READ,3,raw_mov_l_brR,(W4 d, R4 s, IMM offset))
2291 {
2292 	emit_byte(0x8b);
2293 	emit_byte(0x80+8*d+s);
2294 	emit_long(offset);
2295 }
2296 LENDFUNC(NONE,READ,3,raw_mov_l_brR,(W4 d, R4 s, IMM offset))
2297 
2298 LOWFUNC(NONE,READ,3,raw_mov_w_brR,(W2 d, R4 s, IMM offset))
2299 {
2300 	emit_byte(0x66);
2301 	emit_byte(0x8b);
2302 	emit_byte(0x80+8*d+s);
2303 	emit_long(offset);
2304 }
2305 LENDFUNC(NONE,READ,3,raw_mov_w_brR,(W2 d, R4 s, IMM offset))
2306 
2307 LOWFUNC(NONE,READ,3,raw_mov_b_brR,(W1 d, R4 s, IMM offset))
2308 {
2309 	emit_byte(0x8a);
2310 	emit_byte(0x80+8*d+s);
2311 	emit_long(offset);
2312 }
2313 LENDFUNC(NONE,READ,3,raw_mov_b_brR,(W1 d, R4 s, IMM offset))
2314 
2315 LOWFUNC(NONE,WRITE,3,raw_mov_l_Ri,(R4 d, IMM i, IMM offset))
2316 {
2317 	Dif(!isbyte(offset)) abort();
2318 	emit_byte(0xc7);
2319 	emit_byte(0x40+d);
2320 	emit_byte(offset);
2321 	emit_long(i);
2322 }
2323 LENDFUNC(NONE,WRITE,3,raw_mov_l_Ri,(R4 d, IMM i, IMM offset))
2324 
2325 LOWFUNC(NONE,WRITE,3,raw_mov_w_Ri,(R4 d, IMM i, IMM offset))
2326 {
2327 	Dif(!isbyte(offset)) abort();
2328 	emit_byte(0x66);
2329 	emit_byte(0xc7);
2330 	emit_byte(0x40+d);
2331 	emit_byte(offset);
2332 	emit_word(i);
2333 }
2334 LENDFUNC(NONE,WRITE,3,raw_mov_w_Ri,(R4 d, IMM i, IMM offset))
2335 
2336 LOWFUNC(NONE,WRITE,3,raw_mov_b_Ri,(R4 d, IMM i, IMM offset))
2337 {
2338 	Dif(!isbyte(offset)) abort();
2339 	emit_byte(0xc6);
2340 	emit_byte(0x40+d);
2341 	emit_byte(offset);
2342 	emit_byte(i);
2343 }
2344 LENDFUNC(NONE,WRITE,3,raw_mov_b_Ri,(R4 d, IMM i, IMM offset))
2345 
2346 LOWFUNC(NONE,WRITE,3,raw_mov_l_Rr,(R4 d, R4 s, IMM offset))
2347 {
2348 	Dif(!isbyte(offset)) abort();
2349 	emit_byte(0x89);
2350 	emit_byte(0x40+8*s+d);
2351 	emit_byte(offset);
2352 }
2353 LENDFUNC(NONE,WRITE,3,raw_mov_l_Rr,(R4 d, R4 s, IMM offset))
2354 
2355 LOWFUNC(NONE,WRITE,3,raw_mov_w_Rr,(R4 d, R2 s, IMM offset))
2356 {
2357 	Dif(!isbyte(offset)) abort();
2358 	emit_byte(0x66);
2359 	emit_byte(0x89);
2360 	emit_byte(0x40+8*s+d);
2361 	emit_byte(offset);
2362 }
2363 LENDFUNC(NONE,WRITE,3,raw_mov_w_Rr,(R4 d, R2 s, IMM offset))
2364 
2365 LOWFUNC(NONE,WRITE,3,raw_mov_b_Rr,(R4 d, R1 s, IMM offset))
2366 {
2367 	Dif(!isbyte(offset)) abort();
2368 	emit_byte(0x88);
2369 	emit_byte(0x40+8*s+d);
2370 	emit_byte(offset);
2371 }
2372 LENDFUNC(NONE,WRITE,3,raw_mov_b_Rr,(R4 d, R1 s, IMM offset))
2373 
2374 LOWFUNC(NONE,NONE,3,raw_lea_l_brr,(W4 d, R4 s, IMM offset))
2375 {
2376 	if (optimize_imm8 && isbyte(offset)) {
2377 		emit_byte(0x8d);
2378 		emit_byte(0x40+8*d+s);
2379 		emit_byte(offset);
2380 	}
2381 	else {
2382 		emit_byte(0x8d);
2383 		emit_byte(0x80+8*d+s);
2384 		emit_long(offset);
2385 	}
2386 }
2387 LENDFUNC(NONE,NONE,3,raw_lea_l_brr,(W4 d, R4 s, IMM offset))
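
/* Illustrative note: the optimize_imm8 path saves three bytes per LEA whenever the
 * displacement fits in a signed byte.  E.g. raw_lea_l_brr(EAX_INDEX, ESI_INDEX, 0x7c)
 * emits 8D 46 7C (lea 0x7c(%esi),%eax); with offset 0x1234 it falls back to
 * 8D 86 34 12 00 00. */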
2388 
2389 LOWFUNC(NONE,NONE,5,raw_lea_l_brr_indexed,(W4 d, R4 s, R4 index, IMM factor, IMM offset))
2390 {
2391 	int fi;
2392 
2393 	switch(factor) {
2394 	case 1: fi=0; break;
2395 	case 2: fi=1; break;
2396 	case 4: fi=2; break;
2397 	case 8: fi=3; break;
2398 	default: abort();
2399 	}
2400 
2401 	if (optimize_imm8 && isbyte(offset)) {
2402 		emit_byte(0x8d);
2403 		emit_byte(0x44+8*d);
2404 		emit_byte(0x40*fi+8*index+s);
2405 		emit_byte(offset);
2406 	}
2407 	else {
2408 		emit_byte(0x8d);
2409 		emit_byte(0x84+8*d);
2410 		emit_byte(0x40*fi+8*index+s);
2411 		emit_long(offset);
2412 	}
2413 }
2414 LENDFUNC(NONE,NONE,5,raw_lea_l_brr_indexed,(W4 d, R4 s, R4 index, IMM factor, IMM offset))
2415 
2416 LOWFUNC(NONE,NONE,4,raw_lea_l_rr_indexed,(W4 d, R4 s, R4 index, IMM factor))
2417 {
2418 	int isebp=(s==5)?0x40:0;
2419 	int fi;
2420 
2421 	switch(factor) {
2422 	case 1: fi=0; break;
2423 	case 2: fi=1; break;
2424 	case 4: fi=2; break;
2425 	case 8: fi=3; break;
2426 	default: abort();
2427 	}
2428 
2429 	emit_byte(0x8d);
2430 	emit_byte(0x04+8*d+isebp);
2431 	emit_byte(0x40*fi+8*index+s);
2432 	if (isebp)
2433 		emit_byte(0);
2434 }
2435 LENDFUNC(NONE,NONE,4,raw_lea_l_rr_indexed,(W4 d, R4 s, R4 index, IMM factor))
2436 
2437 LOWFUNC(NONE,WRITE,3,raw_mov_l_bRr,(R4 d, R4 s, IMM offset))
2438 {
2439 	if (optimize_imm8 && isbyte(offset)) {
2440 		emit_byte(0x89);
2441 		emit_byte(0x40+8*s+d);
2442 		emit_byte(offset);
2443 	}
2444 	else {
2445 		emit_byte(0x89);
2446 		emit_byte(0x80+8*s+d);
2447 		emit_long(offset);
2448 	}
2449 }
2450 LENDFUNC(NONE,WRITE,3,raw_mov_l_bRr,(R4 d, R4 s, IMM offset))
2451 
2452 LOWFUNC(NONE,WRITE,3,raw_mov_w_bRr,(R4 d, R2 s, IMM offset))
2453 {
2454 	emit_byte(0x66);
2455 	emit_byte(0x89);
2456 	emit_byte(0x80+8*s+d);
2457 	emit_long(offset);
2458 }
2459 LENDFUNC(NONE,WRITE,3,raw_mov_w_bRr,(R4 d, R2 s, IMM offset))
2460 
2461 LOWFUNC(NONE,WRITE,3,raw_mov_b_bRr,(R4 d, R1 s, IMM offset))
2462 {
2463 	if (optimize_imm8 && isbyte(offset)) {
2464 		emit_byte(0x88);
2465 		emit_byte(0x40+8*s+d);
2466 		emit_byte(offset);
2467 	}
2468 	else {
2469 		emit_byte(0x88);
2470 		emit_byte(0x80+8*s+d);
2471 		emit_long(offset);
2472 	}
2473 }
2474 LENDFUNC(NONE,WRITE,3,raw_mov_b_bRr,(R4 d, R1 s, IMM offset))
2475 
2476 LOWFUNC(NONE,NONE,1,raw_bswap_32,(RW4 r))
2477 {
2478 	emit_byte(0x0f);
2479 	emit_byte(0xc8+r);
2480 }
2481 LENDFUNC(NONE,NONE,1,raw_bswap_32,(RW4 r))
2482 
2483 LOWFUNC(WRITE,NONE,1,raw_bswap_16,(RW2 r))
2484 {
2485 	emit_byte(0x66);
2486 	emit_byte(0xc1);
2487 	emit_byte(0xc0+r);
2488 	emit_byte(0x08);
2489 }
2490 LENDFUNC(WRITE,NONE,1,raw_bswap_16,(RW2 r))
2491 
2492 LOWFUNC(NONE,NONE,2,raw_mov_l_rr,(W4 d, R4 s))
2493 {
2494 	emit_byte(0x89);
2495 	emit_byte(0xc0+8*s+d);
2496 }
2497 LENDFUNC(NONE,NONE,2,raw_mov_l_rr,(W4 d, R4 s))
2498 
2499 LOWFUNC(NONE,WRITE,2,raw_mov_l_mr,(IMM d, R4 s))
2500 {
2501 	emit_byte(0x89);
2502 	emit_byte(0x05+8*s);
2503 	emit_long(d);
2504 }
2505 LENDFUNC(NONE,WRITE,2,raw_mov_l_mr,(IMM d, R4 s))
2506 
2507 LOWFUNC(NONE,WRITE,2,raw_mov_w_mr,(IMM d, R2 s))
2508 {
2509 	emit_byte(0x66);
2510 	emit_byte(0x89);
2511 	emit_byte(0x05+8*s);
2512 	emit_long(d);
2513 }
2514 LENDFUNC(NONE,WRITE,2,raw_mov_w_mr,(IMM d, R2 s))
2515 
2516 LOWFUNC(NONE,READ,2,raw_mov_w_rm,(W2 d, IMM s))
2517 {
2518 	emit_byte(0x66);
2519 	emit_byte(0x8b);
2520 	emit_byte(0x05+8*d);
2521 	emit_long(s);
2522 }
2523 LENDFUNC(NONE,READ,2,raw_mov_w_rm,(W2 d, IMM s))
2524 
2525 LOWFUNC(NONE,WRITE,2,raw_mov_b_mr,(IMM d, R1 s))
2526 {
2527 	emit_byte(0x88);
2528 	emit_byte(0x05+8*(s&0xf)); /* XXX this handles %ah case (defined as 0x10+4) and others */
2529 	emit_long(d);
2530 }
2531 LENDFUNC(NONE,WRITE,2,raw_mov_b_mr,(IMM d, R1 s))
2532 
2533 LOWFUNC(NONE,READ,2,raw_mov_b_rm,(W1 d, IMM s))
2534 {
2535 	emit_byte(0x8a);
2536 	emit_byte(0x05+8*d);
2537 	emit_long(s);
2538 }
2539 LENDFUNC(NONE,READ,2,raw_mov_b_rm,(W1 d, IMM s))
2540 
2541 LOWFUNC(NONE,NONE,2,raw_mov_l_ri,(W4 d, IMM s))
2542 {
2543 	emit_byte(0xb8+d);
2544 	emit_long(s);
2545 }
2546 LENDFUNC(NONE,NONE,2,raw_mov_l_ri,(W4 d, IMM s))
2547 
2548 LOWFUNC(NONE,NONE,2,raw_mov_w_ri,(W2 d, IMM s))
2549 {
2550 	emit_byte(0x66);
2551 	emit_byte(0xb8+d);
2552 	emit_word(s);
2553 }
2554 LENDFUNC(NONE,NONE,2,raw_mov_w_ri,(W2 d, IMM s))
2555 
2556 LOWFUNC(NONE,NONE,2,raw_mov_b_ri,(W1 d, IMM s))
2557 {
2558 	emit_byte(0xb0+d);
2559 	emit_byte(s);
2560 }
2561 LENDFUNC(NONE,NONE,2,raw_mov_b_ri,(W1 d, IMM s))
2562 
2563 LOWFUNC(RMW,RMW,2,raw_adc_l_mi,(MEMRW d, IMM s))
2564 {
2565 	emit_byte(0x81);
2566 	emit_byte(0x15);
2567 	emit_long(d);
2568 	emit_long(s);
2569 }
2570 LENDFUNC(RMW,RMW,2,raw_adc_l_mi,(MEMRW d, IMM s))
2571 
2572 LOWFUNC(WRITE,RMW,2,raw_add_l_mi,(IMM d, IMM s))
2573 {
2574 	if (optimize_imm8 && isbyte(s)) {
2575 		emit_byte(0x83);
2576 		emit_byte(0x05);
2577 		emit_long(d);
2578 		emit_byte(s);
2579 	}
2580 	else {
2581 		emit_byte(0x81);
2582 		emit_byte(0x05);
2583 		emit_long(d);
2584 		emit_long(s);
2585 	}
2586 }
2587 LENDFUNC(WRITE,RMW,2,raw_add_l_mi,(IMM d, IMM s))
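
/* Illustrative note: opcode 0x83 /0 is "add r/m32, imm8" with the immediate
 * sign-extended, so the short form is only taken when isbyte(s) holds; e.g. s = 4
 * emits 83 05 <disp32> 04 instead of 81 05 <disp32> 04 00 00 00. */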
2588 
2589 LOWFUNC(WRITE,RMW,2,raw_add_w_mi,(IMM d, IMM s))
2590 {
2591 	emit_byte(0x66);
2592 	emit_byte(0x81);
2593 	emit_byte(0x05);
2594 	emit_long(d);
2595 	emit_word(s);
2596 }
2597 LENDFUNC(WRITE,RMW,2,raw_add_w_mi,(IMM d, IMM s))
2598 
2599 LOWFUNC(WRITE,RMW,2,raw_add_b_mi,(IMM d, IMM s))
2600 {
2601 	emit_byte(0x80);
2602 	emit_byte(0x05);
2603 	emit_long(d);
2604 	emit_byte(s);
2605 }
2606 LENDFUNC(WRITE,RMW,2,raw_add_b_mi,(IMM d, IMM s))
2607 
2608 LOWFUNC(WRITE,NONE,2,raw_test_l_ri,(R4 d, IMM i))
2609 {
2610 	if (optimize_accum && isaccum(d))
2611 		emit_byte(0xa9);
2612 	else {
2613 		emit_byte(0xf7);
2614 		emit_byte(0xc0+d);
2615 	}
2616 	emit_long(i);
2617 }
2618 LENDFUNC(WRITE,NONE,2,raw_test_l_ri,(R4 d, IMM i))
2619 
2620 LOWFUNC(WRITE,NONE,2,raw_test_l_rr,(R4 d, R4 s))
2621 {
2622 	emit_byte(0x85);
2623 	emit_byte(0xc0+8*s+d);
2624 }
2625 LENDFUNC(WRITE,NONE,2,raw_test_l_rr,(R4 d, R4 s))
2626 
2627 LOWFUNC(WRITE,NONE,2,raw_test_w_rr,(R2 d, R2 s))
2628 {
2629 	emit_byte(0x66);
2630 	emit_byte(0x85);
2631 	emit_byte(0xc0+8*s+d);
2632 }
2633 LENDFUNC(WRITE,NONE,2,raw_test_w_rr,(R2 d, R2 s))
2634 
2635 LOWFUNC(WRITE,NONE,2,raw_test_b_rr,(R1 d, R1 s))
2636 {
2637 	emit_byte(0x84);
2638 	emit_byte(0xc0+8*s+d);
2639 }
2640 LENDFUNC(WRITE,NONE,2,raw_test_b_rr,(R1 d, R1 s))
2641 
2642 LOWFUNC(WRITE,NONE,2,raw_xor_l_ri,(RW4 d, IMM i))
2643 {
2644     emit_byte(0x81);
2645     emit_byte(0xf0+d);
2646     emit_long(i);
2647 }
2648 LENDFUNC(WRITE,NONE,2,raw_xor_l_ri,(RW4 d, IMM i))
2649 
2650 LOWFUNC(WRITE,NONE,2,raw_and_l_ri,(RW4 d, IMM i))
2651 {
2652 	if (optimize_imm8 && isbyte(i)) {
2653 		emit_byte(0x83);
2654 		emit_byte(0xe0+d);
2655 		emit_byte(i);
2656 	}
2657 	else {
2658 		if (optimize_accum && isaccum(d))
2659 			emit_byte(0x25);
2660 		else {
2661 			emit_byte(0x81);
2662 			emit_byte(0xe0+d);
2663 		}
2664 		emit_long(i);
2665 	}
2666 }
2667 LENDFUNC(WRITE,NONE,2,raw_and_l_ri,(RW4 d, IMM i))
2668 
2669 LOWFUNC(WRITE,NONE,2,raw_and_w_ri,(RW2 d, IMM i))
2670 {
2671 	emit_byte(0x66);
2672 	if (optimize_imm8 && isbyte(i)) {
2673 		emit_byte(0x83);
2674 		emit_byte(0xe0+d);
2675 		emit_byte(i);
2676 	}
2677 	else {
2678 		if (optimize_accum && isaccum(d))
2679 			emit_byte(0x25);
2680 		else {
2681 			emit_byte(0x81);
2682 			emit_byte(0xe0+d);
2683 		}
2684 		emit_word(i);
2685 	}
2686 }
2687 LENDFUNC(WRITE,NONE,2,raw_and_w_ri,(RW2 d, IMM i))
2688 
2689 LOWFUNC(WRITE,NONE,2,raw_and_l,(RW4 d, R4 s))
2690 {
2691 	emit_byte(0x21);
2692 	emit_byte(0xc0+8*s+d);
2693 }
2694 LENDFUNC(WRITE,NONE,2,raw_and_l,(RW4 d, R4 s))
2695 
2696 LOWFUNC(WRITE,NONE,2,raw_and_w,(RW2 d, R2 s))
2697 {
2698 	emit_byte(0x66);
2699 	emit_byte(0x21);
2700 	emit_byte(0xc0+8*s+d);
2701 }
2702 LENDFUNC(WRITE,NONE,2,raw_and_w,(RW2 d, R2 s))
2703 
2704 LOWFUNC(WRITE,NONE,2,raw_and_b,(RW1 d, R1 s))
2705 {
2706 	emit_byte(0x20);
2707 	emit_byte(0xc0+8*s+d);
2708 }
2709 LENDFUNC(WRITE,NONE,2,raw_and_b,(RW1 d, R1 s))
2710 
2711 LOWFUNC(WRITE,NONE,2,raw_or_l_ri,(RW4 d, IMM i))
2712 {
2713 	if (optimize_imm8 && isbyte(i)) {
2714 		emit_byte(0x83);
2715 		emit_byte(0xc8+d);
2716 		emit_byte(i);
2717 	}
2718 	else {
2719 		if (optimize_accum && isaccum(d))
2720 			emit_byte(0x0d);
2721 		else {
2722 			emit_byte(0x81);
2723 			emit_byte(0xc8+d);
2724 		}
2725 		emit_long(i);
2726 	}
2727 }
2728 LENDFUNC(WRITE,NONE,2,raw_or_l_ri,(RW4 d, IMM i))
2729 
2730 LOWFUNC(WRITE,NONE,2,raw_or_l,(RW4 d, R4 s))
2731 {
2732 	emit_byte(0x09);
2733 	emit_byte(0xc0+8*s+d);
2734 }
2735 LENDFUNC(WRITE,NONE,2,raw_or_l,(RW4 d, R4 s))
2736 
2737 LOWFUNC(WRITE,NONE,2,raw_or_w,(RW2 d, R2 s))
2738 {
2739 	emit_byte(0x66);
2740 	emit_byte(0x09);
2741 	emit_byte(0xc0+8*s+d);
2742 }
2743 LENDFUNC(WRITE,NONE,2,raw_or_w,(RW2 d, R2 s))
2744 
2745 LOWFUNC(WRITE,NONE,2,raw_or_b,(RW1 d, R1 s))
2746 {
2747 	emit_byte(0x08);
2748 	emit_byte(0xc0+8*s+d);
2749 }
2750 LENDFUNC(WRITE,NONE,2,raw_or_b,(RW1 d, R1 s))
2751 
2752 LOWFUNC(RMW,NONE,2,raw_adc_l,(RW4 d, R4 s))
2753 {
2754 	emit_byte(0x11);
2755 	emit_byte(0xc0+8*s+d);
2756 }
2757 LENDFUNC(RMW,NONE,2,raw_adc_l,(RW4 d, R4 s))
2758 
2759 LOWFUNC(RMW,NONE,2,raw_adc_w,(RW2 d, R2 s))
2760 {
2761 	emit_byte(0x66);
2762 	emit_byte(0x11);
2763 	emit_byte(0xc0+8*s+d);
2764 }
2765 LENDFUNC(RMW,NONE,2,raw_adc_w,(RW2 d, R2 s))
2766 
2767 LOWFUNC(RMW,NONE,2,raw_adc_b,(RW1 d, R1 s))
2768 {
2769 	emit_byte(0x10);
2770 	emit_byte(0xc0+8*s+d);
2771 }
2772 LENDFUNC(RMW,NONE,2,raw_adc_b,(RW1 d, R1 s))
2773 
2774 LOWFUNC(WRITE,NONE,2,raw_add_l,(RW4 d, R4 s))
2775 {
2776 	emit_byte(0x01);
2777 	emit_byte(0xc0+8*s+d);
2778 }
2779 LENDFUNC(WRITE,NONE,2,raw_add_l,(RW4 d, R4 s))
2780 
2781 LOWFUNC(WRITE,NONE,2,raw_add_w,(RW2 d, R2 s))
2782 {
2783 	emit_byte(0x66);
2784 	emit_byte(0x01);
2785 	emit_byte(0xc0+8*s+d);
2786 }
2787 LENDFUNC(WRITE,NONE,2,raw_add_w,(RW2 d, R2 s))
2788 
2789 LOWFUNC(WRITE,NONE,2,raw_add_b,(RW1 d, R1 s))
2790 {
2791 	emit_byte(0x00);
2792 	emit_byte(0xc0+8*s+d);
2793 }
2794 LENDFUNC(WRITE,NONE,2,raw_add_b,(RW1 d, R1 s))
2795 
2796 LOWFUNC(WRITE,NONE,2,raw_sub_l_ri,(RW4 d, IMM i))
2797 {
2798 	if (isbyte(i)) {
2799 		emit_byte(0x83);
2800 		emit_byte(0xe8+d);
2801 		emit_byte(i);
2802 	}
2803 	else {
2804 		if (optimize_accum && isaccum(d))
2805 			emit_byte(0x2d);
2806 		else {
2807 			emit_byte(0x81);
2808 			emit_byte(0xe8+d);
2809 		}
2810 		emit_long(i);
2811 	}
2812 }
2813 LENDFUNC(WRITE,NONE,2,raw_sub_l_ri,(RW4 d, IMM i))
2814 
2815 LOWFUNC(WRITE,NONE,2,raw_sub_b_ri,(RW1 d, IMM i))
2816 {
2817 	if (optimize_accum && isaccum(d))
2818 		emit_byte(0x2c);
2819 	else {
2820 		emit_byte(0x80);
2821 		emit_byte(0xe8+d);
2822 	}
2823 	emit_byte(i);
2824 }
2825 LENDFUNC(WRITE,NONE,2,raw_sub_b_ri,(RW1 d, IMM i))
2826 
2827 LOWFUNC(WRITE,NONE,2,raw_add_l_ri,(RW4 d, IMM i))
2828 {
2829 	if (isbyte(i)) {
2830 		emit_byte(0x83);
2831 		emit_byte(0xc0+d);
2832 		emit_byte(i);
2833 	}
2834 	else {
2835 		if (optimize_accum && isaccum(d))
2836 			emit_byte(0x05);
2837 		else {
2838 			emit_byte(0x81);
2839 			emit_byte(0xc0+d);
2840 		}
2841 		emit_long(i);
2842 	}
2843 }
2844 LENDFUNC(WRITE,NONE,2,raw_add_l_ri,(RW4 d, IMM i))
2845 
2846 LOWFUNC(WRITE,NONE,2,raw_add_w_ri,(RW2 d, IMM i))
2847 {
2848 	emit_byte(0x66);
2849 	if (isbyte(i)) {
2850 		emit_byte(0x83);
2851 		emit_byte(0xc0+d);
2852 		emit_byte(i);
2853 	}
2854 	else {
2855 		if (optimize_accum && isaccum(d))
2856 			emit_byte(0x05);
2857 		else {
2858 			emit_byte(0x81);
2859 			emit_byte(0xc0+d);
2860 		}
2861 		emit_word(i);
2862 	}
2863 }
2864 LENDFUNC(WRITE,NONE,2,raw_add_w_ri,(RW2 d, IMM i))
2865 
2866 LOWFUNC(WRITE,NONE,2,raw_add_b_ri,(RW1 d, IMM i))
2867 {
2868 	if (optimize_accum && isaccum(d))
2869 		emit_byte(0x04);
2870 	else {
2871 		emit_byte(0x80);
2872 		emit_byte(0xc0+d);
2873 	}
2874 	emit_byte(i);
2875 }
2876 LENDFUNC(WRITE,NONE,2,raw_add_b_ri,(RW1 d, IMM i))
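
/* Illustrative note: the optimize_accum cases use the one-byte accumulator opcodes
 * (04/05/0D/25/2C/2D/3C/3D), which need no ModRM byte.  E.g. for d == EAX_INDEX,
 * raw_add_b_ri emits 04 ib ("add imm8 to %al") instead of 80 C0 ib; isaccum(d)
 * presumably just tests for the accumulator register. */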
2877 
2878 LOWFUNC(RMW,NONE,2,raw_sbb_l,(RW4 d, R4 s))
2879 {
2880 	emit_byte(0x19);
2881 	emit_byte(0xc0+8*s+d);
2882 }
2883 LENDFUNC(RMW,NONE,2,raw_sbb_l,(RW4 d, R4 s))
2884 
2885 LOWFUNC(RMW,NONE,2,raw_sbb_w,(RW2 d, R2 s))
2886 {
2887 	emit_byte(0x66);
2888 	emit_byte(0x19);
2889 	emit_byte(0xc0+8*s+d);
2890 }
2891 LENDFUNC(RMW,NONE,2,raw_sbb_w,(RW2 d, R2 s))
2892 
2893 LOWFUNC(RMW,NONE,2,raw_sbb_b,(RW1 d, R1 s))
2894 {
2895 	emit_byte(0x18);
2896 	emit_byte(0xc0+8*s+d);
2897 }
2898 LENDFUNC(RMW,NONE,2,raw_sbb_b,(RW1 d, R1 s))
2899 
2900 LOWFUNC(WRITE,NONE,2,raw_sub_l,(RW4 d, R4 s))
2901 {
2902 	emit_byte(0x29);
2903 	emit_byte(0xc0+8*s+d);
2904 }
2905 LENDFUNC(WRITE,NONE,2,raw_sub_l,(RW4 d, R4 s))
2906 
2907 LOWFUNC(WRITE,NONE,2,raw_sub_w,(RW2 d, R2 s))
2908 {
2909 	emit_byte(0x66);
2910 	emit_byte(0x29);
2911 	emit_byte(0xc0+8*s+d);
2912 }
2913 LENDFUNC(WRITE,NONE,2,raw_sub_w,(RW2 d, R2 s))
2914 
2915 LOWFUNC(WRITE,NONE,2,raw_sub_b,(RW1 d, R1 s))
2916 {
2917 	emit_byte(0x28);
2918 	emit_byte(0xc0+8*s+d);
2919 }
2920 LENDFUNC(WRITE,NONE,2,raw_sub_b,(RW1 d, R1 s))
2921 
2922 LOWFUNC(WRITE,NONE,2,raw_cmp_l,(R4 d, R4 s))
2923 {
2924 	emit_byte(0x39);
2925 	emit_byte(0xc0+8*s+d);
2926 }
2927 LENDFUNC(WRITE,NONE,2,raw_cmp_l,(R4 d, R4 s))
2928 
2929 LOWFUNC(WRITE,NONE,2,raw_cmp_l_ri,(R4 r, IMM i))
2930 {
2931 	if (optimize_imm8 && isbyte(i)) {
2932 		emit_byte(0x83);
2933 		emit_byte(0xf8+r);
2934 		emit_byte(i);
2935 	}
2936 	else {
2937 		if (optimize_accum && isaccum(r))
2938 			emit_byte(0x3d);
2939 		else {
2940 			emit_byte(0x81);
2941 			emit_byte(0xf8+r);
2942 		}
2943 		emit_long(i);
2944 	}
2945 }
2946 LENDFUNC(WRITE,NONE,2,raw_cmp_l_ri,(R4 r, IMM i))
2947 
2948 LOWFUNC(WRITE,NONE,2,raw_cmp_w,(R2 d, R2 s))
2949 {
2950 	emit_byte(0x66);
2951 	emit_byte(0x39);
2952 	emit_byte(0xc0+8*s+d);
2953 }
2954 LENDFUNC(WRITE,NONE,2,raw_cmp_w,(R2 d, R2 s))
2955 
2956 LOWFUNC(WRITE,READ,2,raw_cmp_b_mi,(MEMR d, IMM s))
2957 {
2958     emit_byte(0x80);
2959     emit_byte(0x3d);
2960     emit_long(d);
2961     emit_byte(s);
2962 }
2963 LENDFUNC(WRITE,READ,2,raw_cmp_b_mi,(MEMR d, IMM s))
2964 
2965 LOWFUNC(WRITE,NONE,2,raw_cmp_b_ri,(R1 d, IMM i))
2966 {
2967 	if (optimize_accum && isaccum(d))
2968 		emit_byte(0x3c);
2969 	else {
2970 		emit_byte(0x80);
2971 		emit_byte(0xf8+d);
2972 	}
2973 	emit_byte(i);
2974 }
2975 LENDFUNC(WRITE,NONE,2,raw_cmp_b_ri,(R1 d, IMM i))
2976 
2977 LOWFUNC(WRITE,NONE,2,raw_cmp_b,(R1 d, R1 s))
2978 {
2979 	emit_byte(0x38);
2980 	emit_byte(0xc0+8*s+d);
2981 }
2982 LENDFUNC(WRITE,NONE,2,raw_cmp_b,(R1 d, R1 s))
2983 
2984 LOWFUNC(WRITE,READ,4,raw_cmp_l_rm_indexed,(R4 d, IMM offset, R4 index, IMM factor))
2985 {
2986     int fi;
2987 
2988     switch(factor) {
2989      case 1: fi=0; break;
2990      case 2: fi=1; break;
2991      case 4: fi=2; break;
2992      case 8: fi=3; break;
2993      default: abort();
2994     }
2995     emit_byte(0x39);
2996     emit_byte(0x04+8*d);
2997     emit_byte(5+8*index+0x40*fi);
2998     emit_long(offset);
2999 }
3000 LENDFUNC(WRITE,READ,4,raw_cmp_l_rm_indexed,(R4 d, IMM offset, R4 index, IMM factor))
3001 
3002 LOWFUNC(WRITE,NONE,2,raw_xor_l,(RW4 d, R4 s))
3003 {
3004 	emit_byte(0x31);
3005 	emit_byte(0xc0+8*s+d);
3006 }
3007 LENDFUNC(WRITE,NONE,2,raw_xor_l,(RW4 d, R4 s))
3008 
3009 LOWFUNC(WRITE,NONE,2,raw_xor_w,(RW2 d, R2 s))
3010 {
3011 	emit_byte(0x66);
3012 	emit_byte(0x31);
3013 	emit_byte(0xc0+8*s+d);
3014 }
3015 LENDFUNC(WRITE,NONE,2,raw_xor_w,(RW2 d, R2 s))
3016 
3017 LOWFUNC(WRITE,NONE,2,raw_xor_b,(RW1 d, R1 s))
3018 {
3019 	emit_byte(0x30);
3020 	emit_byte(0xc0+8*s+d);
3021 }
3022 LENDFUNC(WRITE,NONE,2,raw_xor_b,(RW1 d, R1 s))
3023 
3024 LOWFUNC(WRITE,RMW,2,raw_sub_l_mi,(MEMRW d, IMM s))
3025 {
3026 	if (optimize_imm8 && isbyte(s)) {
3027 		emit_byte(0x83);
3028 		emit_byte(0x2d);
3029 		emit_long(d);
3030 		emit_byte(s);
3031 	}
3032 	else {
3033 		emit_byte(0x81);
3034 		emit_byte(0x2d);
3035 		emit_long(d);
3036 		emit_long(s);
3037 	}
3038 }
3039 LENDFUNC(WRITE,RMW,2,raw_sub_l_mi,(MEMRW d, IMM s))
3040 
3041 LOWFUNC(WRITE,READ,2,raw_cmp_l_mi,(MEMR d, IMM s))
3042 {
3043 	if (optimize_imm8 && isbyte(s)) {
3044 		emit_byte(0x83);
3045 		emit_byte(0x3d);
3046 		emit_long(d);
3047 		emit_byte(s);
3048 	}
3049 	else {
3050 		emit_byte(0x81);
3051 		emit_byte(0x3d);
3052 		emit_long(d);
3053 		emit_long(s);
3054 	}
3055 }
3056 LENDFUNC(WRITE,READ,2,raw_cmp_l_mi,(MEMR d, IMM s))
3057 
3058 LOWFUNC(NONE,NONE,2,raw_xchg_l_rr,(RW4 r1, RW4 r2))
3059 {
3060 	emit_byte(0x87);
3061 	emit_byte(0xc0+8*r1+r2);
3062 }
3063 LENDFUNC(NONE,NONE,2,raw_xchg_l_rr,(RW4 r1, RW4 r2))
3064 
3065 LOWFUNC(NONE,NONE,2,raw_xchg_b_rr,(RW4 r1, RW4 r2))
3066 {
3067   emit_byte(0x86);
3068   emit_byte(0xc0+8*(r1&0xf)+(r2&0xf)); /* XXX this handles upper-halves registers (e.g. %ah defined as 0x10+4) */
3069 }
3070 LENDFUNC(NONE,NONE,2,raw_xchg_b_rr,(RW4 r1, RW4 r2))
3071 
3072 /*************************************************************************
3073  * FIXME: mem access modes probably wrong                                *
3074  *************************************************************************/
3075 
3076 LOWFUNC(READ,WRITE,0,raw_pushfl,(void))
3077 {
3078 	emit_byte(0x9c);
3079 }
3080 LENDFUNC(READ,WRITE,0,raw_pushfl,(void))
3081 
3082 LOWFUNC(WRITE,READ,0,raw_popfl,(void))
3083 {
3084 	emit_byte(0x9d);
3085 }
3086 LENDFUNC(WRITE,READ,0,raw_popfl,(void))
3087 
3088 /* Generate floating-point instructions */
3089 static inline void x86_fadd_m(MEMR s)
3090 {
3091 	emit_byte(0xdc);
3092 	emit_byte(0x05);
3093 	emit_long(s);
3094 }
3095 
3096 #endif
3097 
3098 /*************************************************************************
3099  * Unoptimizable stuff --- jump                                          *
3100  *************************************************************************/
3101 
3102 static inline void raw_call_r(R4 r)
3103 {
3104 #if USE_NEW_RTASM
3105     CALLsr(r);
3106 #else
3107 	emit_byte(0xff);
3108 	emit_byte(0xd0+r);
3109 #endif
3110 }
3111 
3112 static inline void raw_call_m_indexed(uae_u32 base, uae_u32 r, uae_u32 m)
3113 {
3114 #if USE_NEW_RTASM
3115 	ADDR32 CALLsm(base, X86_NOREG, r, m);
3116 #else
3117 	int mu;
3118 	switch(m) {
3119 		case 1: mu=0; break;
3120 		case 2: mu=1; break;
3121 		case 4: mu=2; break;
3122 		case 8: mu=3; break;
3123 		default: abort();
3124 	}
3125 	emit_byte(0xff);
3126 	emit_byte(0x14);
3127 	emit_byte(0x05+8*r+0x40*mu);
3128 	emit_long(base);
3129 #endif
3130 }
3131 
3132 static inline void raw_jmp_r(R4 r)
3133 {
3134 #if USE_NEW_RTASM
3135 	JMPsr(r);
3136 #else
3137 	emit_byte(0xff);
3138 	emit_byte(0xe0+r);
3139 #endif
3140 }
3141 
3142 static inline void raw_jmp_m_indexed(uae_u32 base, uae_u32 r, uae_u32 m)
3143 {
3144 #if USE_NEW_RTASM
3145 	ADDR32 JMPsm(base, X86_NOREG, r, m);
3146 #else
3147 	int mu;
3148 	switch (m) {
3149 		case 1: mu=0; break;
3150 		case 2: mu=1; break;
3151 		case 4: mu=2; break;
3152 		case 8: mu=3; break;
3153 		default: abort();
3154 	}
3155 	emit_byte(0xff);
3156 	emit_byte(0x24);
3157 	emit_byte(0x05+8*r+0x40*mu);
3158 	emit_long(base);
3159 #endif
3160 }
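
/* Illustrative note: in the non-USE_NEW_RTASM path these indirect forms are
 * effectively table dispatches.  For instance raw_jmp_m_indexed(base, EAX_INDEX, 4)
 * emits FF 24 85 <base>, i.e. "jmp *base(,%eax,4)" -- a jump through a table of
 * 4-byte pointers indexed by %eax. */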
3161 
3162 static inline void raw_jmp_m(uae_u32 base)
3163 {
3164 	emit_byte(0xff);
3165 	emit_byte(0x25);
3166 	emit_long(base);
3167 }
3168 
3169 
3170 static inline void raw_call(uae_u32 t)
3171 {
3172 #if USE_NEW_RTASM
3173 	ADDR32 CALLm(t);
3174 #else
3175 	emit_byte(0xe8);
3176 	emit_long(t-(uintptr)target-4);
3177 #endif
3178 }
3179 
3180 static inline void raw_jmp(uae_u32 t)
3181 {
3182 #if USE_NEW_RTASM
3183 	ADDR32 JMPm(t);
3184 #else
3185 	emit_byte(0xe9);
3186 	emit_long(t-(uintptr)target-4);
3187 #endif
3188 }
3189 
3190 static inline void raw_jl(uae_u32 t)
3191 {
3192 	emit_byte(0x0f);
3193 	emit_byte(0x8c);
3194 	emit_long(t-(uintptr)target-4);
3195 }
3196 
3197 static inline void raw_jz(uae_u32 t)
3198 {
3199 	emit_byte(0x0f);
3200 	emit_byte(0x84);
3201 	emit_long(t-(uintptr)target-4);
3202 }
3203 
3204 static inline void raw_jnz(uae_u32 t)
3205 {
3206 	emit_byte(0x0f);
3207 	emit_byte(0x85);
3208 	emit_long(t-(uintptr)target-4);
3209 }
3210 
3211 static inline void raw_jnz_l_oponly(void)
3212 {
3213 	emit_byte(0x0f);
3214 	emit_byte(0x85);
3215 }
3216 
3217 static inline void raw_jcc_l_oponly(int cc)
3218 {
3219 	emit_byte(0x0f);
3220 	emit_byte(0x80+cc);
3221 }
3222 
3223 static inline void raw_jnz_b_oponly(void)
3224 {
3225 	emit_byte(0x75);
3226 }
3227 
3228 static inline void raw_jz_b_oponly(void)
3229 {
3230 	emit_byte(0x74);
3231 }
3232 
3233 static inline void raw_jcc_b_oponly(int cc)
3234 {
3235 	emit_byte(0x70+cc);
3236 }
3237 
3238 static inline void raw_jmp_l_oponly(void)
3239 {
3240 	emit_byte(0xe9);
3241 }
3242 
3243 static inline void raw_jmp_b_oponly(void)
3244 {
3245 	emit_byte(0xeb);
3246 }
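
/* Note: the *_oponly emitters only write the opcode bytes; the caller is expected
 * to emit the rel8/rel32 displacement itself (compare raw_jnz above, which emits
 * 0F 85 followed by a 32-bit displacement). */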
3247 
3248 static inline void raw_ret(void)
3249 {
3250 	emit_byte(0xc3);
3251 }
3252 
3253 static inline void raw_nop(void)
3254 {
3255 	emit_byte(0x90);
3256 }
3257 
3258 static inline void raw_emit_nop_filler(int nbytes)
3259 {
3260 
3261 #if defined(CPU_x86_64)
3262   /* The recommended way to pad 64-bit code is to use NOPs preceded by
3263      at most four 0x66 prefixes.  Balance the sizes of the emitted NOPs.  */
3264   static const uae_u8 prefixes[4] = { 0x66, 0x66, 0x66, 0x66 };
3265   if (nbytes == 0)
3266 	  return;
3267 
3268   int i;
3269   int nnops = (nbytes + 3) / 4;
3270   int len = nbytes / nnops;
3271   int remains = nbytes - nnops * len;
3272 
3273   for (i = 0; i < remains; i++) {
3274 	  emit_block(prefixes, len);
3275 	  raw_nop();
3276   }
3277   for (; i < nnops; i++) {
3278 	  emit_block(prefixes, len - 1);
3279 	  raw_nop();
3280   }
3281 #else
3282   /* Source: GNU Binutils 2.12.90.0.15 */
3283   /* Various efficient no-op patterns for aligning code labels.
3284      Note: Don't try to assemble the instructions in the comments.
3285      0L and 0w are not legal.  */
3286   static const uae_u8 f32_1[] =
3287     {0x90};									/* nop					*/
3288   static const uae_u8 f32_2[] =
3289     {0x89,0xf6};							/* movl %esi,%esi		*/
3290   static const uae_u8 f32_3[] =
3291     {0x8d,0x76,0x00};						/* leal 0(%esi),%esi	*/
3292   static const uae_u8 f32_4[] =
3293     {0x8d,0x74,0x26,0x00};					/* leal 0(%esi,1),%esi	*/
3294   static const uae_u8 f32_5[] =
3295     {0x90,									/* nop					*/
3296      0x8d,0x74,0x26,0x00};					/* leal 0(%esi,1),%esi	*/
3297   static const uae_u8 f32_6[] =
3298     {0x8d,0xb6,0x00,0x00,0x00,0x00};		/* leal 0L(%esi),%esi	*/
3299   static const uae_u8 f32_7[] =
3300     {0x8d,0xb4,0x26,0x00,0x00,0x00,0x00};	/* leal 0L(%esi,1),%esi */
3301   static const uae_u8 f32_8[] =
3302     {0x90,									/* nop					*/
3303      0x8d,0xb4,0x26,0x00,0x00,0x00,0x00};	/* leal 0L(%esi,1),%esi */
3304   static const uae_u8 f32_9[] =
3305     {0x89,0xf6,								/* movl %esi,%esi		*/
3306      0x8d,0xbc,0x27,0x00,0x00,0x00,0x00};	/* leal 0L(%edi,1),%edi */
3307   static const uae_u8 f32_10[] =
3308     {0x8d,0x76,0x00,						/* leal 0(%esi),%esi	*/
3309      0x8d,0xbc,0x27,0x00,0x00,0x00,0x00};	/* leal 0L(%edi,1),%edi */
3310   static const uae_u8 f32_11[] =
3311     {0x8d,0x74,0x26,0x00,					/* leal 0(%esi,1),%esi	*/
3312      0x8d,0xbc,0x27,0x00,0x00,0x00,0x00};	/* leal 0L(%edi,1),%edi */
3313   static const uae_u8 f32_12[] =
3314     {0x8d,0xb6,0x00,0x00,0x00,0x00,			/* leal 0L(%esi),%esi	*/
3315      0x8d,0xbf,0x00,0x00,0x00,0x00};		/* leal 0L(%edi),%edi	*/
3316   static const uae_u8 f32_13[] =
3317     {0x8d,0xb6,0x00,0x00,0x00,0x00,			/* leal 0L(%esi),%esi	*/
3318      0x8d,0xbc,0x27,0x00,0x00,0x00,0x00};	/* leal 0L(%edi,1),%edi */
3319   static const uae_u8 f32_14[] =
3320     {0x8d,0xb4,0x26,0x00,0x00,0x00,0x00,	/* leal 0L(%esi,1),%esi */
3321      0x8d,0xbc,0x27,0x00,0x00,0x00,0x00};	/* leal 0L(%edi,1),%edi */
3322   static const uae_u8 f32_15[] =
3323     {0xeb,0x0d,0x90,0x90,0x90,0x90,0x90,	/* jmp .+15; lotsa nops	*/
3324      0x90,0x90,0x90,0x90,0x90,0x90,0x90,0x90};
3325   static const uae_u8 f32_16[] =
3326     {0xeb,0x0d,0x90,0x90,0x90,0x90,0x90,	/* jmp .+15; lotsa nops	*/
3327      0x90,0x90,0x90,0x90,0x90,0x90,0x90,0x90,0x90};
3328   static const uae_u8 *const f32_patt[] = {
3329     f32_1, f32_2, f32_3, f32_4, f32_5, f32_6, f32_7, f32_8,
3330     f32_9, f32_10, f32_11, f32_12, f32_13, f32_14, f32_15
3331   };
3332 
3333   int nloops = nbytes / 16;
3334   while (nloops-- > 0)
3335 	emit_block(f32_16, sizeof(f32_16));
3336 
3337   nbytes %= 16;
3338   if (nbytes)
3339 	emit_block(f32_patt[nbytes - 1], nbytes);
3340 #endif
3341 }
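
/* Illustrative arithmetic for the x86-64 branch above: nbytes == 7 gives nnops = 2,
 * len = 3, remains = 1, so one "66 66 66 90" (4 bytes) is emitted followed by one
 * "66 66 90" (3 bytes) -- 7 bytes total, split into NOPs of nearly equal size. */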
3342 
3343 
3344 /*************************************************************************
3345 * Flag handling, to and from the UAE flag register                       *
3346 *************************************************************************/
3347 
3348 static inline void raw_flags_evicted(int r)
3349 {
3350 	//live.state[FLAGTMP].status=CLEAN;
3351 	live.state[FLAGTMP].status=INMEM;
3352 	live.state[FLAGTMP].realreg=-1;
3353 	/* We just "evicted" FLAGTMP. */
3354 	if (live.nat[r].nholds!=1) {
3355 		/* Huh? */
3356 		abort();
3357 	}
3358 	live.nat[r].nholds=0;
3359 }
3360 
3361 #define FLAG_NREG1_FLAGREG 0  /* Set to -1 if any register will do */
3362 static inline void raw_flags_to_reg_FLAGREG(int r)
3363 {
3364 	raw_lahf(0);  /* Most flags in AH */
3365 	//raw_setcc(r,0); /* V flag in AL */
3366 	raw_setcc_m((uintptr)live.state[FLAGTMP].mem,0);
3367 
3368 #if 1   /* Let's avoid those nasty partial register stalls */
3369 	//raw_mov_b_mr((uintptr)live.state[FLAGTMP].mem,r);
3370 	raw_mov_b_mr(((uintptr)live.state[FLAGTMP].mem)+1,AH_INDEX);
3371 	raw_flags_evicted(r);
3372 #endif
3373 }
3374 
3375 #define FLAG_NREG2_FLAGREG 0  /* Set to -1 if any register will do */
3376 static inline void raw_reg_to_flags_FLAGREG(int r)
3377 {
3378 	raw_cmp_b_ri(r,-127); /* set V */
3379 	raw_sahf(0);
3380 }
3381 
3382 #define FLAG_NREG3_FLAGREG 0  /* Set to -1 if any register will do */
3383 static __inline__ void raw_flags_set_zero_FLAGREG(int s, int tmp)
3384 {
3385     raw_mov_l_rr(tmp,s);
3386     raw_lahf(s); /* flags into ah */
3387     raw_and_l_ri(s,0xffffbfff);
3388     raw_and_l_ri(tmp,0x00004000);
3389     raw_xor_l_ri(tmp,0x00004000);
3390     raw_or_l(s,tmp);
3391     raw_sahf(s);
3392 }
3393 
3394 static inline void raw_flags_init_FLAGREG(void) { }
3395 
3396 #define FLAG_NREG1_FLAGSTK -1  /* Set to -1 if any register will do */
3397 static inline void raw_flags_to_reg_FLAGSTK(int r)
3398 {
3399 	raw_pushfl();
3400 	raw_pop_l_r(r);
3401 	raw_mov_l_mr((uintptr)live.state[FLAGTMP].mem,r);
3402 	raw_flags_evicted(r);
3403 }
3404 
3405 #define FLAG_NREG2_FLAGSTK -1  /* Set to -1 if any register will do */
3406 static inline void raw_reg_to_flags_FLAGSTK(int r)
3407 {
3408 	raw_push_l_r(r);
3409 	raw_popfl();
3410 }
3411 
3412 #define FLAG_NREG3_FLAGSTK -1  /* Set to -1 if any register will do */
3413 static inline void raw_flags_set_zero_FLAGSTK(int s, int tmp)
3414 {
3415     raw_mov_l_rr(tmp,s);
3416     raw_pushfl();
3417     raw_pop_l_r(s);
3418     raw_and_l_ri(s,0xffffffbf);
3419     raw_and_l_ri(tmp,0x00000040);
3420     raw_xor_l_ri(tmp,0x00000040);
3421     raw_or_l(s,tmp);
3422     raw_push_l_r(s);
3423     raw_popfl();
3424 }
3425 
3426 static inline void raw_flags_init_FLAGSTK(void) { }
3427 
3428 #if defined(CPU_x86_64)
3429 /* Try to use the LAHF/SETO method on x86_64 since it is faster.
3430    This can't be the default because some older CPUs don't support
3431    LAHF/SAHF in long mode.  */
3432 static int FLAG_NREG1_FLAGGEN = 0;
3433 static inline void raw_flags_to_reg_FLAGGEN(int r)
3434 {
3435 	if (have_lahf_lm) {
3436 		// NOTE: the interpreter uses the normal EFLAGS layout
3437 		//   pushf/popf CF(0) ZF( 6) SF( 7) OF(11)
3438 		//   sahf/lahf  CF(8) ZF(14) SF(15) OF( 0)
3439 		assert(r == 0);
3440 		raw_setcc(r,0);					/* V flag in AL */
3441 		raw_lea_l_r_scaled(0,0,8);		/* move it to its EFLAGS location */
3442 		raw_mov_b_mr(((uintptr)live.state[FLAGTMP].mem)+1,0);
3443 		raw_lahf(0);					/* most flags in AH */
3444 		raw_mov_b_mr((uintptr)live.state[FLAGTMP].mem,AH_INDEX);
3445 		raw_flags_evicted(r);
3446 	}
3447 	else
3448 		raw_flags_to_reg_FLAGSTK(r);
3449 }
3450 
3451 static int FLAG_NREG2_FLAGGEN = 0;
3452 static inline void raw_reg_to_flags_FLAGGEN(int r)
3453 {
3454 	if (have_lahf_lm) {
3455 		raw_xchg_b_rr(0,AH_INDEX);
3456 		raw_cmp_b_ri(r,-120); /* set V */
3457 		raw_sahf(0);
3458 	}
3459 	else
3460 		raw_reg_to_flags_FLAGSTK(r);
3461 }
3462 
3463 static int FLAG_NREG3_FLAGGEN = 0;
3464 static inline void raw_flags_set_zero_FLAGGEN(int s, int tmp)
3465 {
3466 	if (have_lahf_lm)
3467 		raw_flags_set_zero_FLAGREG(s, tmp);
3468 	else
3469 		raw_flags_set_zero_FLAGSTK(s, tmp);
3470 }
3471 
3472 static inline void raw_flags_init_FLAGGEN(void)
3473 {
3474 	if (have_lahf_lm) {
3475 		FLAG_NREG1_FLAGGEN = FLAG_NREG1_FLAGREG;
3476 		FLAG_NREG2_FLAGGEN = FLAG_NREG2_FLAGREG;
3477 		FLAG_NREG3_FLAGGEN = FLAG_NREG3_FLAGREG;
3478 	}
3479 	else {
3480 		FLAG_NREG1_FLAGGEN = FLAG_NREG1_FLAGSTK;
3481 		FLAG_NREG2_FLAGGEN = FLAG_NREG2_FLAGSTK;
3482 		FLAG_NREG3_FLAGGEN = FLAG_NREG3_FLAGSTK;
3483 	}
3484 }
3485 #endif
3486 
3487 #ifdef SAHF_SETO_PROFITABLE
3488 #define FLAG_SUFFIX FLAGREG
3489 #elif defined CPU_x86_64
3490 #define FLAG_SUFFIX FLAGGEN
3491 #else
3492 #define FLAG_SUFFIX FLAGSTK
3493 #endif
3494 
3495 #define FLAG_GLUE_2(x, y)		x ## _ ## y
3496 #define FLAG_GLUE_1(x, y)		FLAG_GLUE_2(x, y)
3497 #define FLAG_GLUE(x)			FLAG_GLUE_1(x, FLAG_SUFFIX)
3498 
3499 #define raw_flags_init			FLAG_GLUE(raw_flags_init)
3500 #define FLAG_NREG1				FLAG_GLUE(FLAG_NREG1)
3501 #define raw_flags_to_reg		FLAG_GLUE(raw_flags_to_reg)
3502 #define FLAG_NREG2				FLAG_GLUE(FLAG_NREG2)
3503 #define raw_reg_to_flags		FLAG_GLUE(raw_reg_to_flags)
3504 #define FLAG_NREG3				FLAG_GLUE(FLAG_NREG3)
3505 #define raw_flags_set_zero		FLAG_GLUE(raw_flags_set_zero)
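
/* Example expansion: with FLAG_SUFFIX == FLAGSTK, raw_flags_to_reg resolves to
 * raw_flags_to_reg_FLAGSTK and FLAG_NREG1 to FLAG_NREG1_FLAGSTK, so the rest of
 * the compiler stays agnostic about which flag-transfer scheme is in use. */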
3506 
3507 /* Apparently, there are enough instructions between flag store and
3508 flag reload to avoid the partial memory stall */
3509 static inline void raw_load_flagreg(uae_u32 target, uae_u32 r)
3510 {
3511 #if 1
3512 	raw_mov_l_rm(target,(uintptr)live.state[r].mem);
3513 #else
3514 	raw_mov_b_rm(target,(uintptr)live.state[r].mem);
3515 	raw_mov_b_rm(target+4,((uintptr)live.state[r].mem)+1);
3516 #endif
3517 }
3518 
3519 #ifdef UAE
3520 /* FLAGX is word-sized */
3521 #else
3522 /* FLAGX is byte sized, and we *do* write it at that size */
3523 #endif
3524 static inline void raw_load_flagx(uae_u32 target, uae_u32 r)
3525 {
3526 #ifdef UAE
3527 	if (live.nat[target].canword)
3528 #else
3529 	if (live.nat[target].canbyte)
3530 		raw_mov_b_rm(target,(uintptr)live.state[r].mem);
3531 	else if (live.nat[target].canword)
3532 #endif
3533 		raw_mov_w_rm(target,(uintptr)live.state[r].mem);
3534 	else
3535 		raw_mov_l_rm(target,(uintptr)live.state[r].mem);
3536 }
3537 
3538 static inline void raw_dec_sp(int off)
3539 {
3540 	if (off) {
3541 #ifdef CPU_x86_64
3542 		emit_byte(0x48); /* REX prefix */
3543 #endif
3544 		raw_sub_l_ri(ESP_INDEX,off);
3545 	}
3546 }
3547 
3548 static inline void raw_inc_sp(int off)
3549 {
3550 	if (off) {
3551 #ifdef CPU_x86_64
3552 		emit_byte(0x48); /* REX prefix */
3553 #endif
3554 		raw_add_l_ri(ESP_INDEX,off);
3555 	}
3556 }
3557 
3558 static inline void raw_push_regs_to_preserve(void) {
3559 	for (int i=N_REGS;i--;) {
3560 		if (need_to_preserve[i])
3561 			raw_push_l_r(i);
3562 	}
3563 }
3564 
3565 static inline void raw_pop_preserved_regs(void) {
3566 	for (int i=0;i<N_REGS;i++) {
3567 		if (need_to_preserve[i])
3568 			raw_pop_l_r(i);
3569 	}
3570 }
3571 
3572 /*************************************************************************
3573  * Handling mistaken direct memory access (removed from ARAnyM sources)  *
3574  *************************************************************************/
3575 
3576 #ifdef UAE
3577 #include "exception_handler.cpp"
3578 #endif
3579 
3580 static
3581 void compiler_status() {
3582 	jit_log("compiled code starts at %p, current at %p (size 0x%x)", compiled_code, current_compile_p, (unsigned int)(current_compile_p - compiled_code));
3583 }
3584 
3585 /*************************************************************************
3586 * Checking for CPU features                                             *
3587 *************************************************************************/
3588 
3589 struct cpuinfo_x86 {
3590 	uae_u8	x86;			// CPU family
3591 	uae_u8	x86_vendor;		// CPU vendor
3592 	uae_u8	x86_processor;	// CPU canonical processor type
3593 	uae_u8	x86_brand_id;	// CPU BrandID if supported, yields 0 otherwise
3594 	uae_u32	x86_hwcap;
3595 	uae_u8	x86_model;
3596 	uae_u8	x86_mask;
3597 	bool	x86_has_xmm2;
3598 	int	cpuid_level;    // Maximum supported CPUID level, -1=no CPUID
3599 	char	x86_vendor_id[16];
3600 	uintptr	x86_clflush_size;
3601 };
3602 struct cpuinfo_x86 cpuinfo;
3603 
3604 enum {
3605 	X86_VENDOR_INTEL		= 0,
3606 	X86_VENDOR_CYRIX		= 1,
3607 	X86_VENDOR_AMD		= 2,
3608 	X86_VENDOR_UMC		= 3,
3609 	X86_VENDOR_NEXGEN		= 4,
3610 	X86_VENDOR_CENTAUR	= 5,
3611 	X86_VENDOR_RISE		= 6,
3612 	X86_VENDOR_TRANSMETA	= 7,
3613 	X86_VENDOR_NSC		= 8,
3614 	X86_VENDOR_UNKNOWN	= 0xff
3615 };
3616 
3617 enum {
3618 	X86_PROCESSOR_I386,                       /* 80386 */
3619 	X86_PROCESSOR_I486,                       /* 80486DX, 80486SX, 80486DX[24] */
3620 	X86_PROCESSOR_PENTIUM,
3621 	X86_PROCESSOR_PENTIUMPRO,
3622 	X86_PROCESSOR_K6,
3623 	X86_PROCESSOR_ATHLON,
3624 	X86_PROCESSOR_PENTIUM4,
3625 	X86_PROCESSOR_X86_64,
3626 	X86_PROCESSOR_max
3627 };
3628 
3629 static const char * x86_processor_string_table[X86_PROCESSOR_max] = {
3630 	"80386",
3631 	"80486",
3632 	"Pentium",
3633 	"PentiumPro",
3634 	"K6",
3635 	"Athlon",
3636 	"Pentium4",
3637 	"x86-64"
3638 };
3639 
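/* Per-processor alignment preferences, indexed in X86_PROCESSOR_* order:
   { align_loop, align_loop_max_skip, align_jump, align_jump_max_skip, align_func }. */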
3640 static struct ptt {
3641 	const int align_loop;
3642 	const int align_loop_max_skip;
3643 	const int align_jump;
3644 	const int align_jump_max_skip;
3645 	const int align_func;
3646 }
3647 x86_alignments[X86_PROCESSOR_max] = {
3648 	{  4,  3,  4,  3,  4 },
3649 	{ 16, 15, 16, 15, 16 },
3650 	{ 16,  7, 16,  7, 16 },
3651 	{ 16, 15, 16,  7, 16 },
3652 	{ 32,  7, 32,  7, 32 },
3653 	{ 16,  7, 16,  7, 16 },
3654 	{  0,  0,  0,  0,  0 },
3655 	{ 16,  7, 16,  7, 16 }
3656 };
3657 
3658 static void
3659 	x86_get_cpu_vendor(struct cpuinfo_x86 *c)
3660 {
3661 	char *v = c->x86_vendor_id;
3662 
3663 	if (!strcmp(v, "GenuineIntel"))
3664 		c->x86_vendor = X86_VENDOR_INTEL;
3665 	else if (!strcmp(v, "AuthenticAMD"))
3666 		c->x86_vendor = X86_VENDOR_AMD;
3667 	else if (!strcmp(v, "CyrixInstead"))
3668 		c->x86_vendor = X86_VENDOR_CYRIX;
3669 	else if (!strcmp(v, "Geode by NSC"))
3670 		c->x86_vendor = X86_VENDOR_NSC;
3671 	else if (!strcmp(v, "UMC UMC UMC "))
3672 		c->x86_vendor = X86_VENDOR_UMC;
3673 	else if (!strcmp(v, "CentaurHauls"))
3674 		c->x86_vendor = X86_VENDOR_CENTAUR;
3675 	else if (!strcmp(v, "NexGenDriven"))
3676 		c->x86_vendor = X86_VENDOR_NEXGEN;
3677 	else if (!strcmp(v, "RiseRiseRise"))
3678 		c->x86_vendor = X86_VENDOR_RISE;
3679 	else if (!strcmp(v, "GenuineTMx86") ||
3680 		!strcmp(v, "TransmetaCPU"))
3681 		c->x86_vendor = X86_VENDOR_TRANSMETA;
3682 	else
3683 		c->x86_vendor = X86_VENDOR_UNKNOWN;
3684 }
3685 
3686 /*
3687  * Generic CPUID function
3688  * clear %ecx since some cpus (Cyrix MII) do not set or clear %ecx
3689  * resulting in stale register contents being returned.
3690  */
3691 /* Some CPUID calls want 'count' to be placed in ecx */
3692 #ifdef __GNUC__
3693 static void cpuid_count(uae_u32 op, uae_u32 count, uae_u32 *eax, uae_u32 *ebx, uae_u32 *ecx, uae_u32 *edx)
3694 {
3695 	uae_u32 _eax, _ebx, _ecx, _edx;
3696 	_eax = op;
3697 	_ecx = count;
3698 	__asm__ __volatile__(
3699 	"   movl %0,%%eax \n"
3700 	"   movl %2,%%ecx \n"
3701 	"	cpuid \n"
3702 	"   movl %%eax,%0 \n"
3703 	"   movl %%ebx,%1 \n"
3704 	"   movl %%ecx,%2 \n"
3705 	"   movl %%edx,%3 \n"
3706 		: "+m" (_eax),
3707 		  "=m" (_ebx),
3708 		  "+m" (_ecx),
3709 		  "=m" (_edx)
3710 		:
3711 		: "eax", "ebx", "ecx", "edx");
3712 	*eax = _eax;
3713 	*ebx = _ebx;
3714 	*ecx = _ecx;
3715 	*edx = _edx;
3716 }
3717 #endif
3718 
3719 #ifdef _MSC_VER
3720 static void cpuid_count(uae_u32 op, uae_u32 count, uae_u32 *eax, uae_u32 *ebx, uae_u32 *ecx, uae_u32 *edx)
3721 {
3722 	int cpuinfo[4];
3723 	cpuinfo[0] = op;
3724 	cpuinfo[1] = 0;
3725 	cpuinfo[2] = count;
3726 	cpuinfo[3] = 0;
3727 	__cpuidex(cpuinfo, op, count);
3728 	*eax = cpuinfo[0];
3729 	*ebx = cpuinfo[1];
3730 	*ecx = cpuinfo[2];
3731 	*edx = cpuinfo[3];
3732 }
3733 #endif
3734 
3735 static void
3736 cpuid(uae_u32 op, uae_u32 *eax, uae_u32 *ebx, uae_u32 *ecx, uae_u32 *edx)
3737 {
3738 	cpuid_count(op, 0, eax, ebx, ecx, edx);
3739 }
3740 
3741 static void
3742 raw_init_cpu(void)
3743 {
3744 	struct cpuinfo_x86 *c = &cpuinfo;
3745 	uae_u32 dummy;
3746 
3747 	/* Defaults */
3748 	c->x86_processor = X86_PROCESSOR_max;
3749 	c->x86_vendor = X86_VENDOR_UNKNOWN;
3750 	c->cpuid_level = -1;				/* CPUID not detected */
3751 	c->x86_model = c->x86_mask = 0;	/* So far unknown... */
3752 	c->x86_vendor_id[0] = '\0';		/* Unset */
3753 	c->x86_hwcap = 0;
3754 #ifdef CPU_x86_64
3755 	c->x86_clflush_size = 64;
3756 #else
3757 	c->x86_clflush_size = 32;
3758 #endif
3759 
3760 	/* Get vendor name */
3761 	c->x86_vendor_id[12] = '\0';
3762 	cpuid(0x00000000,
3763 		(uae_u32 *)&c->cpuid_level,
3764 		(uae_u32 *)&c->x86_vendor_id[0],
3765 		(uae_u32 *)&c->x86_vendor_id[8],
3766 		(uae_u32 *)&c->x86_vendor_id[4]);
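	/* CPUID leaf 0 returns the vendor string in EBX, EDX, ECX order, hence
	   EBX -> bytes 0-3, EDX -> bytes 4-7 and ECX -> bytes 8-11 above. */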
3767 	x86_get_cpu_vendor(c);
3768 
3769 	/* Intel-defined flags: level 0x00000001 */
3770 	c->x86_brand_id = 0;
3771 	if ( c->cpuid_level >= 0x00000001 ) {
3772 		uae_u32 tfms, brand_id;
3773 		cpuid(0x00000001, &tfms, &brand_id, &dummy, &c->x86_hwcap);
3774 		c->x86 = (tfms >> 8) & 15;
3775 		if (c->x86 == 0xf)
3776 			c->x86 += (tfms >> 20) & 0xff; /* extended family */
3777 		c->x86_model = (tfms >> 4) & 15;
3778 		if (c->x86_model == 0xf)
3779 			c->x86_model |= (tfms >> 12) & 0xf0; /* extended model */
3780 		c->x86_brand_id = brand_id & 0xff;
3781 		c->x86_mask = tfms & 15;
3782 		if (c->x86_hwcap & (1 << 19))
3783 		{
3784 			c->x86_clflush_size = ((brand_id >> 8) & 0xff) * 8;
3785 		}
3786 	} else {
3787 		/* Have CPUID level 0 only - unheard of */
3788 		c->x86 = 4;
3789 	}
3790 
3791 	/* AMD-defined flags: level 0x80000001 */
3792 	uae_u32 xlvl;
3793 	cpuid(0x80000000, &xlvl, &dummy, &dummy, &dummy);
3794 	if ( (xlvl & 0xffff0000) == 0x80000000 ) {
3795 		if ( xlvl >= 0x80000001 ) {
3796 			uae_u32 features, extra_features;
3797 			cpuid(0x80000001, &dummy, &dummy, &extra_features, &features);
3798 			if (features & (1 << 29)) {
3799 				/* Assume x86-64 if long mode is supported */
3800 				c->x86_processor = X86_PROCESSOR_X86_64;
3801 			}
3802 			if (extra_features & (1 << 0))
3803 				have_lahf_lm = true;
3804 		}
3805 	}
3806 
3807 	/* Canonicalize processor ID */
3808 	switch (c->x86) {
3809 	case 3:
3810 		c->x86_processor = X86_PROCESSOR_I386;
3811 		break;
3812 	case 4:
3813 		c->x86_processor = X86_PROCESSOR_I486;
3814 		break;
3815 	case 5:
3816 		if (c->x86_vendor == X86_VENDOR_AMD)
3817 			c->x86_processor = X86_PROCESSOR_K6;
3818 		else
3819 			c->x86_processor = X86_PROCESSOR_PENTIUM;
3820 		break;
3821 	case 6:
3822 		if (c->x86_vendor == X86_VENDOR_AMD)
3823 			c->x86_processor = X86_PROCESSOR_ATHLON;
3824 		else
3825 			c->x86_processor = X86_PROCESSOR_PENTIUMPRO;
3826 		break;
3827 	case 15:
3828 		if (c->x86_processor == X86_PROCESSOR_max) {
3829 			switch (c->x86_vendor) {
3830 			case X86_VENDOR_INTEL:
3831 				c->x86_processor = X86_PROCESSOR_PENTIUM4;
3832 				break;
3833 			case X86_VENDOR_AMD:
3834 				/* Assume a 32-bit Athlon processor if not in long mode */
3835 				c->x86_processor = X86_PROCESSOR_ATHLON;
3836 				break;
3837 			}
3838 		}
3839 		break;
3840 	}
3841 	if (c->x86_processor == X86_PROCESSOR_max) {
3842 		c->x86_processor = X86_PROCESSOR_I386;
3843 		jit_log("Error: unknown processor type");
3844 		jit_log("  Family  : %d", c->x86);
3845 		jit_log("  Model   : %d", c->x86_model);
3846 		jit_log("  Mask    : %d", c->x86_mask);
3847 		jit_log("  Vendor  : %s [%d]", c->x86_vendor_id, c->x86_vendor);
3848 		if (c->x86_brand_id)
3849 			jit_log("  BrandID : %02x", c->x86_brand_id);
3850 	}
3851 
3852 	/* Have CMOV support? */
3853 	have_cmov = (c->x86_hwcap & (1 << 15)) != 0;
3854 #if defined(CPU_x86_64)
3855 	if (!have_cmov) {
3856 		jit_abort("x86-64 implementations are bound to have CMOV!");
3857 	}
3858 #endif
3859 
3860 	c->x86_has_xmm2 = (c->x86_hwcap & (1 << 26)) != 0;
3861 
3862 	/* Can the host CPU suffer from partial register stalls? */
3863 	// non-RAT_STALL mode is currently broken
3864 	have_rat_stall = true; //(c->x86_vendor == X86_VENDOR_INTEL);
3865 #if 0
3866 	/* It appears that partial register writes are a bad idea even on
3867 	AMD K7 cores, even though they are not supposed to have the
3868 	dreaded rat stall. Why? Anyway, that's why we lie about it ;-) */
3869 	if (c->x86_processor == X86_PROCESSOR_ATHLON)
3870 		have_rat_stall = true;
3871 #endif
3872 
3873 	/* Alignments */
3874 	if (tune_alignment) {
3875 		align_loops = x86_alignments[c->x86_processor].align_loop;
3876 		align_jumps = x86_alignments[c->x86_processor].align_jump;
3877 	}
3878 
3879 	jit_log("Max CPUID level=%d Processor is %s [%s]",
3880 			c->cpuid_level, c->x86_vendor_id,
3881 			x86_processor_string_table[c->x86_processor]);
3882 
3883 	raw_flags_init();
3884 }
3885 
3886 #if 0
3887 static void __attribute_noinline__ prevent_redzone_use(void) {}
3888 
3889 static bool target_check_bsf(void)
3890 {
3891 	bool mismatch = false;
3892 	for (int g_ZF = 0; g_ZF <= 1; g_ZF++) {
3893 		for (int g_CF = 0; g_CF <= 1; g_CF++) {
3894 			for (int g_OF = 0; g_OF <= 1; g_OF++) {
3895 				for (int g_SF = 0; g_SF <= 1; g_SF++) {
3896 					for (int value = -1; value <= 1; value++) {
3897 						uintptr flags = (g_SF << 7) | (g_OF << 11) | (g_ZF << 6) | g_CF;
3898 						intptr tmp = value;
3899 						prevent_redzone_use();
3900 						__asm__ __volatile__ ("push %0; popf; bsf %1,%1; pushf; pop %0"
3901 							: "+r" (flags), "+r" (tmp) : : "cc");
3902 						int OF = (flags >> 11) & 1;
3903 						int SF = (flags >>  7) & 1;
3904 						int ZF = (flags >>  6) & 1;
3905 						int CF = flags & 1;
3906 						tmp = (value == 0);
3907 						if (ZF != tmp || SF != g_SF || OF != g_OF || CF != g_CF)
3908 							mismatch = true;
3909 					}
3910 				}}}}
3911 	if (mismatch)
3912 	{
3913 		jit_log("Target CPU defines all flags on BSF instruction");
3914 	}
3915 	return !mismatch;
3916 }
3917 #endif
3918 
3919 /*************************************************************************
3920 * FPU stuff                                                             *
3921 *************************************************************************/
3922 
3923 
3924 static inline void raw_fp_init(void)
3925 {
3926 	int i;
3927 
3928 	for (i=0;i<N_FREGS;i++)
3929 		live.spos[i]=-2;
3930 	live.tos=-1;  /* Stack is empty */
3931 }
3932 
3933 static inline void raw_fp_cleanup_drop(void)
3934 {
3935 #if 0
3936 	/* using FINIT instead of popping all the entries.
3937 	Seems to have side effects --- there is display corruption in
3938 	Quake when this is used */
3939 	if (live.tos>1) {
3940 		emit_byte(0x9b);
3941 		emit_byte(0xdb);
3942 		emit_byte(0xe3);
3943 		live.tos=-1;
3944 	}
3945 #endif
3946 	while (live.tos>=1) {
3947 		emit_byte(0xde);
3948 		emit_byte(0xd9);
3949 		live.tos-=2;
3950 	}
3951 	while (live.tos>=0) {
3952 		emit_byte(0xdd);
3953 		emit_byte(0xd8);
3954 		live.tos--;
3955 	}
3956 	raw_fp_init();
3957 }
3958 
3959 static inline void make_tos(int r)
3960 {
3961 	int p,q;
3962 
3963 	if (live.spos[r]<0) { /* Register not yet on stack */
3964 		emit_byte(0xd9);
3965 		emit_byte(0xe8);  /* Push '1' on the stack, just to grow it */
3966 		live.tos++;
3967 		live.spos[r]=live.tos;
3968 		live.onstack[live.tos]=r;
3969 		return;
3970 	}
3971 	/* Register is on stack */
3972 	if (live.tos==live.spos[r])
3973 		return;
3974 	p=live.spos[r];
3975 	q=live.onstack[live.tos];
3976 
3977 	emit_byte(0xd9);
3978 	emit_byte(0xc8+live.tos-live.spos[r]);  /* exchange it with top of stack */
3979 	live.onstack[live.tos]=r;
3980 	live.spos[r]=live.tos;
3981 	live.onstack[p]=q;
3982 	live.spos[q]=p;
3983 }
3984 
3985 static inline void make_tos2(int r, int r2)
3986 {
3987     int q;
3988 
3989     make_tos(r2); /* Put the reg that's supposed to end up in position2
3990 		     on top */
3991 
3992     if (live.spos[r]<0) { /* Register not yet on stack */
3993 	make_tos(r); /* This will extend the stack */
3994 	return;
3995     }
3996     /* Register is on stack */
3997     emit_byte(0xd9);
3998     emit_byte(0xc9); /* Move r2 into position 2 */
3999 
4000     q=live.onstack[live.tos-1];
4001     live.onstack[live.tos]=q;
4002     live.spos[q]=live.tos;
4003     live.onstack[live.tos-1]=r2;
4004     live.spos[r2]=live.tos-1;
4005 
4006     make_tos(r); /* And r into 1 */
4007 }
4008 
4009 static inline int stackpos(int r)
4010 {
4011 	if (live.spos[r]<0)
4012 		abort();
4013 	if (live.tos<live.spos[r]) {
4014 		jit_abort("Looking for spos for fnreg %d",r);
4015 	}
4016 	return live.tos-live.spos[r];
4017 }
4018 
4019 /* IMO, calling usereg(r) makes no sense if the register r is supposed to supply our function
4020 with an argument, because I would expect all arguments to be on the stack already, wouldn't they?
4021 Thus usereg(s) is always useless, and for every FRW d it is too late here anyway. PeterK
4022 */
4023 static inline void usereg(int r)
4024 {
4025 	if (live.spos[r]<0)
4026 		make_tos(r);
4027 }
4028 
4029 /* This is called with one FP value sitting in the slot just *above* tos; the value
4030    will be popped off the physical stack if the target register already has a slot */
4031 static inline void tos_make(int r)
4032 {
4033 	if (live.spos[r]<0) {
4034 		live.tos++;
4035 		live.spos[r]=live.tos;
4036 		live.onstack[live.tos]=r;
4037 		return;
4038 	}
4039 	emit_byte(0xdd);
4040 	emit_byte(0xd8+(live.tos+1)-live.spos[r]);  /* store top of stack in reg,
4041 						       and pop it*/
4042 }
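/* Note: 0xdd 0xd8+i encodes FSTP ST(i).  tos_make() is entered with the new
   value sitting one slot above the recorded top of stack, so the store target
   is (live.tos+1)-live.spos[r] slots below it; storing there and popping
   leaves the recorded bookkeeping unchanged. */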
4043 
4044 /* FP helper functions */
4045 #if USE_NEW_RTASM
4046 #define DEFINE_OP(NAME, GEN)			\
4047 static inline void raw_##NAME(uint32 m)		\
4048 {						\
4049     GEN(m, X86_NOREG, X86_NOREG, 1);		\
4050 }
4051 DEFINE_OP(fstl,  FSTLm);
4052 DEFINE_OP(fstpl, FSTPLm);
4053 DEFINE_OP(fldl,  FLDLm);
4054 DEFINE_OP(fildl, FILDLm);
4055 DEFINE_OP(fistl, FISTLm);
4056 DEFINE_OP(flds,  FLDSm);
4057 DEFINE_OP(fsts,  FSTSm);
4058 DEFINE_OP(fstpt, FSTPTm);
4059 DEFINE_OP(fldt,  FLDTm);
4060 DEFINE_OP(fistpl, FISTPLm);
4061 #else
4062 #define DEFINE_OP(NAME, OP1, OP2)		\
4063 static inline void raw_##NAME(uint32 m)		\
4064 {						\
4065     emit_byte(OP1);				\
4066     emit_byte(OP2);				\
4067     emit_long(m);				\
4068 }
4069 DEFINE_OP(fstl,  0xdd, 0x15);
4070 DEFINE_OP(fstpl, 0xdd, 0x1d);
4071 DEFINE_OP(fldl,  0xdd, 0x05);
4072 DEFINE_OP(fildl, 0xdb, 0x05);
4073 DEFINE_OP(fistl, 0xdb, 0x15);
4074 DEFINE_OP(flds,  0xd9, 0x05);
4075 DEFINE_OP(fsts,  0xd9, 0x15);
4076 DEFINE_OP(fstpt, 0xdb, 0x3d);
4077 DEFINE_OP(fldt,  0xdb, 0x2d);
4078 DEFINE_OP(fistpl, 0xdb, 0x1d);
4079 #endif
4080 #undef DEFINE_OP
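/* The table above provides the x87 load/store forms with a memory operand
   used below; in the hand-emitted variant, raw_fldl(m) for instance
   assembles to dd 05 imm32, i.e. FLD m64fp from an absolute 32-bit address,
   while the USE_NEW_RTASM variant routes through the FLDLm/FSTLm encoders,
   which may choose a different addressing form. */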
4081 
4082 LOWFUNC(NONE,WRITE,2,raw_fmov_mr,(MEMW m, FR r))
4083 {
4084 	make_tos(r);
4085 	raw_fstl(m);
4086 }
4087 LENDFUNC(NONE,WRITE,2,raw_fmov_mr,(MEMW m, FR r))
4088 
4089 LOWFUNC(NONE,WRITE,2,raw_fmov_mr_drop,(MEMW m, FR r))
4090 {
4091 	make_tos(r);
4092 	raw_fstpl(m);
4093 	live.onstack[live.tos]=-1;
4094 	live.tos--;
4095 	live.spos[r]=-2;
4096 }
4097 LENDFUNC(NONE,WRITE,2,raw_fmov_mr_drop,(MEMW m, FR r))
4098 
4099 LOWFUNC(NONE,READ,2,raw_fmov_rm,(FW r, MEMR m))
4100 {
4101 	raw_fldl(m);
4102 	tos_make(r);
4103 }
4104 LENDFUNC(NONE,READ,2,raw_fmov_rm,(FW r, MEMR m))
4105 
4106 LOWFUNC(NONE,READ,2,raw_fmovi_rm,(FW r, MEMR m))
4107 {
4108 	raw_fildl(m);
4109 	tos_make(r);
4110 }
4111 LENDFUNC(NONE,READ,2,raw_fmovi_rm,(FW r, MEMR m))
4112 
4113 LOWFUNC(NONE,WRITE,2,raw_fmovi_mr,(MEMW m, FR r))
4114 {
4115 	make_tos(r);
4116 	raw_fistl(m);
4117 }
4118 LENDFUNC(NONE,WRITE,2,raw_fmovi_mr,(MEMW m, FR r))
4119 
4120 LOWFUNC(NONE,WRITE,3,raw_fmovi_mrb,(MEMW m, FR r, double *bounds))
4121 {
4122 	/* Clamp value to the given range and convert to integer. */
4123 
4124 	int rs;
4125 	usereg(r);
4126 	rs = stackpos(r)+1;
4127 
4128 	/* Lower bound onto stack */
4129 	raw_fldl((uintptr) &bounds[0]); /* fld double from lower */
4130 
4131 	/* Clamp to lower */
4132 	emit_byte(0xdb);
4133 	emit_byte(0xf0+rs); /* fcomi lower,r */
4134 	emit_byte(0x73);
4135 	emit_byte(12);      /* jae to writeback */
4136 
4137 	/* Upper bound onto stack */
4138 	emit_byte(0xdd);
4139 	emit_byte(0xd8);	/* fstp st(0) */
4140 	raw_fldl((uintptr) &bounds[1]); /* fld double from upper */
4141 
4142 	/* Clamp to upper */
4143 	emit_byte(0xdb);
4144 	emit_byte(0xf0+rs); /* fcomi upper,r */
4145 	emit_byte(0xdb);
4146 	emit_byte(0xd0+rs); /* fcmovnbe upper,r */
4147 
4148 	/* Store to destination */
4149 	raw_fistpl(m);
4150 }
4151 LENDFUNC(NONE,WRITE,3,raw_fmovi_mrb,(MEMW m, FR r, double *bounds))
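/* In raw_fmovi_mrb above, the jae with an 8-bit displacement of 12 skips the
   upper-bound sequence (fstp st(0) = 2 bytes, fldl = 6 bytes, fcomi = 2
   bytes, fcmovnbe = 2 bytes) and jumps straight to the fistpl writeback when
   the value is already at or below the lower bound, so the lower bound left
   in ST(0) is stored instead.  The byte count assumes the 6-byte
   absolute-address form of fldl noted above. */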
4152 
4153 LOWFUNC(NONE,READ,2,raw_fmovs_rm,(FW r, MEMR m))
4154 {
4155 	raw_flds(m);
4156 	tos_make(r);
4157 }
4158 LENDFUNC(NONE,READ,2,raw_fmovs_rm,(FW r, MEMR m))
4159 
4160 LOWFUNC(NONE,WRITE,2,raw_fmovs_mr,(MEMW m, FR r))
4161 {
4162 	make_tos(r);
4163 	raw_fsts(m);
4164 }
4165 LENDFUNC(NONE,WRITE,2,raw_fmovs_mr,(MEMW m, FR r))
4166 
4167 LOWFUNC(NONE,WRITE,2,raw_fmov_ext_mr,(MEMW m, FR r))
4168 {
4169 	int rs;
4170 
4171 	/* Stupid x87 can't write a long double to mem without popping the
4172 	stack! */
4173 	usereg(r);
4174 	rs=stackpos(r);
4175 	emit_byte(0xd9);     /* Get a copy to the top of stack */
4176 	emit_byte(0xc0+rs);
4177 
4178 	raw_fstpt(m);	/* store and pop it */
4179 }
4180 LENDFUNC(NONE,WRITE,2,raw_fmov_ext_mr,(MEMW m, FR r))
4181 
4182 LOWFUNC(NONE,WRITE,2,raw_fmov_ext_mr_drop,(MEMW m, FR r))
4183 {
4184 	make_tos(r);
4185 	raw_fstpt(m);	/* store and pop it */
4186 	live.onstack[live.tos]=-1;
4187 	live.tos--;
4188 	live.spos[r]=-2;
4189 }
4190 LENDFUNC(NONE,WRITE,2,raw_fmov_ext_mr_drop,(MEMW m, FR r))
4191 
4192 LOWFUNC(NONE,READ,2,raw_fmov_ext_rm,(FW r, MEMR m))
4193 {
4194 	raw_fldt(m);
4195 	tos_make(r);
4196 }
4197 LENDFUNC(NONE,READ,2,raw_fmov_ext_rm,(FW r, MEMR m))
4198 
4199 LOWFUNC(NONE,NONE,1,raw_fmov_pi,(FW r))
4200 {
4201 	emit_byte(0xd9);
4202 	emit_byte(0xeb);
4203 	tos_make(r);
4204 }
4205 LENDFUNC(NONE,NONE,1,raw_fmov_pi,(FW r))
4206 
4207 LOWFUNC(NONE,NONE,1,raw_fmov_log10_2,(FW r))
4208 {
4209 	emit_byte(0xd9);
4210 	emit_byte(0xec);
4211 	tos_make(r);
4212 }
4213 LENDFUNC(NONE,NONE,1,raw_fmov_log10_2,(FW r))
4214 
4215 LOWFUNC(NONE,NONE,1,raw_fmov_log2_e,(FW r))
4216 {
4217 	emit_byte(0xd9);
4218 	emit_byte(0xea);
4219 	tos_make(r);
4220 }
4221 LENDFUNC(NONE,NONE,1,raw_fmov_log2_e,(FW r))
4222 
4223 LOWFUNC(NONE,NONE,1,raw_fmov_loge_2,(FW r))
4224 {
4225 	emit_byte(0xd9);
4226 	emit_byte(0xed);
4227 	tos_make(r);
4228 }
4229 LENDFUNC(NONE,NONE,1,raw_fmov_loge_2,(FW r))
4230 
4231 LOWFUNC(NONE,NONE,1,raw_fmov_1,(FW r))
4232 {
4233 	emit_byte(0xd9);
4234 	emit_byte(0xe8);
4235 	tos_make(r);
4236 }
4237 LENDFUNC(NONE,NONE,1,raw_fmov_1,(FW r))
4238 
4239 LOWFUNC(NONE,NONE,1,raw_fmov_0,(FW r))
4240 {
4241 	emit_byte(0xd9);
4242 	emit_byte(0xee);
4243 	tos_make(r);
4244 }
4245 LENDFUNC(NONE,NONE,1,raw_fmov_0,(FW r))
4246 
4247 LOWFUNC(NONE,NONE,2,raw_fmov_rr,(FW d, FR s))
4248 {
4249 	int ds;
4250 
4251 	usereg(s);
4252 	ds=stackpos(s);
4253 	if (ds==0 && live.spos[d]>=0) {
4254 		/* source is on top of stack, and we already have the dest */
4255 		int dd=stackpos(d);
4256 		emit_byte(0xdd);
4257 		emit_byte(0xd0+dd);
4258 	}
4259 	else {
4260 		emit_byte(0xd9);
4261 		emit_byte(0xc0+ds); /* duplicate source on tos */
4262 		tos_make(d); /* store to destination, pop if necessary */
4263 	}
4264 }
4265 LENDFUNC(NONE,NONE,2,raw_fmov_rr,(FW d, FR s))
4266 
4267 LOWFUNC(NONE,READ,2,raw_fldcw_m_indexed,(R4 index, IMM base))
4268 {
4269 	x86_64_prefix(true, false, NULL, NULL, &index);
4270 	emit_byte(0xd9);
4271 	emit_byte(0xa8 + index);
4272 	emit_long(base);
4273 }
4274 LENDFUNC(NONE,READ,2,raw_fldcw_m_indexed,(R4 index, IMM base))
4275 
4276 LOWFUNC(NONE,NONE,2,raw_fsqrt_rr,(FW d, FR s))
4277 {
4278 	int ds;
4279 
4280 	if (d!=s) {
4281 		usereg(s);
4282 		ds=stackpos(s);
4283 		emit_byte(0xd9);
4284 		emit_byte(0xc0+ds); /* duplicate source */
4285 		emit_byte(0xd9);
4286 		emit_byte(0xfa); /* take square root */
4287 		tos_make(d);        /* store to destination */
4288 	}
4289 	else {
4290 		make_tos(d);
4291 		emit_byte(0xd9);
4292 		emit_byte(0xfa);    /* take square root */
4293 	}
4294 }
4295 LENDFUNC(NONE,NONE,2,raw_fsqrt_rr,(FW d, FR s))
4296 
4297 LOWFUNC(NONE,NONE,2,raw_fabs_rr,(FW d, FR s))
4298 {
4299 	int ds;
4300 
4301 	if (d!=s) {
4302 		usereg(s);
4303 		ds=stackpos(s);
4304 		emit_byte(0xd9);
4305 		emit_byte(0xc0+ds); /* duplicate source */
4306 		emit_byte(0xd9);
4307 		emit_byte(0xe1); /* take fabs */
4308 		tos_make(d);        /* store to destination */
4309 	}
4310 	else {
4311 		make_tos(d);
4312 		emit_byte(0xd9);
4313 		emit_byte(0xe1); /* take fabs */
4314 	}
4315 }
4316 LENDFUNC(NONE,NONE,2,raw_fabs_rr,(FW d, FR s))
4317 
4318 LOWFUNC(NONE,NONE,2,raw_frndint_rr,(FW d, FR s))
4319 {
4320 	int ds;
4321 
4322 	if (d!=s) {
4323 		usereg(s);
4324 		ds=stackpos(s);
4325 		emit_byte(0xd9);
4326 		emit_byte(0xc0+ds); /* duplicate source */
4327 		emit_byte(0xd9);
4328 		emit_byte(0xfc); /* take frndint */
4329 		tos_make(d);        /* store to destination */
4330 	}
4331 	else {
4332 		make_tos(d);
4333 		emit_byte(0xd9);
4334 		emit_byte(0xfc); /* take frndint */
4335 	}
4336 }
4337 LENDFUNC(NONE,NONE,2,raw_frndint_rr,(FW d, FR s))
4338 
4339 LOWFUNC(NONE,NONE,2,raw_fcos_rr,(FW d, FR s))
4340 {
4341 	int ds;
4342 
4343 	if (d!=s) {
4344 		usereg(s);
4345 		ds=stackpos(s);
4346 		emit_byte(0xd9);
4347 		emit_byte(0xc0+ds); /* duplicate source */
4348 		emit_byte(0xd9);
4349 		emit_byte(0xff);    /* take cos */
4350 		tos_make(d);        /* store to destination */
4351 	}
4352 	else {
4353 		make_tos(d);
4354 		emit_byte(0xd9);
4355 		emit_byte(0xff);    /* take cos */
4356 	}
4357 }
4358 LENDFUNC(NONE,NONE,2,raw_fcos_rr,(FW d, FR s))
4359 
4360 LOWFUNC(NONE,NONE,2,raw_fsin_rr,(FW d, FR s))
4361 {
4362 	int ds;
4363 
4364 	if (d!=s) {
4365 		ds=stackpos(s);
4366 		emit_byte(0xd9);
4367 		emit_byte(0xc0+ds); /* fld x */
4368 		emit_byte(0xd9);
4369 		emit_byte(0xfe);    /* fsin sin(x) */
4370 		tos_make(d);        /* store to destination */
4371 	}
4372 	else {
4373 		make_tos(d);
4374 		emit_byte(0xd9);
4375 		emit_byte(0xfe);    /* fsin y=sin(x) */
4376 	}
4377 }
4378 LENDFUNC(NONE,NONE,2,raw_fsin_rr,(FW d, FR s))
4379 
4380 static const double one = 1;
4381 
4382 LOWFUNC(NONE,NONE,2,raw_ftwotox_rr,(FW d, FR s))
4383 {
4384 	int ds;
4385 
4386 	ds=stackpos(s);
4387 	emit_byte(0xd9);
4388 	emit_byte(0xc0+ds); /* fld x */
4389 	emit_byte(0xd9);
4390 	emit_byte(0xfc);    /* frndint int(x) */
4391 	emit_byte(0xd9);
4392 	emit_byte(0xc1+ds); /* fld x again */
4393 	emit_byte(0xd8);
4394 	emit_byte(0xe1);    /* fsub frac(x) = x - int(x) */
4395 	emit_byte(0xd9);
4396 	emit_byte(0xf0);    /* f2xm1 (2^frac(x))-1 */
4397 	x86_fadd_m((uintptr) &one); /* Add '1' without using extra stack space */
4398 	emit_byte(0xd9);
4399 	emit_byte(0xfd);    /* fscale (2^frac(x))*2^int(x) */
4400 	emit_byte(0xdd);
4401 	emit_byte(0xd9);    /* fstp copy & pop */
4402 	tos_make(d);        /* store y=2^x */
4403 }
4404 LENDFUNC(NONE,NONE,2,raw_ftwotox_rr,(FW d, FR s))
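/* This helper and the e^x / 10^x variants below rely on the same identity,
   since f2xm1 is only defined for arguments in [-1, +1]:
       2^x = 2^int(x) * 2^frac(x) = fscale(f2xm1(frac(x)) + 1, int(x))
   Worked example (assuming the default round-to-nearest mode for frndint):
   for x = 3.5, int(x) = 4 and frac(x) = -0.5, f2xm1(-0.5)+1 = 2^-0.5 ~ 0.7071,
   and fscale by 2^4 gives ~11.314 = 2^3.5. */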
4405 
4406 LOWFUNC(NONE,NONE,2,raw_fetox_rr,(FW d, FR s))
4407 {
4408 	int ds;
4409 
4410 	if (s==d)
4411 		make_tos(s);
4412 	else {
4413 		ds=stackpos(s);
4414 		emit_byte(0xd9);
4415 		emit_byte(0xc0+ds); /* duplicate source */
4416 	}
4417 	emit_byte(0xd9);
4418 	emit_byte(0xea);    /* fldl2e log2(e) */
4419 	emit_byte(0xd8);
4420 	emit_byte(0xc9);    /* fmul x*log2(e) */
4421 	emit_byte(0xdd);
4422 	emit_byte(0xd1);    /* fst copy up */
4423 	emit_byte(0xd9);
4424 	emit_byte(0xfc);    /* frndint int(x*log2(e)) */
4425 	emit_byte(0xd9);
4426 	emit_byte(0xc9);    /* fxch swap top two elements */
4427 	emit_byte(0xd8);
4428 	emit_byte(0xe1);    /* fsub x*log2(e) - int(x*log2(e))  */
4429 	emit_byte(0xd9);
4430 	emit_byte(0xf0);    /* f2xm1 (2^frac(x))-1 */
4431 	x86_fadd_m((uintptr) &one); /* Add '1' without using extra stack space */
4432 	emit_byte(0xd9);
4433 	emit_byte(0xfd);    /* fscale (2^frac(x))*2^int(x*log2(e)) */
4434 	emit_byte(0xdd);
4435 	emit_byte(0xd9);    /* fstp copy & pop */
4436 	if (s!=d)
4437 		tos_make(d);    /* store y=e^x */
4438 }
4439 LENDFUNC(NONE,NONE,2,raw_fetox_rr,(FW d, FR s))
4440 
4441 LOWFUNC(NONE,NONE,2,raw_flog2_rr,(FW d, FR s))
4442 {
4443 	int ds;
4444 
4445 	if (s==d)
4446 		make_tos(s);
4447 	else {
4448 		ds=stackpos(s);
4449 		emit_byte(0xd9);
4450 		emit_byte(0xc0+ds); /* duplicate source */
4451 	}
4452 	emit_byte(0xd9);
4453 	emit_byte(0xe8);    /* push '1' */
4454 	emit_byte(0xd9);
4455 	emit_byte(0xc9);    /* swap top two */
4456 	emit_byte(0xd9);
4457 	emit_byte(0xf1);    /* take 1*log2(x) */
4458 	if (s!=d)
4459 		tos_make(d);    /* store to destination */
4460 }
4461 LENDFUNC(NONE,NONE,2,raw_flog2_rr,(FW d, FR s))
4462 
4463 
4464 LOWFUNC(NONE,NONE,2,raw_fneg_rr,(FW d, FR s))
4465 {
4466 	int ds;
4467 
4468 	if (d!=s) {
4469 		usereg(s);
4470 		ds=stackpos(s);
4471 		emit_byte(0xd9);
4472 		emit_byte(0xc0+ds); /* duplicate source */
4473 		emit_byte(0xd9);
4474 		emit_byte(0xe0); /* take fchs */
4475 		tos_make(d); /* store to destination */
4476 	}
4477 	else {
4478 		make_tos(d);
4479 		emit_byte(0xd9);
4480 		emit_byte(0xe0); /* take fchs */
4481 	}
4482 }
4483 LENDFUNC(NONE,NONE,2,raw_fneg_rr,(FW d, FR s))
4484 
4485 LOWFUNC(NONE,NONE,2,raw_fadd_rr,(FRW d, FR s))
4486 {
4487 	int ds;
4488 
4489 	usereg(s);
4490 	usereg(d);
4491 
4492 	if (live.spos[s]==live.tos) {
4493 		/* Source is on top of stack */
4494 		ds=stackpos(d);
4495 		emit_byte(0xdc);
4496 		emit_byte(0xc0+ds); /* add source to dest*/
4497 	}
4498 	else {
4499 		make_tos(d);
4500 		ds=stackpos(s);
4501 
4502 		emit_byte(0xd8);
4503 		emit_byte(0xc0+ds); /* add source to dest*/
4504 	}
4505 }
4506 LENDFUNC(NONE,NONE,2,raw_fadd_rr,(FRW d, FR s))
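/* The same two cases recur in the fsub/fmul/fdiv variants below: when the
   source already sits in ST(0), the dc-group form "op ST(ds),ST(0)" updates
   the destination in place; otherwise the destination is brought to ST(0)
   first and the d8-group form "op ST(0),ST(ds)" is used. */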
4507 
4508 LOWFUNC(NONE,NONE,2,raw_fsub_rr,(FRW d, FR s))
4509 {
4510 	int ds;
4511 
4512 	usereg(s);
4513 	usereg(d);
4514 
4515 	if (live.spos[s]==live.tos) {
4516 		/* Source is on top of stack */
4517 		ds=stackpos(d);
4518 		emit_byte(0xdc);
4519 		emit_byte(0xe8+ds); /* sub source from dest*/
4520 	}
4521 	else {
4522 		make_tos(d);
4523 		ds=stackpos(s);
4524 
4525 		emit_byte(0xd8);
4526 		emit_byte(0xe0+ds); /* sub src from dest */
4527 	}
4528 }
4529 LENDFUNC(NONE,NONE,2,raw_fsub_rr,(FRW d, FR s))
4530 
4531 LOWFUNC(NONE,NONE,2,raw_fcmp_rr,(FR d, FR s))
4532 {
4533 	int ds;
4534 
4535 	usereg(s);
4536 	usereg(d);
4537 
4538 	make_tos(d);
4539 	ds=stackpos(s);
4540 
4541 	emit_byte(0xdd);
4542 	emit_byte(0xe0+ds); /* cmp dest with source*/
4543 }
4544 LENDFUNC(NONE,NONE,2,raw_fcmp_rr,(FR d, FR s))
4545 
4546 LOWFUNC(NONE,NONE,2,raw_fmul_rr,(FRW d, FR s))
4547 {
4548 	int ds;
4549 
4550 	usereg(s);
4551 	usereg(d);
4552 
4553 	if (live.spos[s]==live.tos) {
4554 		/* Source is on top of stack */
4555 		ds=stackpos(d);
4556 		emit_byte(0xdc);
4557 		emit_byte(0xc8+ds); /* mul dest by source*/
4558 	}
4559 	else {
4560 		make_tos(d);
4561 		ds=stackpos(s);
4562 
4563 		emit_byte(0xd8);
4564 		emit_byte(0xc8+ds); /* mul dest by source*/
4565 	}
4566 }
4567 LENDFUNC(NONE,NONE,2,raw_fmul_rr,(FRW d, FR s))
4568 
4569 LOWFUNC(NONE,NONE,2,raw_fdiv_rr,(FRW d, FR s))
4570 {
4571 	int ds;
4572 
4573 	usereg(s);
4574 	usereg(d);
4575 
4576 	if (live.spos[s]==live.tos) {
4577 		/* Source is on top of stack */
4578 		ds=stackpos(d);
4579 		emit_byte(0xdc);
4580 		emit_byte(0xf8+ds); /* div dest by source */
4581 	}
4582 	else {
4583 		make_tos(d);
4584 		ds=stackpos(s);
4585 
4586 		emit_byte(0xd8);
4587 		emit_byte(0xf0+ds); /* div dest by source*/
4588 	}
4589 }
4590 LENDFUNC(NONE,NONE,2,raw_fdiv_rr,(FRW d, FR s))
4591 
4592 LOWFUNC(NONE,NONE,2,raw_frem_rr,(FRW d, FR s))
4593 {
4594 	int ds;
4595 
4596 	usereg(s);
4597 	usereg(d);
4598 
4599 	make_tos2(d,s);
4600 	ds=stackpos(s);
4601 
4602 	if (ds!=1) {
4603 		printf("Failed horribly in raw_frem_rr! ds is %d\n",ds);
4604 		abort();
4605 	}
4606 	emit_byte(0xd9);
4607 	emit_byte(0xf8); /* take rem from dest by source */
4608 }
4609 LENDFUNC(NONE,NONE,2,raw_frem_rr,(FRW d, FR s))
4610 
4611 LOWFUNC(NONE,NONE,2,raw_frem1_rr,(FRW d, FR s))
4612 {
4613 	int ds;
4614 
4615 	usereg(s);
4616 	usereg(d);
4617 
4618 	make_tos2(d,s);
4619 	ds=stackpos(s);
4620 
4621 	if (ds!=1) {
4622 		printf("Failed horribly in raw_frem1_rr! ds is %d\n",ds);
4623 		abort();
4624 	}
4625 	emit_byte(0xd9);
4626 	emit_byte(0xf5); /* take rem1 from dest by source */
4627 }
4628 LENDFUNC(NONE,NONE,2,raw_frem1_rr,(FRW d, FR s))
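/* fprem (d9 f8) and fprem1 (d9 f5) both compute ST(0) modulo ST(1), fprem
   with a truncated quotient and fprem1 with the IEEE 754 round-to-nearest
   quotient.  That is why make_tos2(d,s) is used above: it puts d in ST(0)
   and s in ST(1), which the ds==1 check asserts. */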
4629 
4630 
4631 LOWFUNC(NONE,NONE,1,raw_ftst_r,(FR r))
4632 {
4633 	make_tos(r);
4634 	emit_byte(0xd9);  /* ftst */
4635 	emit_byte(0xe4);
4636 }
4637 LENDFUNC(NONE,NONE,1,raw_ftst_r,(FR r))
4638 
4639 LOWFUNC(NONE,NONE,2,raw_fetoxM1_rr,(FW d, FR s))
4640 {
4641 	int ds;
4642 
4643 	if (s==d)
4644 		make_tos(s);
4645 	else {
4646 		ds=stackpos(s);
4647 		emit_byte(0xd9);
4648 		emit_byte(0xc0+ds); /* fld x */
4649 	}
4650 	emit_byte(0xd9);
4651 	emit_byte(0xea);    /* fldl2e log2(e) */
4652 	emit_byte(0xd8);
4653 	emit_byte(0xc9);    /* fmul x*log2(e) */
4654 	emit_byte(0xdd);
4655 	emit_byte(0xd1);    /* fst copy up */
4656 	emit_byte(0xd9);
4657 	emit_byte(0xfc);    /* frndint int(x*log2(e)) */
4658 	emit_byte(0xd9);
4659 	emit_byte(0xc9);    /* fxch swap top two elements */
4660 	emit_byte(0xd8);
4661 	emit_byte(0xe1);    /* fsub x*log2(e) - int(x*log2(e))  */
4662 	emit_byte(0xd9);
4663 	emit_byte(0xf0);    /* f2xm1 (2^frac(x))-1 */
4664 	emit_byte(0xd9);
4665 	emit_byte(0xfd);    /* fscale ((2^frac(x))-1)*2^int(x*log2(e)) */
4666 	emit_byte(0xdd);
4667 	emit_byte(0xd9);    /* fstp copy & pop */
4668 	if (s!=d)
4669 		tos_make(d);    /* store y=(e^x)-1 */
4670 }
4671 LENDFUNC(NONE,NONE,2,raw_fetoxM1_rr,(FW d, FR s))
4672 
4673 LOWFUNC(NONE,NONE,2,raw_ftentox_rr,(FW d, FR s))
4674 {
4675 	int ds;
4676 
4677 	if (s==d)
4678 		make_tos(s);
4679 	else {
4680 		ds=stackpos(s);
4681 		emit_byte(0xd9);
4682 		emit_byte(0xc0+ds); /* fld x */
4683 	}
4684 	emit_byte(0xd9);
4685 	emit_byte(0xe9);    /* fldl2t log2(10) */
4686 	emit_byte(0xd8);
4687 	emit_byte(0xc9);    /* fmul x*log2(10) */
4688 	emit_byte(0xdd);
4689 	emit_byte(0xd1);    /* fst copy up */
4690 	emit_byte(0xd9);
4691 	emit_byte(0xfc);    /* frndint int(x*log2(10)) */
4692 	emit_byte(0xd9);
4693 	emit_byte(0xc9);    /* fxch swap top two elements */
4694 	emit_byte(0xd8);
4695 	emit_byte(0xe1);    /* fsub x*log2(10) - int(x*log2(10))  */
4696 	emit_byte(0xd9);
4697 	emit_byte(0xf0);    /* f2xm1 (2^frac(x))-1 */
4698 	x86_fadd_m((uintptr) &one);
4699 	emit_byte(0xd9);
4700 	emit_byte(0xfd);    /* fscale (2^frac(x))*2^int(x*log2(10)) */
4701 	emit_byte(0xdd);
4702 	emit_byte(0xd9);    /* fstp copy & pop */
4703 	if (s!=d)
4704 		tos_make(d);    /* store y=10^x */
4705 }
4706 LENDFUNC(NONE,NONE,2,raw_ftentox_rr,(FW d, FR s))
4707 
4708 LOWFUNC(NONE,NONE,3,raw_fsincos_rr,(FW d, FW c, FR s))
4709 {
4710 	int ds;
4711 
4712 	if (s==d) {
4713 		//write_log (_T("FSINCOS src = dest\n"));
4714 		make_tos(s);
4715 		emit_byte(0xd9);
4716 		emit_byte(0xfb); /* fsincos sin(x) push cos(x) */
4717 		tos_make(c);     /* store cos(x) to c */
4718 		return;
4719 	}
4720 
4721 	ds=stackpos(s);
4722 	emit_byte(0xd9);
4723 	emit_byte(0xc0+ds);  /* fld x */
4724 	emit_byte(0xd9);
4725 	emit_byte(0xfb);     /* fsincos sin(x) push cos(x) */
4726 	if (live.spos[c]<0) {
4727 		if (live.spos[d]<0) { /* occupy both regs directly */
4728 			live.tos++;
4729 			live.spos[d]=live.tos;
4730 			live.onstack[live.tos]=d; /* sin(x) comes first */
4731 			live.tos++;
4732 			live.spos[c]=live.tos;
4733 			live.onstack[live.tos]=c;
4734 		}
4735 		else {
4736 			emit_byte(0xd9);
4737 			emit_byte(0xc9); /* fxch swap cos(x) with sin(x) */
4738 			emit_byte(0xdd); /* store sin(x) to d & pop */
4739 			emit_byte(0xd8+(live.tos+2)-live.spos[d]);
4740 			live.tos++;      /* occupy a reg for cos(x) here */
4741 			live.spos[c]=live.tos;
4742 			live.onstack[live.tos]=c;
4743 		}
4744 	}
4745 	else {
4746 		emit_byte(0xdd); /* store cos(x) to c & pop */
4747 		emit_byte(0xd8+(live.tos+2)-live.spos[c]);
4748 		tos_make(d);     /* store sin(x) to destination */
4749 	}
4750 }
4751 LENDFUNC(NONE,NONE,3,raw_fsincos_rr,(FW d, FW c, FR s))
4752 
4753 LOWFUNC(NONE,NONE,2,raw_fscale_rr,(FRW d, FR s))
4754 {
4755 	int ds;
4756 
4757 	if (live.spos[d]==live.tos && live.spos[s]==live.tos-1) {
4758 		//write_log (_T("fscale found x in TOS-1 and y in TOS\n"));
4759 		emit_byte(0xd9);
4760 		emit_byte(0xfd);    /* fscale y*(2^x) */
4761 	}
4762 	else {
4763 		make_tos(s);        /* tos=x */
4764 		ds=stackpos(d);
4765 		emit_byte(0xd9);
4766 		emit_byte(0xc0+ds); /* fld y */
4767 		emit_byte(0xd9);
4768 		emit_byte(0xfd);    /* fscale y*(2^x) */
4769 		tos_make(d);        /* store y=y*(2^x) */
4770 	}
4771 }
4772 LENDFUNC(NONE,NONE,2,raw_fscale_rr,(FRW d, FR s))
4773 
4774 LOWFUNC(NONE,NONE,2,raw_ftan_rr,(FW d, FR s))
4775 {
4776 	int ds;
4777 
4778 	if (d!=s) {
4779 		ds=stackpos(s);
4780 		emit_byte(0xd9);
4781 		emit_byte(0xc0+ds); /* fld x */
4782 		emit_byte(0xd9);
4783 		emit_byte(0xf2);    /* fptan tan(x)=y/1.0 */
4784 		emit_byte(0xdd);
4785 		emit_byte(0xd8);    /* fstp pop 1.0 */
4786 		tos_make(d);        /* store to destination */
4787 	}
4788 	else {
4789 		make_tos(d);
4790 		emit_byte(0xd9);
4791 		emit_byte(0xf2);    /* fptan tan(x)=y/1.0 */
4792 		emit_byte(0xdd);
4793 		emit_byte(0xd8);    /* fstp pop 1.0 */
4794 	}
4795 }
4796 LENDFUNC(NONE,NONE,2,raw_ftan_rr,(FW d, FR s))
4797 
4798 #ifdef CPU_x86_64
4799 #define REX64 emit_byte(0x48);
4800 #else
4801 #define REX64
4802 #endif
4803 
4804 LOWFUNC(NONE,NONE,1,raw_fcuts_r,(FRW r))
4805 {
4806 	make_tos(r);     /* TOS = r */
4807 	REX64
4808 	emit_byte(0x83);
4809 	emit_byte(0xc4);
4810 	emit_byte(0xfc); /* add -4 to esp */
4811 	emit_byte(0xd9);
4812 	emit_byte(0x1c);
4813 	emit_byte(0x24); /* fstp store r as SINGLE to [esp] and pop */
4814 	emit_byte(0xd9);
4815 	emit_byte(0x04);
4816 	emit_byte(0x24); /* fld load r as SINGLE from [esp] */
4817 	emit_byte(0x9b); /* let the CPU wait on FPU exceptions */
4818 	REX64
4819 	emit_byte(0x83);
4820 	emit_byte(0xc4);
4821 	emit_byte(0x04); /* add +4 to esp */
4822 }
4823 LENDFUNC(NONE,NONE,1,raw_fcuts_r,(FRW r))
4824 
4825 LOWFUNC(NONE,NONE,1,raw_fcut_r,(FRW r))
4826 {
4827 	make_tos(r);     /* TOS = r */
4828 	REX64
4829 	emit_byte(0x83);
4830 	emit_byte(0xc4);
4831 	emit_byte(0xf8); /* add -8 to esp */
4832 	emit_byte(0xdd);
4833 	emit_byte(0x1c);
4834 	emit_byte(0x24); /* fstp store r as DOUBLE to [esp] and pop */
4835 	emit_byte(0xdd);
4836 	emit_byte(0x04);
4837 	emit_byte(0x24); /* fld load r as DOUBLE from [esp] */
4838 	emit_byte(0x9b); /* let the CPU wait on FPU exceptions */
4839 	REX64
4840 	emit_byte(0x83);
4841 	emit_byte(0xc4);
4842 	emit_byte(0x08); /* add +8 to esp */
4843 }
4844 LENDFUNC(NONE,NONE,1,raw_fcut_r,(FRW r))
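/* raw_fcuts_r / raw_fcut_r round the 80-bit value in ST(0) to single resp.
   double precision by bouncing it through a temporary slot on the CPU stack:
   the fstp performs the rounding on store, the fld reloads the rounded
   value, and the wait (0x9b) forces any pending FP exception before the
   stack pointer is restored.  REX64 turns the %esp adjustments into %rsp
   adjustments on x86-64. */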
4845 
4846 LOWFUNC(NONE,NONE,2,raw_fgetexp_rr,(FW d, FR s))
4847 {
4848 	int ds;
4849 
4850 	if (d!=s) {
4851 		ds=stackpos(s);
4852 		emit_byte(0xd9);
4853 		emit_byte(0xc0+ds); /* fld x */
4854 		emit_byte(0xd9);
4855 		emit_byte(0xf4);    /* fxtract exp push man */
4856 		emit_byte(0xdd);
4857 		emit_byte(0xd8);    /* fstp just pop man */
4858 		tos_make(d);        /* store exp to destination */
4859 	}
4860 	else {
4861 		make_tos(d);        /* tos=x=y */
4862 		emit_byte(0xd9);
4863 		emit_byte(0xf4);    /* fxtract exp push man */
4864 		emit_byte(0xdd);
4865 		emit_byte(0xd8);    /* fstp just pop man */
4866 	}
4867 }
4868 LENDFUNC(NONE,NONE,2,raw_fgetexp_rr,(FW d, FR s))
4869 
4870 LOWFUNC(NONE,NONE,2,raw_fgetman_rr,(FW d, FR s))
4871 {
4872 	int ds;
4873 
4874 	if (d!=s) {
4875 		ds=stackpos(s);
4876 		emit_byte(0xd9);
4877 		emit_byte(0xc0+ds); /* fld x */
4878 		emit_byte(0xd9);
4879 		emit_byte(0xf4);    /* fxtract exp push man */
4880 		emit_byte(0xdd);
4881 		emit_byte(0xd9);    /* fstp copy man up & pop */
4882 		tos_make(d);        /* store man to destination */
4883 	}
4884 	else {
4885 		make_tos(d);        /* tos=x=y */
4886 		emit_byte(0xd9);
4887 		emit_byte(0xf4);    /* fxtract exp push man */
4888 		emit_byte(0xdd);
4889 		emit_byte(0xd9);    /* fstp copy man up & pop */
4890 	}
4891 }
4892 LENDFUNC(NONE,NONE,2,raw_fgetman_rr,(FW d, FR s))
4893 
4894 LOWFUNC(NONE,NONE,2,raw_flogN_rr,(FW d, FR s))
4895 {
4896 	int ds;
4897 
4898 	if (s==d)
4899 		make_tos(s);
4900 	else {
4901 		ds=stackpos(s);
4902 		emit_byte(0xd9);
4903 		emit_byte(0xc0+ds); /* fld x */
4904 	}
4905 	emit_byte(0xd9);
4906 	emit_byte(0xed);    /* fldln2 logN(2) */
4907 	emit_byte(0xd9);
4908 	emit_byte(0xc9);    /* fxch swap logN(2) with x */
4909 	emit_byte(0xd9);
4910 	emit_byte(0xf1);    /* fyl2x logN(2)*log2(x) */
4911 	if (s!=d)
4912 		tos_make(d);    /* store y=logN(x) */
4913 }
4914 LENDFUNC(NONE,NONE,2,raw_flogN_rr,(FW d, FR s))
4915 
4916 LOWFUNC(NONE,NONE,2,raw_flogNP1_rr,(FW d, FR s))
4917 {
4918 	int ds;
4919 
4920 	if (s==d)
4921 		make_tos(s);
4922 	else {
4923 		ds=stackpos(s);
4924 		emit_byte(0xd9);
4925 		emit_byte(0xc0+ds); /* fld x */
4926 	}
4927 	emit_byte(0xd9);
4928 	emit_byte(0xed);    /* fldln2 logN(2) */
4929 	emit_byte(0xd9);
4930 	emit_byte(0xc9);    /* fxch swap logN(2) with x */
4931 	emit_byte(0xd9);
4932 	emit_byte(0xf9);    /* fyl2xp1 logN(2)*log2(x+1) */
4933 	if (s!=d)
4934 		tos_make(d);    /* store y=logN(x+1) */
4935 }
4936 LENDFUNC(NONE,NONE,2,raw_flogNP1_rr,(FW d, FR s))
4937 
4938 LOWFUNC(NONE,NONE,2,raw_flog10_rr,(FW d, FR s))
4939 {
4940 	int ds;
4941 
4942 	if (s==d)
4943 		make_tos(s);
4944 	else {
4945 		ds=stackpos(s);
4946 		emit_byte(0xd9);
4947 		emit_byte(0xc0+ds); /* fld x */
4948 	}
4949 	emit_byte(0xd9);
4950 	emit_byte(0xec);    /* fldlg2 log10(2) */
4951 	emit_byte(0xd9);
4952 	emit_byte(0xc9);    /* fxch swap log10(2) with x */
4953 	emit_byte(0xd9);
4954 	emit_byte(0xf1);    /* fyl2x log10(2)*log2(x) */
4955 	if (s!=d)
4956 		tos_make(d);    /* store y=log10(x) */
4957 }
4958 LENDFUNC(NONE,NONE,2,raw_flog10_rr,(FW d, FR s))
4959 
4960 LOWFUNC(NONE,NONE,2,raw_fasin_rr,(FW d, FR s))
4961 {
4962 	int ds;
4963 
4964 	ds=stackpos(s);
4965 	emit_byte(0xd9);
4966 	emit_byte(0xc0+ds); /* fld x */
4967 	emit_byte(0xd8);
4968 	emit_byte(0xc8);    /* fmul x*x */
4969 	emit_byte(0xd9);
4970 	emit_byte(0xe8);    /* fld 1.0 */
4971 	emit_byte(0xde);
4972 	emit_byte(0xe1);    /* fsubrp 1 - (x^2) */
4973 	emit_byte(0xd9);
4974 	emit_byte(0xfa);    /* fsqrt sqrt(1-(x^2)) */
4975 	emit_byte(0xd9);
4976 	emit_byte(0xc1+ds); /* fld x again */
4977 	emit_byte(0xd9);
4978 	emit_byte(0xc9);    /* fxch swap x with sqrt(1-(x^2))  */
4979 	emit_byte(0xd9);
4980 	emit_byte(0xf3);    /* fpatan atan(x/sqrt(1-(x^2))) & pop */
4981 	tos_make(d);        /* store y=asin(x) */
4982 }
4983 LENDFUNC(NONE,NONE,2,raw_fasin_rr,(FW d, FR s))
4984 
4985 static uae_u32 pihalf[] = {0x2168c234, 0xc90fdaa2, 0x3fff}; // LSB=0 to get acos(1)=0
4986 
4987 LOWFUNC(NONE,NONE,2,raw_facos_rr,(FW d, FR s))
4988 {
4989 	int ds;
4990 
4991 	ds=stackpos(s);
4992 	emit_byte(0xd9);
4993 	emit_byte(0xc0+ds); /* fld x */
4994 	emit_byte(0xd8);
4995 	emit_byte(0xc8);    /* fmul x*x */
4996 	emit_byte(0xd9);
4997 	emit_byte(0xe8);    /* fld 1.0 */
4998 	emit_byte(0xde);
4999 	emit_byte(0xe1);    /* fsubrp 1 - (x^2) */
5000 	emit_byte(0xd9);
5001 	emit_byte(0xfa);    /* fsqrt sqrt(1-(x^2)) */
5002 	emit_byte(0xd9);
5003 	emit_byte(0xc1+ds); /* fld x again */
5004 	emit_byte(0xd9);
5005 	emit_byte(0xc9);    /* fxch swap x with sqrt(1-(x^2))  */
5006 	emit_byte(0xd9);
5007 	emit_byte(0xf3);    /* fpatan atan(x/sqrt(1-(x^2))) & pop */
5008 	raw_fldt((uintptr) &pihalf); /* fld load pi/2 from pihalf */
5009 	emit_byte(0xde);
5010 	emit_byte(0xe1);    /* fsubrp pi/2 - asin(x) & pop */
5011 	tos_make(d);        /* store y=acos(x) */
5012 }
5013 LENDFUNC(NONE,NONE,2,raw_facos_rr,(FW d, FR s))
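/* Both inverse-trig helpers above are built on fpatan, the only inverse trig
   instruction the x87 provides:
       asin(x) = atan(x / sqrt(1 - x^2))
       acos(x) = pi/2 - asin(x)
   with pi/2 taken from the 80-bit constant pihalf (whose LSB is cleared so
   that acos(1.0) comes out as exactly 0, as noted above). */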
5014 
5015 LOWFUNC(NONE,NONE,2,raw_fatan_rr,(FW d, FR s))
5016 {
5017 	int ds;
5018 
5019 	if (s==d)
5020 		make_tos(s);
5021 	else {
5022 		ds=stackpos(s);
5023 		emit_byte(0xd9);
5024 		emit_byte(0xc0+ds); /* fld x */
5025 	}
5026 	emit_byte(0xd9);
5027 	emit_byte(0xe8);    /* fld 1.0 */
5028 	emit_byte(0xd9);
5029 	emit_byte(0xf3);    /* fpatan atan(x)/1  & pop*/
5030 	if (s!=d)
5031 		tos_make(d);    /* store y=atan(x) */
5032 }
5033 LENDFUNC(NONE,NONE,2,raw_fatan_rr,(FW d, FR s))
5034 
5035 LOWFUNC(NONE,NONE,2,raw_fatanh_rr,(FW d, FR s))
5036 {
5037 	int ds;
5038 
5039 	ds=stackpos(s);
5040 	emit_byte(0xd9);
5041 	emit_byte(0xc0+ds); /* fld x */
5042 	emit_byte(0xd9);
5043 	emit_byte(0xe8);    /* fld 1.0 */
5044 	emit_byte(0xdc);
5045 	emit_byte(0xc1);    /* fadd 1 + x */
5046 	emit_byte(0xd8);
5047 	emit_byte(0xe2+ds); /* fsub 1 - x */
5048 	emit_byte(0xde);
5049 	emit_byte(0xf9);    /* fdivp (1+x)/(1-x) */
5050 	emit_byte(0xd9);
5051 	emit_byte(0xed);    /* fldln2 logN(2) */
5052 	emit_byte(0xd9);
5053 	emit_byte(0xc9);    /* fxch swap logN(2) with (1+x)/(1-x) */
5054 	emit_byte(0xd9);
5055 	emit_byte(0xf1);    /* fyl2x logN(2)*log2((1+x)/(1-x)) pop */
5056 	emit_byte(0xd9);
5057 	emit_byte(0xe8);    /* fld 1.0 */
5058 	emit_byte(0xd9);
5059 	emit_byte(0xe0);    /* fchs -1.0 */
5060 	emit_byte(0xd9);
5061 	emit_byte(0xc9);    /* fxch swap */
5062 	emit_byte(0xd9);
5063 	emit_byte(0xfd);    /* fscale logN((1+x)/(1-x)) * 2^(-1) */
5064 	emit_byte(0xdd);
5065 	emit_byte(0xd9);    /* fstp copy & pop */
5066 	tos_make(d);        /* store y=atanh(x) */
5067 }
5068 LENDFUNC(NONE,NONE,2,raw_fatanh_rr,(FW d, FR s))
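/* raw_fatanh_rr uses atanh(x) = ln((1+x)/(1-x)) / 2: fyl2x with ln(2) on the
   stack yields ln((1+x)/(1-x)), and the final fscale by 2^-1 halves it. */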
5069 
5070 LOWFUNC(NONE,NONE,2,raw_fsinh_rr,(FW d, FR s))
5071 {
5072 	int ds,tr;
5073 
5074 	tr=live.onstack[live.tos+3];
5075 	if (s==d)
5076 		make_tos(s);
5077 	else {
5078 		ds=stackpos(s);
5079 		emit_byte(0xd9);
5080 		emit_byte(0xc0+ds); /* fld x */
5081 	}
5082 	emit_byte(0xd9);
5083 	emit_byte(0xea);     /* fldl2e log2(e) */
5084 	emit_byte(0xd8);
5085 	emit_byte(0xc9);     /* fmul x*log2(e) */
5086 	emit_byte(0xdd);
5087 	emit_byte(0xd1);     /* fst copy x*log2(e) */
5088 	if (tr>=0) {
5089 		emit_byte(0xd9);
5090 		emit_byte(0xca); /* fxch swap with temp-reg */
5091 		REX64
5092 		emit_byte(0x83);
5093 		emit_byte(0xc4);
5094 		emit_byte(0xf4); /* add -12 to esp */
5095 		emit_byte(0xdb);
5096 		emit_byte(0x3c);
5097 		emit_byte(0x24); /* fstp store temp-reg to [esp] & pop */
5098 	}
5099 	emit_byte(0xd9);
5100 	emit_byte(0xe0);     /* fchs -x*log2(e) */
5101 	emit_byte(0xd9);
5102 	emit_byte(0xc0);     /* fld -x*log2(e) again */
5103 	emit_byte(0xd9);
5104 	emit_byte(0xfc);     /* frndint int(-x*log2(e)) */
5105 	emit_byte(0xd9);
5106 	emit_byte(0xc9);     /* fxch swap */
5107 	emit_byte(0xd8);
5108 	emit_byte(0xe1);     /* fsub -x*log2(e) - int(-x*log2(e))  */
5109 	emit_byte(0xd9);
5110 	emit_byte(0xf0);     /* f2xm1 (2^frac(x))-1 */
5111 	x86_fadd_m((uintptr) &one);
5112 	emit_byte(0xd9);
5113 	emit_byte(0xfd);     /* fscale (2^frac(x))*2^int(x*log2(e)) */
5114 	emit_byte(0xd9);
5115 	emit_byte(0xca);     /* fxch swap e^-x with x*log2(e) in tr */
5116 	emit_byte(0xdd);
5117 	emit_byte(0xd1);     /* fst copy x*log2(e) */
5118 	emit_byte(0xd9);
5119 	emit_byte(0xfc);     /* frndint int(x*log2(e)) */
5120 	emit_byte(0xd9);
5121 	emit_byte(0xc9);     /* fxch swap */
5122 	emit_byte(0xd8);
5123 	emit_byte(0xe1);     /* fsub x*log2(e) - int(x*log2(e))  */
5124 	emit_byte(0xd9);
5125 	emit_byte(0xf0);     /* f2xm1 (2^frac(x))-1 */
5126 	x86_fadd_m((uintptr) &one);
5127 	emit_byte(0xd9);
5128 	emit_byte(0xfd);     /* fscale (2^frac(x))*2^int(x*log2(e)) */
5129 	emit_byte(0xdd);
5130 	emit_byte(0xd9);     /* fstp copy e^x & pop */
5131 	if (tr>=0) {
5132 		emit_byte(0xdb);
5133 		emit_byte(0x2c);
5134 		emit_byte(0x24); /* fld load temp-reg from [esp] */
5135 		emit_byte(0xd9);
5136 		emit_byte(0xca); /* fxch swap temp-reg with e^-x in tr */
5137 		emit_byte(0xde);
5138 		emit_byte(0xe9); /* fsubp (e^x)-(e^-x) */
5139 		REX64
5140 		emit_byte(0x83);
5141 		emit_byte(0xc4);
5142 		emit_byte(0x0c); /* delayed add +12 to esp */
5143 	}
5144 	else {
5145 		emit_byte(0xde);
5146 		emit_byte(0xe1); /* fsubrp (e^x)-(e^-x) */
5147 	}
5148 	emit_byte(0xd9);
5149 	emit_byte(0xe8);     /* fld 1.0 */
5150 	emit_byte(0xd9);
5151 	emit_byte(0xe0);     /* fchs -1.0 */
5152 	emit_byte(0xd9);
5153 	emit_byte(0xc9);     /* fxch swap */
5154 	emit_byte(0xd9);
5155 	emit_byte(0xfd);     /* fscale ((e^x)-(e^-x))/2 */
5156 	emit_byte(0xdd);
5157 	emit_byte(0xd9);     /* fstp copy & pop */
5158 	if (s!=d)
5159 		tos_make(d);     /* store y=sinh(x) */
5160 }
5161 LENDFUNC(NONE,NONE,2,raw_fsinh_rr,(FW d, FR s))
5162 
5163 LOWFUNC(NONE,NONE,2,raw_fcosh_rr,(FW d, FR s))
5164 {
5165 	int ds,tr;
5166 
5167 	tr=live.onstack[live.tos+3];
5168 	if (s==d)
5169 		make_tos(s);
5170 	else {
5171 		ds=stackpos(s);
5172 		emit_byte(0xd9);
5173 		emit_byte(0xc0+ds); /* fld x */
5174 	}
5175 	emit_byte(0xd9);
5176 	emit_byte(0xea);     /* fldl2e log2(e) */
5177 	emit_byte(0xd8);
5178 	emit_byte(0xc9);     /* fmul x*log2(e) */
5179 	emit_byte(0xdd);
5180 	emit_byte(0xd1);     /* fst copy x*log2(e) */
5181 	if (tr>=0) {
5182 		emit_byte(0xd9);
5183 		emit_byte(0xca); /* fxch swap with temp-reg */
5184 		REX64
5185 		emit_byte(0x83);
5186 		emit_byte(0xc4);
5187 		emit_byte(0xf4); /* add -12 to esp */
5188 		emit_byte(0xdb);
5189 		emit_byte(0x3c);
5190 		emit_byte(0x24); /* fstp store temp-reg to [esp] & pop */
5191 	}
5192 	emit_byte(0xd9);
5193 	emit_byte(0xe0);     /* fchs -x*log2(e) */
5194 	emit_byte(0xd9);
5195 	emit_byte(0xc0);     /* fld -x*log2(e) again */
5196 	emit_byte(0xd9);
5197 	emit_byte(0xfc);     /* frndint int(-x*log2(e)) */
5198 	emit_byte(0xd9);
5199 	emit_byte(0xc9);     /* fxch swap */
5200 	emit_byte(0xd8);
5201 	emit_byte(0xe1);     /* fsub -x*log2(e) - int(-x*log2(e))  */
5202 	emit_byte(0xd9);
5203 	emit_byte(0xf0);     /* f2xm1 (2^frac(x))-1 */
5204 	x86_fadd_m((uintptr) &one);
5205 	emit_byte(0xd9);
5206 	emit_byte(0xfd);     /* fscale (2^frac(x))*2^int(x*log2(e)) */
5207 	emit_byte(0xd9);
5208 	emit_byte(0xca);     /* fxch swap e^-x with x*log2(e) in tr */
5209 	emit_byte(0xdd);
5210 	emit_byte(0xd1);     /* fst copy x*log2(e) */
5211 	emit_byte(0xd9);
5212 	emit_byte(0xfc);     /* frndint int(x*log2(e)) */
5213 	emit_byte(0xd9);
5214 	emit_byte(0xc9);     /* fxch swap */
5215 	emit_byte(0xd8);
5216 	emit_byte(0xe1);     /* fsub x*log2(e) - int(x*log2(e))  */
5217 	emit_byte(0xd9);
5218 	emit_byte(0xf0);     /* f2xm1 (2^frac(x))-1 */
5219 	x86_fadd_m((uintptr) &one);
5220 	emit_byte(0xd9);
5221 	emit_byte(0xfd);     /* fscale (2^frac(x))*2^int(x*log2(e)) */
5222 	emit_byte(0xdd);
5223 	emit_byte(0xd9);     /* fstp copy e^x & pop */
5224 	if (tr>=0) {
5225 		emit_byte(0xdb);
5226 		emit_byte(0x2c);
5227 		emit_byte(0x24); /* fld load temp-reg from [esp] */
5228 		emit_byte(0xd9);
5229 		emit_byte(0xca); /* fxch swap temp-reg with e^-x in tr */
5230 		REX64
5231 		emit_byte(0x83);
5232 		emit_byte(0xc4);
5233 		emit_byte(0x0c); /* delayed add +12 to esp */
5234 	}
5235 	emit_byte(0xde);
5236 	emit_byte(0xc1);     /* faddp (e^x)+(e^-x) */
5237 	emit_byte(0xd9);
5238 	emit_byte(0xe8);     /* fld 1.0 */
5239 	emit_byte(0xd9);
5240 	emit_byte(0xe0);     /* fchs -1.0 */
5241 	emit_byte(0xd9);
5242 	emit_byte(0xc9);     /* fxch swap */
5243 	emit_byte(0xd9);
5244 	emit_byte(0xfd);     /* fscale ((e^x)+(e^-x))/2 */
5245 	emit_byte(0xdd);
5246 	emit_byte(0xd9);     /* fstp copy & pop */
5247 	if (s!=d)
5248 		tos_make(d);     /* store y=cosh(x) */
5249 }
5250 LENDFUNC(NONE,NONE,2,raw_fcosh_rr,(FW d, FR s))
5251 
5252 LOWFUNC(NONE,NONE,2,raw_ftanh_rr,(FW d, FR s))
5253 {
5254 	int ds,tr;
5255 
5256 	tr=live.onstack[live.tos+3];
5257 	if (s==d)
5258 		make_tos(s);
5259 	else {
5260 		ds=stackpos(s);
5261 		emit_byte(0xd9);
5262 		emit_byte(0xc0+ds); /* fld x */
5263 	}
5264 	emit_byte(0xd9);
5265 	emit_byte(0xea);     /* fldl2e log2(e) */
5266 	emit_byte(0xd8);
5267 	emit_byte(0xc9);     /* fmul x*log2(e) */
5268 	emit_byte(0xdd);
5269 	emit_byte(0xd1);     /* fst copy x*log2(e) */
5270 	if (tr>=0) {
5271 		emit_byte(0xd9);
5272 		emit_byte(0xca); /* fxch swap with temp-reg */
5273 		REX64
5274 		emit_byte(0x83);
5275 		emit_byte(0xc4);
5276 		emit_byte(0xf4); /* add -12 to esp */
5277 		emit_byte(0xdb);
5278 		emit_byte(0x3c);
5279 		emit_byte(0x24); /* fstp store temp-reg to [esp] & pop */
5280 	}
5281 	emit_byte(0xd9);
5282 	emit_byte(0xe0);     /* fchs -x*log2(e) */
5283 	emit_byte(0xd9);
5284 	emit_byte(0xc0);     /* fld -x*log2(e) again */
5285 	emit_byte(0xd9);
5286 	emit_byte(0xfc);     /* frndint int(-x*log2(e)) */
5287 	emit_byte(0xd9);
5288 	emit_byte(0xc9);     /* fxch swap */
5289 	emit_byte(0xd8);
5290 	emit_byte(0xe1);     /* fsub -x*log2(e) - int(-x*log2(e))  */
5291 	emit_byte(0xd9);
5292 	emit_byte(0xf0);     /* f2xm1 (2^frac(x))-1 */
5293 	x86_fadd_m((uintptr) &one);
5294 	emit_byte(0xd9);
5295 	emit_byte(0xfd);     /* fscale (2^frac(x))*2^int(x*log2(e)) */
5296 	emit_byte(0xd9);
5297 	emit_byte(0xca);     /* fxch swap e^-x with x*log2(e) */
5298 	emit_byte(0xdd);
5299 	emit_byte(0xd1);     /* fst copy x*log2(e) */
5300 	emit_byte(0xd9);
5301 	emit_byte(0xfc);     /* frndint int(x*log2(e)) */
5302 	emit_byte(0xd9);
5303 	emit_byte(0xc9);     /* fxch swap */
5304 	emit_byte(0xd8);
5305 	emit_byte(0xe1);     /* fsub x*log2(e) - int(x*log2(e))  */
5306 	emit_byte(0xd9);
5307 	emit_byte(0xf0);     /* f2xm1 (2^frac(x))-1 */
5308 	x86_fadd_m((uintptr) &one);
5309 	emit_byte(0xd9);
5310 	emit_byte(0xfd);     /* fscale (2^frac(x))*2^int(x*log2(e)) */
5311 	emit_byte(0xdd);
5312 	emit_byte(0xd1);     /* fst copy e^x */
5313 	emit_byte(0xd8);
5314 	emit_byte(0xc2);     /* fadd (e^x)+(e^-x) */
5315 	emit_byte(0xd9);
5316 	emit_byte(0xca);     /* fxch swap with e^-x */
5317 	emit_byte(0xde);
5318 	emit_byte(0xe9);     /* fsubp (e^x)-(e^-x) */
5319 	if (tr>=0) {
5320 		emit_byte(0xdb);
5321 		emit_byte(0x2c);
5322 		emit_byte(0x24); /* fld load temp-reg from [esp] */
5323 		emit_byte(0xd9);
5324 		emit_byte(0xca); /* fxch swap temp-reg with e^-x in tr */
5325 		emit_byte(0xde);
5326 		emit_byte(0xf9); /* fdivp ((e^x)-(e^-x))/((e^x)+(e^-x)) */
5327 		REX64
5328 		emit_byte(0x83);
5329 		emit_byte(0xc4);
5330 		emit_byte(0x0c); /* delayed add +12 to esp */
5331 	}
5332 	else {
5333 		emit_byte(0xde);
5334 		emit_byte(0xf1); /* fdivrp ((e^x)-(e^-x))/((e^x)+(e^-x)) */
5335 	}
5336 	if (s!=d)
5337 		tos_make(d);     /* store y=tanh(x) */
5338 }
5339 LENDFUNC(NONE,NONE,2,raw_ftanh_rr,(FW d, FR s))
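/* The three hyperbolic helpers above share one skeleton: e^x and e^-x are
   both computed with the f2xm1/fscale decomposition described earlier, and
   the results are combined as
       sinh(x) = (e^x - e^-x) / 2
       cosh(x) = (e^x + e^-x) / 2
       tanh(x) = (e^x - e^-x) / (e^x + e^-x)
   When a live value occupies ST(2) (tr >= 0), it is parked as an 80-bit
   temporary in 12 bytes reserved below the stack pointer so the sequence has
   enough free x87 slots, and restored afterwards. */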
5340 
5341 /* %eax register is clobbered if target processor doesn't support fucomi */
5342 #define FFLAG_NREG_CLOBBER_CONDITION !have_cmov
5343 #define FFLAG_NREG EAX_INDEX
5344 
5345 static inline void raw_fflags_into_flags(int r)
5346 {
5347 	int p;
5348 
5349 	usereg(r);
5350 	p=stackpos(r);
5351 
5352 	emit_byte(0xd9);
5353 	emit_byte(0xee); /* Push 0 */
5354 	emit_byte(0xd9);
5355 	emit_byte(0xc9+p); /* swap top two around */
5356 	if (have_cmov) {
5357 		// gb-- fucomi is for P6 cores only, not K6-2 then...
5358 		emit_byte(0xdb);
5359 		emit_byte(0xe9+p); /* fucomi them */
5360 	}
5361 	else {
5362 		emit_byte(0xdd);
5363 		emit_byte(0xe1+p); /* fucom them */
5364 		emit_byte(0x9b);
5365 		emit_byte(0xdf);
5366 		emit_byte(0xe0); /* fstsw ax */
5367 		raw_sahf(0); /* sahf */
5368 	}
5369 	emit_byte(0xdd);
5370 	emit_byte(0xd9+p);  /* store value back, and get rid of 0 */
5371 }
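/* raw_fflags_into_flags compares r against +0.0 and leaves the result in the
   host CPU flags: with fucomi (P6 and later) ZF/PF/CF are set directly,
   otherwise fucom followed by fstsw %ax and sahf is used, which is why %eax
   is declared clobbered above when fucomi is unavailable. */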
5372